Skip to content

Instantly share code, notes, and snippets.

@DiracSpace
Created August 13, 2024 08:10
Show Gist options
  • Save DiracSpace/b5c13392f861439c57c2438f01f2c481 to your computer and use it in GitHub Desktop.
Save DiracSpace/b5c13392f861439c57c2438f01f2c481 to your computer and use it in GitHub Desktop.
Simple scraper of game titles from Myrient
#!/usr/bin/python3
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from http import HTTPStatus
from typing import List
from dataclasses import dataclass
from enum import Enum
class MyrientConsole(Enum):
    """Consoles this script can look up on Myrient."""

    unknown = 'unknown'
    playstation_3 = 'playstation_3'
    gamecube = 'gamecube'

    @staticmethod
    def from_string(console: str):
        """Map user-supplied text to a ``MyrientConsole`` member.

        Matching is case-insensitive and ignores surrounding whitespace.
        ``'playstation 3'`` is accepted as an alias of ``'playstation_3'``.

        Raises:
            NotImplementedError: if the text names no known console.
        """
        console = console.lower().strip()
        # Exact comparisons on purpose: ``x in 'literal'`` is a substring
        # test, and ``x in 'a' or 'b'`` is always truthy because the bare
        # string 'b' is evaluated on its own.
        if console == 'unknown':
            return MyrientConsole.unknown
        if console in ('playstation_3', 'playstation 3'):
            return MyrientConsole.playstation_3
        if console == 'gamecube':
            return MyrientConsole.gamecube
        raise NotImplementedError(f'Unsupported console {console}.')

    @staticmethod
    def to_domain(console: str):
        """Return the listing URL registered in ``MYRENT_DOMAINS`` for *console*.

        Raises:
            NotImplementedError: if *console* names no known console.
            KeyError: if the console is known but has no entry in
                ``MYRENT_DOMAINS``.
        """
        return MYRENT_DOMAINS[MyrientConsole.from_string(console)]
# Directory-listing URL for each console, keyed by MyrientConsole member.
# There is no entry for MyrientConsole.unknown.
MYRENT_DOMAINS = {
    MyrientConsole.playstation_3: (
        'https://myrient.erista.me/files/No-Intro/'
        'Sony%20-%20PlayStation%203%20(PSN)%20(Content)/'
    ),
    MyrientConsole.gamecube: (
        'https://myrient.erista.me/files/Redump/'
        'Nintendo%20-%20GameCube%20-%20NKit%20RVZ%20[zstd-19-128k]/'
    ),
}
class ContentResponse:
    """Outcome of one HTTP fetch: the URL, its status code and the body text."""

    url: str = ''
    status_code: int = 0
    content: str = ''
    # Statuses that are always treated as failures. Kept as a class
    # attribute so existing code reading ``bad_requests`` keeps working.
    bad_requests: List[HTTPStatus] = [
        HTTPStatus.BAD_GATEWAY,
        HTTPStatus.BAD_REQUEST,
        HTTPStatus.GATEWAY_TIMEOUT,
        HTTPStatus.REQUEST_TIMEOUT,
        HTTPStatus.INTERNAL_SERVER_ERROR,
    ]

    def __init__(self, url: str, status_code: int, content: str):
        self.url = url
        self.status_code = status_code
        self.content = content

    def is_failure(self) -> bool:
        """Return True for any 4xx/5xx status or any status in ``bad_requests``.

        Checking only ``bad_requests`` would report e.g. 403, 404 and 503 as
        successful, so every client/server error status counts as a failure.
        """
        return (
            self.status_code in self.bad_requests
            or self.status_code >= HTTPStatus.BAD_REQUEST
        )

    def is_success(self) -> bool:
        """Return True exactly when ``is_failure()`` is False."""
        return not self.is_failure()
class HttpContent:
    """Fetches a single URL and wraps the outcome in a ``ContentResponse``."""

    url: str = ''
    # The key must be spelled 'User-Agent'; 'User Agent' (with a space) is a
    # different header name and does not set the user agent of the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
    }

    def __init__(self, url: str):
        self.url = url

    def fetch_url_content(self) -> ContentResponse:
        """GET ``self.url`` and return the result as a ``ContentResponse``.

        Request errors are not propagated; they are converted into a failing
        ``ContentResponse``. When the server answered, its real status code
        and body are used. When no response exists (connection refused, DNS
        failure, timeout, malformed URL, ...), a representative failure status
        is substituted and the exception text becomes the content.
        """
        try:
            print(f'Making request to {self.url}.')
            response = requests.get(self.url, headers=self.headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
            print(f'Received response for {self.url} with status code {response.status_code}.')
            return ContentResponse(self.url, response.status_code, response.text)
        except requests.exceptions.HTTPError as errh:
            return self._failure_response(errh, HTTPStatus.INTERNAL_SERVER_ERROR)
        except requests.exceptions.Timeout as connection_timeout:
            # Handled before ConnectionError so that connect timeouts (which
            # derive from both classes) are reported as timeouts.
            return self._failure_response(connection_timeout, HTTPStatus.GATEWAY_TIMEOUT)
        except requests.exceptions.ConnectionError as connection_error:
            return self._failure_response(connection_error, HTTPStatus.BAD_GATEWAY)
        except requests.exceptions.RequestException as request_error:
            return self._failure_response(request_error, HTTPStatus.BAD_REQUEST)

    def _failure_response(self, error, fallback_status: HTTPStatus) -> ContentResponse:
        """Build a failing ``ContentResponse`` from a requests exception.

        ``error.response`` is None whenever the request never produced an HTTP
        response, so it must not be dereferenced unconditionally.
        """
        response = getattr(error, 'response', None)
        if response is not None:
            return ContentResponse(self.url, response.status_code, response.text)
        return ContentResponse(self.url, fallback_status.value, str(error))
class MyrientTableRow:
    """Wrapper around one ``<tr>`` of the listing table, exposing its fields."""

    table_row: Tag

    def __init__(self, table_row: Tag):
        self.table_row = table_row

    def link(self) -> str:
        """Return the ``href`` of the anchor inside the row's ``td.link`` cell.

        Raises:
            RuntimeError: if the cell, the anchor or the attribute is missing.
        """
        table_row_link_data_cell = self.table_row.find('td', attrs={'class': 'link'})
        if table_row_link_data_cell is None:
            raise RuntimeError('Could not find link tag in table row.')
        return self._anchor_attribute(table_row_link_data_cell, 'href')

    def title(self) -> str:
        """Return the ``title`` attribute of the anchor inside the row's first ``td``.

        Raises:
            RuntimeError: if the cell, the anchor or the attribute is missing.
        """
        table_row_link_data_cell = self.table_row.find('td')
        if table_row_link_data_cell is None:
            raise RuntimeError('Could not find table data cells in table row.')
        return self._anchor_attribute(table_row_link_data_cell, 'title')

    def size(self) -> str:
        """Not implemented yet; always returns an empty string."""
        return ''

    def date(self) -> str:
        """Not implemented yet; always returns an empty string."""
        return ''

    @staticmethod
    def _anchor_attribute(data_cell, attribute: str) -> str:
        """Return *attribute* of the first ``<a>`` found in *data_cell*."""
        anchor = data_cell.find('a')
        if anchor is None:
            raise RuntimeError('Could not find a tag in table data cell.')
        elif isinstance(anchor, NavigableString):
            raise RuntimeError('Cannot traverse instance of NavigatableString.')
        elif isinstance(anchor, int):
            # If data_cell is itself a string, ``find`` is ``str.find`` and
            # returns an index rather than an element.
            raise RuntimeError('Cannot traverse instance of int.')
        value = anchor.get(attribute)
        if value is None:
            # The message names the attribute actually requested.
            raise RuntimeError(f'Could not obtain {attribute} from table data cell.')
        if isinstance(value, list):
            if not value:
                raise RuntimeError(f'Could not obtain {attribute} from table data cell.')
            return value[0]
        return value
class MyrientPlaystation3Parser:
    """Turns the PlayStation 3 listing HTML into ``MyrientTableRow`` objects."""

    content: BeautifulSoup
    query: str

    def __init__(self, content: str, query: str):
        self.content = BeautifulSoup(content, features="html.parser")
        self.query = query

    def parse_content(self) -> List[MyrientTableRow]:
        """Return one ``MyrientTableRow`` per ``<tr>`` of the ``table#list`` element.

        The first two rows are dropped (headers and file traversal row).

        Raises:
            RuntimeError: if the table is absent or is not a traversable tag.
        """
        listing = self.content.find('table', attrs={'id': 'list'})
        if listing is None:
            raise RuntimeError('Could not find table in provided content.')
        if isinstance(listing, NavigableString):
            raise RuntimeError('Cannot traverse instance of NavigatableString.')
        return [
            MyrientTableRow(row)
            for position, row in enumerate(listing.find_all('tr'))
            if position > 1 and row is not None
        ]
class MyrientGamecubeParser:
    """Placeholder parser for the GameCube listing; it produces no rows."""

    content: str
    query: str

    def __init__(self, content: str, query: str):
        self.content, self.query = content, query

    def parse_content(self) -> List[MyrientTableRow]:
        """Return an empty list; the stored content is not inspected."""
        no_rows: List[MyrientTableRow] = []
        return no_rows
@dataclass
class ConsoleParser:
    """Selects the parser for a console and filters its rows by a title query."""

    console: MyrientConsole
    content: ContentResponse
    query: str

    @property
    def results(self) -> List[MyrientTableRow]:
        """Rows whose title contains the query (case-insensitive, stripped).

        Raises:
            RuntimeError: if no parser exists for ``self.console``.
        """
        print(f'Parsing console {self.console}.')
        if self.console == MyrientConsole.playstation_3:
            rows = MyrientPlaystation3Parser(self.content.content, self.query).parse_content()
        elif self.console == MyrientConsole.gamecube:
            rows = MyrientGamecubeParser(self.content.content, self.query).parse_content()
        else:
            print(f'No parser found for {self.console}.')
            raise RuntimeError(f'Unsupported console: {self.console}')

        matches = []
        for row in rows:
            needle = self.query.lower().strip()
            haystack = row.title().lower().strip()
            if needle in haystack:
                matches.append(row)
        return matches
def main():
    """
    Script entry point: prompt for a game title and a console, download the
    console's listing page and print every title that matches.
    """
    game_title = input('Please input your game title: ')
    game_console = input('Please input the console: ')

    listing_url = MyrientConsole.to_domain(game_console)
    response = HttpContent(listing_url).fetch_url_content()
    if response.is_failure():
        raise RuntimeError(f'Failure to obtain response content. Status Code: {response.status_code}, Message: {response.content}')

    console = MyrientConsole.from_string(game_console)
    matches = ConsoleParser(console, response, game_title).results
    for match in matches:
        print(match.title())


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment