Created
August 13, 2024 08:10
-
-
Save DiracSpace/b5c13392f861439c57c2438f01f2c481 to your computer and use it in GitHub Desktop.
Simple scraper of game titles from Myrient
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python3 | |
| import requests | |
| from bs4 import BeautifulSoup, NavigableString, Tag | |
| from http import HTTPStatus | |
| from typing import List | |
| from dataclasses import dataclass | |
| from enum import Enum | |
class MyrientConsole(Enum):
    """Consoles this script can be asked about.

    Use ``from_string`` to turn user input into a member and ``to_domain``
    to look up the listing URL registered for that console.
    """

    unknown = 'unknown'
    playstation_3 = 'playstation_3'
    gamecube = 'gamecube'

    @staticmethod
    def from_string(console: str):
        """Return the member named by ``console``.

        The input is lower-cased and stripped, then matched *exactly*
        against the accepted spellings. (A substring test such as
        ``console in 'unknown'`` would also accept ``''`` or ``'n'``, and
        ``x in 'a' or 'b'`` is always truthy because the non-empty literal
        ``'b'`` is evaluated on its own.)

        Raises:
            NotImplementedError: if the text names no supported console.
        """
        normalized = console.lower().strip()
        accepted_spellings = {
            'unknown': MyrientConsole.unknown,
            'playstation_3': MyrientConsole.playstation_3,
            'playstation 3': MyrientConsole.playstation_3,
            'gamecube': MyrientConsole.gamecube,
        }
        try:
            return accepted_spellings[normalized]
        except KeyError:
            raise NotImplementedError(f'Unsupported console {normalized}.') from None

    @staticmethod
    def to_domain(console: str):
        """Return the ``MYRENT_DOMAINS`` entry for the console named by ``console``.

        Raises ``NotImplementedError`` for unrecognised text (via
        ``from_string``); the lookup itself fails if ``MYRENT_DOMAINS`` has
        no entry for the resolved member.
        """
        return MYRENT_DOMAINS[MyrientConsole.from_string(console)]
# Listing URL to fetch for each console that has one.
# MyrientConsole.unknown deliberately has no entry here.
MYRENT_DOMAINS = {
    MyrientConsole.playstation_3: (
        'https://myrient.erista.me/files/No-Intro/Sony%20-%20PlayStation%203%20(PSN)%20(Content)/'
    ),
    MyrientConsole.gamecube: (
        'https://myrient.erista.me/files/Redump/Nintendo%20-%20GameCube%20-%20NKit%20RVZ%20[zstd-19-128k]/'
    ),
}
class ContentResponse:
    """Result of fetching one URL: the URL, the status code and the body text."""

    url: str = ''
    status_code: int = 0
    content: str = ''
    # Statuses that are always reported as failures. This list is a class
    # attribute, so it is shared by every instance.
    bad_requests: List[HTTPStatus] = [
        HTTPStatus.BAD_GATEWAY,
        HTTPStatus.BAD_REQUEST,
        HTTPStatus.GATEWAY_TIMEOUT,
        HTTPStatus.REQUEST_TIMEOUT,
        HTTPStatus.INTERNAL_SERVER_ERROR,
    ]

    def __init__(self, url: str, status_code: int, content: str):
        """Store the fetched ``url``, its ``status_code`` and body ``content``."""
        self.url = url
        self.status_code = status_code
        self.content = content

    def is_failure(self) -> bool:
        """Return True for any client/server error status.

        Besides the statuses listed in ``bad_requests``, every code from
        400 upwards counts as a failure, so responses such as 403, 404 or
        503 are no longer mistaken for usable content.
        """
        if self.status_code in self.bad_requests:
            return True
        return self.status_code >= HTTPStatus.BAD_REQUEST

    def is_success(self) -> bool:
        """Return True exactly when ``is_failure`` returns False."""
        return not self.is_failure()
class HttpContent:
    """Fetches the body of a single URL using a browser-like User-Agent."""

    url: str = ''
    headers = {
        # The HTTP header field is spelled with a hyphen; the previous key
        # 'User Agent' did not set the User-Agent header.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
    }

    def __init__(self, url: str):
        """Remember the ``url`` that ``fetch_url_content`` will request."""
        self.url = url

    def fetch_url_content(self) -> ContentResponse:
        """GET ``self.url`` and wrap the outcome in a ``ContentResponse``.

        Request errors are not propagated; they are converted into a
        ``ContentResponse`` carrying an error status (see
        ``_failure_response``).
        """
        print(f'Making request to {self.url}.')
        try:
            response = requests.get(self.url, headers=self.headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
        except requests.exceptions.RequestException as request_error:
            # HTTPError, ConnectionError and Timeout all derive from
            # RequestException, so one handler covers every case.
            return self._failure_response(request_error)
        print(f'Received response for {self.url} with status code {response.status_code}.')
        return ContentResponse(self.url, response.status_code, response.text)

    def _failure_response(self, request_error) -> ContentResponse:
        """Build the ``ContentResponse`` describing a failed request."""
        failed_response = request_error.response
        # Compare against None explicitly: a requests Response is falsy
        # whenever its status is an error status.
        if failed_response is not None:
            return ContentResponse(self.url, failed_response.status_code, failed_response.text)
        # No response was received at all (connection refused, DNS failure,
        # timeout, ...), so there is no status or body to read. Substitute
        # a status that ContentResponse.bad_requests lists, so that
        # is_failure() reports the error.
        if isinstance(request_error, requests.exceptions.Timeout):
            fallback_status = HTTPStatus.REQUEST_TIMEOUT
        elif isinstance(request_error, requests.exceptions.ConnectionError):
            fallback_status = HTTPStatus.BAD_GATEWAY
        else:
            fallback_status = HTTPStatus.INTERNAL_SERVER_ERROR
        return ContentResponse(self.url, fallback_status.value, str(request_error))
class MyrientTableRow:
    """Wraps one table row of a listing and exposes its fields."""

    table_row: Tag

    def __init__(self, table_row: Tag):
        """Keep a reference to the ``table_row`` tag to read fields from."""
        self.table_row = table_row

    def _anchor_attribute(self, data_cell, attribute: str) -> str:
        """Return ``attribute`` of the first ``<a>`` found inside ``data_cell``.

        Raises:
            RuntimeError: if no usable ``<a>`` is found or the attribute is
                missing.
        """
        anchor = data_cell.find('a')
        if anchor is None:
            raise RuntimeError('Could not find a tag in table data cell.')
        elif isinstance(anchor, NavigableString):
            raise RuntimeError('Cannot traverse instance of NavigatableString.')
        elif isinstance(anchor, int):
            # ``data_cell`` was a plain string, whose ``find`` returns an index.
            raise RuntimeError('Cannot traverse instance of int.')
        value = anchor.get(attribute)
        if value is None:
            raise RuntimeError(f'Could not obtain {attribute} from table data cell.')
        elif isinstance(value, list):
            # Multi-valued attributes come back as a list; use the first item.
            return value[0]
        return value

    def link(self) -> str:
        """Return the ``href`` of the anchor in the row's ``td.link`` cell.

        Raises:
            RuntimeError: if the cell, the anchor or the ``href`` is missing.
        """
        link_cell = self.table_row.find('td', attrs={'class': 'link'})
        if link_cell is None:
            raise RuntimeError('Could not find link tag in table row.')
        return self._anchor_attribute(link_cell, 'href')

    def title(self) -> str:
        """Return the ``title`` attribute of the anchor in the row's first ``td``.

        Raises:
            RuntimeError: if the cell, the anchor or the ``title`` is missing.
        """
        first_cell = self.table_row.find('td')
        if first_cell is None:
            raise RuntimeError('Could not find table data cells in table row.')
        return self._anchor_attribute(first_cell, 'title')

    def size(self) -> str:
        """Not implemented yet; always returns an empty string."""
        return ''

    def date(self) -> str:
        """Not implemented yet; always returns an empty string."""
        return ''
class MyrientPlaystation3Parser:
    """Extracts the table rows from a PlayStation 3 listing page."""

    content: BeautifulSoup
    query: str

    def __init__(self, content: str, query: str):
        """Parse the HTML in ``content`` with html.parser and store ``query``."""
        self.content = BeautifulSoup(content, features="html.parser")
        self.query = query

    def parse_content(self) -> List[MyrientTableRow]:
        """Return every row of the ``table#list`` element after the first two.

        Raises:
            RuntimeError: if the table is missing or is not a traversable tag.
        """
        listing = self.content.find('table', attrs={'id': 'list'})
        if listing is None:
            raise RuntimeError('Could not find table in provided content.')
        if isinstance(listing, NavigableString):
            raise RuntimeError('Cannot traverse instance of NavigatableString.')

        # Positions 0 and 1 are the headers and the file traversal row.
        return [
            MyrientTableRow(row)
            for position, row in enumerate(listing.find_all('tr'))
            if position > 1 and row is not None
        ]
class MyrientGamecubeParser:
    """Placeholder parser for GameCube listings; it produces no rows yet."""

    content: str
    query: str

    def __init__(self, content: str, query: str):
        """Store the raw page ``content`` and the search ``query``."""
        self.content, self.query = content, query

    def parse_content(self) -> List[MyrientTableRow]:
        """Return a new empty list; nothing is parsed."""
        return list()
@dataclass
class ConsoleParser:
    """Selects the parser for a console and filters its rows by a query."""

    console: MyrientConsole
    content: ContentResponse
    query: str

    @property
    def results(self) -> List[MyrientTableRow]:
        """Return the parsed rows whose title contains the query.

        Both sides of the comparison are lower-cased and stripped.

        Raises:
            RuntimeError: if no parser exists for ``self.console``.
        """
        print(f'Parsing console {self.console}.')

        if self.console == MyrientConsole.playstation_3:
            parser = MyrientPlaystation3Parser(self.content.content, self.query)
        elif self.console == MyrientConsole.gamecube:
            parser = MyrientGamecubeParser(self.content.content, self.query)
        else:
            print(f'No parser found for {self.console}.')
            raise RuntimeError(f'Unsupported console: {self.console}')

        matching_rows = []
        for row in parser.parse_content():
            if self.query.lower().strip() in row.title().lower().strip():
                matching_rows.append(row)
        return matching_rows
def main():
    """
    Script entry point: prompt for a title and a console, fetch the
    console's listing page and print the title of every matching row.
    """
    game_title = input('Please input your game title: ')
    game_console = input('Please input the console: ')

    listing_url = MyrientConsole.to_domain(game_console)
    fetched = HttpContent(listing_url).fetch_url_content()
    if fetched.is_failure():
        raise RuntimeError(f'Failure to obtain response content. Status Code: {fetched.status_code}, Message: {fetched.content}')

    console = MyrientConsole.from_string(game_console)
    matches = ConsoleParser(console, fetched, game_title).results
    for match in matches:
        print(match.title())


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment