Last active
July 31, 2023 12:50
-
-
Save zelsaddr/2280e2f658b6aa9d202f5d056992cf42 to your computer and use it in GitHub Desktop.
MyAnimeListScraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import concurrent.futures | |
| import requests | |
| import re | |
| from random_user_agent.user_agent import UserAgent | |
| from random_user_agent.params import SoftwareName, OperatingSystem | |
| import pandas as pd | |
| # pip install requests random-user-agent pandas | |
class MyAnimeList:
    """Scrape MyAnimeList's "top anime by popularity" ranking.

    Walks the paginated ranking list, fetches every anime's detail page
    concurrently, and accumulates one parsed dict per anime in
    ``anime_data_lists``. Parsing is regex-based against the raw HTML.
    """

    def __init__(self):
        self.base_url = "https://myanimelist.net"
        # Browser-like headers so the site serves the regular HTML pages.
        self.headers = {
            'authority': 'myanimelist.net',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.9,id;q=0.8',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
        }
        # Pools fed to the random User-Agent rotator.
        self.software_names = [SoftwareName.CHROME.value]
        self.operating_systems = [
            OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
        # Parsed per-anime records, appended as detail pages complete.
        self.anime_data_lists = []
        # One Session so TCP connections are reused across the many requests.
        self.req = requests.Session()

    def get_all_data_anime_from_page(self, next_page=None, limit=99999):
        """Scrape ranking pages until `limit` records are collected.

        Iterates page by page (the original recursed once per page, which
        could exhaust the recursion limit for large `limit` values) and
        stops when either `limit` anime have been collected or there is no
        further page (previously the scraper restarted from page one in
        that case and never terminated).

        :param next_page: optional full URL of a ranking page to start from;
            defaults to the first "by popularity" page.
        :param limit: stop once this many records have been collected.
        :return: True on normal completion, None if a page request fails.
        """
        user_agent_rotator = UserAgent(
            software_names=self.software_names,
            operating_systems=self.operating_systems, limit=100)
        while True:
            # Rotate the User-Agent for every ranking-page request.
            self.headers['User-Agent'] = user_agent_rotator.get_random_user_agent()
            url = next_page or f"{self.base_url}/topanime.php?type=bypopularity"
            response = self.req.get(url, headers=self.headers)
            if response.status_code != 200:
                print(response.text)
                return None
            anime_lists = re.findall(
                r"<tr class=\"ranking-list\">(.*?)<\/tr>", response.text, re.DOTALL)
            # Pagination link; absent on the last ranking page.
            next_match = re.search(
                r"rel=\"next\"\shref=\"(.*?)\s\/>", response.text)
            if next_match:
                next_page = next_match[1].replace("amp;", "")
                print("Next Page: ", next_page)
            else:
                next_page = None
            anime_data_tasks = []  # To store the tasks
            with concurrent.futures.ThreadPoolExecutor() as executor:
                for anime in anime_lists:
                    anime_data = {}
                    anime_data['title'] = re.search(
                        r"<h3 class=\"hoverinfo_trigger.*\">(.*?)<\/a><\/h3>", str(anime))[1]
                    anime_data['url'] = re.search(
                        r"<h3 class=\"hoverinfo_trigger.*\"><a\shref=\"(.*?)\"", str(
                            anime))[1].replace("amp;", "").replace("/video", "")
                    try:
                        anime_data['eps'] = re.search(r"\d+", re.search(
                            r"class=\"information di-ib.*>\n(.*?)<br>", str(anime))[1])[0]
                    except TypeError:
                        # re.search returned None: episode count missing.
                        anime_data['eps'] = 0
                    anime_data_tasks.append(executor.submit(
                        self.get_data_of_anime, anime_data))
                for future in concurrent.futures.as_completed(anime_data_tasks):
                    try:
                        future.result()
                    except Exception as exc:
                        # Best effort: keep scraping other pages, but surface
                        # the failure instead of silently discarding it.
                        print("Detail page failed: ", exc)
            print("Total Anime: ", len(self.anime_data_lists))
            if len(self.anime_data_lists) >= limit:
                print(self.anime_data_lists)
                return True
            if next_page is None:
                # No more ranking pages to visit.
                return True

    def get_data_of_anime(self, anime_data):
        """Fetch one anime's detail page and append a parsed record.

        Runs on a worker thread. A regex that fails to match raises
        TypeError (None is unsubscriptable), which the caller logs via
        ``future.result()``.

        :param anime_data: dict with at least 'url' and 'eps' keys.
        """
        get_details_page = self.req.get(
            anime_data['url'], headers=self.headers, timeout=10)
        if get_details_page.status_code == 200:
            page = get_details_page.text
            data = {
                'url': anime_data['url'],
                'title': re.search(r"h1\sclass=\"title-name.*<strong>(.*?)<\/h1>", page)[1].replace("</strong>", ""),
                'eps': 'Episodes : ' + str(anime_data['eps']),
                'score': re.search(r"<span itemprop=\"ratingValue\" class=\"score-label.*\">(.*?)<\/span><sup>", page)[1],
                'rank': re.search(r"span class=\"dark_text\">Ranked:<\/span>\s+(.*?)<sup>", page)[1],
                'popularity': 'Popularity: ' + str(re.search(r"span class=\"dark_text\">Popularity:<\/span>\s+(.*?)\s+<\/div>", page)[1]),
                'members': re.search(r"span class=\"dark_text\">Members:<\/span>\s+(.*?)\s+<\/div>", page)[1],
                'themes': 'Themes: ' + ', '.join(re.findall(r"itemprop=\"genre\".*\">(.*?)<\/span>", page)),
                'studios': 'Studios: ' + str(re.search(r"span class=\"dark_text\">Studios:<\/span>\s+.*title=\"(.*?)\">.*<\/a>", page)[1]),
            }
            self.anime_data_lists.append(data)
            print(data)
        else:
            print(get_details_page.text)

    def save_to_csv(self, file_name):
        """Write all collected records to `file_name` as CSV (no index column)."""
        df = pd.DataFrame(self.anime_data_lists)
        df.to_csv(file_name, index=False)
if __name__ == "__main__":
    # Scrape the 50 most popular anime and export them to CSV.
    # Guarded so importing this module does not trigger a network scrape.
    ma = MyAnimeList()
    ma.get_all_data_anime_from_page(limit=50)
    ma.save_to_csv("anime.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment