MyAnimeListScrapper
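A multithreaded scraper that walks MyAnimeList's "Top Anime by Popularity" ranking, fetches each entry's detail page for its score, rank, popularity, member count, themes, and studio, and saves everything to a CSV file via pandas.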
# pip install requests random-user-agent pandas
import concurrent.futures
import re

import pandas as pd
import requests
from random_user_agent.params import OperatingSystem, SoftwareName
from random_user_agent.user_agent import UserAgent

class MyAnimeList:
    def __init__(self):
        self.base_url = "https://myanimelist.net"
        # Static browser-like headers; the User-Agent itself is rotated per request.
        self.headers = {
            'authority': 'myanimelist.net',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'en-US,en;q=0.9,id;q=0.8',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
        }
        self.software_names = [SoftwareName.CHROME.value]
        self.operating_systems = [
            OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
        self.anime_data_lists = []
        self.req = requests.Session()
    def get_all_data_anime_from_page(self, next_page=None, limit=99999):
        # Rotate the User-Agent for every page request.
        user_agent_rotator = UserAgent(
            software_names=self.software_names,
            operating_systems=self.operating_systems, limit=100)
        self.headers['User-Agent'] = user_agent_rotator.get_random_user_agent()
        if next_page:
            response = self.req.get(next_page, headers=self.headers)
        else:
            response = self.req.get(
                f"{self.base_url}/topanime.php?type=bypopularity",
                headers=self.headers)
        if response.status_code == 200:
            # Each <tr class="ranking-list"> row is one entry in the ranking table.
            anime_lists = re.findall(
                r"<tr class=\"ranking-list\">(.*?)<\/tr>", response.text, re.DOTALL)
            # Find the rel="next" pagination link; it is absent on the last page.
            get_next_page = re.search(
                r"rel=\"next\"\shref=\"(.*?)\"", response.text)
            if get_next_page:
                next_page = get_next_page[1].replace("amp;", "")
                print("Next Page:", next_page)
            else:
                next_page = None
            anime_data_tasks = []  # Detail-page fetches run in a thread pool.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                for anime in anime_lists:
                    anime_data = {}
                    anime_data['title'] = re.search(
                        r"<h3 class=\"hoverinfo_trigger.*\">(.*?)<\/a><\/h3>", anime)[1]
                    anime_data['url'] = re.search(
                        r"<h3 class=\"hoverinfo_trigger.*\"><a\shref=\"(.*?)\"",
                        anime)[1].replace("amp;", "").replace("/video", "")
                    try:
                        anime_data['eps'] = re.search(r"\d+", re.search(
                            r"class=\"information di-ib.*>\n(.*?)<br>", anime)[1])[0]
                    except TypeError:
                        anime_data['eps'] = 0
                    anime_data_tasks.append(executor.submit(
                        self.get_data_of_anime, anime_data))
                for future in concurrent.futures.as_completed(anime_data_tasks):
                    try:
                        future.result()
                    except Exception:
                        # A failed detail page should not abort the whole crawl.
                        pass
            print("Total Anime:", len(self.anime_data_lists))
            if len(self.anime_data_lists) >= limit or next_page is None:
                return True
            # Recurse into the next ranking page until the limit is reached.
            return self.get_all_data_anime_from_page(next_page, limit=limit)
        else:
            print(response.text)
    def get_data_of_anime(self, anime_data):
        get_details_page = self.req.get(
            anime_data['url'], headers=self.headers, timeout=10)
        if get_details_page.status_code == 200:
            page = get_details_page.text
            # Scrape each field from the detail page; a missing field raises
            # here and is swallowed by the caller's future.result() handler.
            data = {
                'url': anime_data['url'],
                'title': re.search(r"h1\sclass=\"title-name.*<strong>(.*?)<\/h1>", page)[1].replace("</strong>", ""),
                'eps': 'Episodes : ' + str(anime_data['eps']),
                'score': re.search(r"<span itemprop=\"ratingValue\" class=\"score-label.*\">(.*?)<\/span><sup>", page)[1],
                'rank': re.search(r"span class=\"dark_text\">Ranked:<\/span>\s+(.*?)<sup>", page)[1],
                'popularity': 'Popularity: ' + re.search(r"span class=\"dark_text\">Popularity:<\/span>\s+(.*?)\s+<\/div>", page)[1],
                'members': re.search(r"span class=\"dark_text\">Members:<\/span>\s+(.*?)\s+<\/div>", page)[1],
                'themes': 'Themes: ' + ', '.join(re.findall(r"itemprop=\"genre\".*\">(.*?)<\/span>", page)),
                'studios': 'Studios: ' + re.search(r"span class=\"dark_text\">Studios:<\/span>\s+.*title=\"(.*?)\">.*<\/a>", page)[1],
            }
            self.anime_data_lists.append(data)
            print(data)
        else:
            print(get_details_page.text)
    def save_to_csv(self, file_name):
        df = pd.DataFrame(self.anime_data_lists)
        df.to_csv(file_name, index=False)

if __name__ == "__main__":
    ma = MyAnimeList()
    ma.get_all_data_anime_from_page(limit=50)
    ma.save_to_csv("anime.csv")
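
A minimal sketch of inspecting the output, assuming the script above has been run and produced anime.csv in the working directory (the column names come from the keys of the data dict built in get_data_of_anime):

import pandas as pd

# Load the scraped results; one row per anime detail page.
df = pd.read_csv("anime.csv")
print(df[['title', 'score', 'rank']].head(10))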