Created August 2, 2025 23:10
TMDB credits dump: asynchronously fetches the credits of every movie in a TMDB daily ID export at roughly 50 requests per second and writes the raw responses to disk in JSON-lines chunks.
import asyncio
import bisect
import datetime
import json
import os
import shutil
import sys
import time
from pathlib import Path

import aiohttp
from tqdm import tqdm
# The TMDB API key must be provided via the environment.
tmdb_api_key = os.environ["TMDB_API_KEY"]
RQ_SEC = 50  # target request rate, in requests per second

# Directory of TMDB daily ID export files (movie_ids_MM_DD_YYYY.json).
dir_movie_ids = Path(__file__).parent / "movie_ids"
if not dir_movie_ids.exists():
    sys.exit(1)
dir_chunks = Path(__file__).parent / "chunks"
dir_chunks.mkdir(exist_ok=True)


def get_credits_url(movie_id):
    return f"https://api.themoviedb.org/3/movie/{movie_id}/credits"
async def get(url, session):
    try:
        async with session.get(url=url, params={"api_key": tmdb_api_key}) as response:
            if response.status == 429:
                print("RATE LIMIT")
                sys.exit(1)
            return await response.read()
    except Exception as e:
        # Falls through and returns None; callers must filter failures out.
        print(f"Unable to get url {url} due to {e.__class__}.")
async def main(dir_save: Path, movie_ids: list[int]):
    # Fetch credits in batches of RQ_SEC ids, pacing each batch to take at
    # least one second so the overall rate stays near RQ_SEC requests/second.
    id_chunk_size = RQ_SEC
    idx_start = 0
    idx_end = idx_start + id_chunk_size
    time_start = 0.0
    file_sizes_in_mb = []
    async with aiohttp.ClientSession() as tmdb_api:
        bar = tqdm(
            total=len(movie_ids), desc="Processing", unit="movies", dynamic_ncols=True
        )
        while idx_start < len(movie_ids):
            chunk_save = dir_save / f"{idx_start}:{idx_end}.json"
            id_chunk = movie_ids[idx_start:idx_end]
            urls = map(get_credits_url, id_chunk)
            # Sleep whatever remains of the one-second budget for this batch.
            time_took = time.time() - time_start
            time_sleep = max(1.0 - time_took, 0)
            await asyncio.sleep(time_sleep)
            ret = await asyncio.gather(*(get(url, tmdb_api) for url in urls))
            time_start = time.time()
            idx_start = idx_end
            idx_end = min(idx_start + id_chunk_size, len(movie_ids))
            # Drop failed requests (get() returns None on error).
            ret_dump = b"\n".join(r for r in ret if r is not None)
            file_sizes_in_mb.append(len(ret_dump) / 1000000.0)
            mb_per_file = sum(file_sizes_in_mb) / len(file_sizes_in_mb)
            # Extrapolate the final dump size from the average chunk size so far.
            predicted_db_size = (len(movie_ids) / id_chunk_size * mb_per_file) / 1000.0
            db_size = sum(file_sizes_in_mb) / 1000.0
            bar.update(len(id_chunk))
            bar.set_postfix(
                {
                    "chunk": f"[{idx_start}:{idx_end}]",
                    "took": f"{time_took:.3f}s",
                    "sleep": f"{time_sleep:.2f}s",
                    "DB": f"{db_size:.2f}/{predicted_db_size:.2f}GB",
                }
            )
            with chunk_save.open("wb") as f:
                f.write(ret_dump)
        bar.close()
    print("Finalized all.")
if __name__ == "__main__":
    fns = list(dir_movie_ids.glob("**/*.json"))

    def parse_fn(fn):
        # Export filenames end in _MM_DD_YYYY.
        month, day, year = fn.stem.split("_")[-3:]
        return datetime.date(day=int(day), month=int(month), year=int(year))

    # Pick the most recent daily export.
    fn = max(fns, key=parse_fn)
    date = parse_fn(fn)
    print("Using latest movie ids")
    print(fn)
    dir_save = dir_chunks / f"{date}"
    print(f"Dumping chunks to {dir_save.relative_to(Path(__file__).parent)}")
    if dir_save.exists():
        shutil.rmtree(dir_save)
    dir_save.mkdir()
    # The export is JSON lines; collect the ids in sorted order.
    movie_ids = []
    with open(fn) as f:
        for line in f:
            bisect.insort(movie_ids, json.loads(line)["id"])
    print(f"loaded {len(movie_ids)} movie ids")
    est_hours = len(movie_ids) / (RQ_SEC * 60 * 60)
    est_minutes = int((est_hours - int(est_hours)) * 60)
    print(
        f"estimated scrape time @ {RQ_SEC}req/s: {int(est_hours)} hours, {est_minutes} minutes"
    )
    start = time.time()
    asyncio.run(main(dir_save, movie_ids))
    end = time.time()
    print(f"took {end - start:.1f}s")
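
The script assumes movie_ids/ already holds at least one TMDB daily ID export (movie_ids_MM_DD_YYYY.json). A minimal sketch of fetching one, assuming TMDB's documented daily-export URL scheme at files.tmdb.org/p/exports; fetch_movie_ids_export is a hypothetical helper, not part of the gist:

import gzip
import urllib.request
from pathlib import Path

def fetch_movie_ids_export(month: int, day: int, year: int) -> Path:
    # Hypothetical helper: download and decompress one daily export into
    # the movie_ids/ directory the script above reads from. The URL scheme
    # is an assumption based on TMDB's daily ID export documentation.
    name = f"movie_ids_{month:02d}_{day:02d}_{year}.json"
    url = f"http://files.tmdb.org/p/exports/{name}.gz"
    dest = Path(__file__).parent / "movie_ids" / name
    dest.parent.mkdir(exist_ok=True)
    with urllib.request.urlopen(url) as resp:
        dest.write_bytes(gzip.decompress(resp.read()))
    return dest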