@manjaroman2
Created August 2, 2025 23:10
Tmdb dump
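A small asyncio scraper: it reads the most recent TMDB daily movie-ID export from a movie_ids/ directory next to the script, fetches /movie/{id}/credits for every ID at roughly RQ_SEC requests per second, and writes the raw responses as newline-separated JSON chunks under chunks/&lt;export-date&gt;/. Requires a TMDB_API_KEY environment variable.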
from pathlib import Path
from tqdm import tqdm
import json
import datetime
import os
import bisect
import time
import aiohttp
import asyncio
import shutil
import sys
tmdb_api_key = os.environ["TMDB_API_KEY"]
RQ_SEC = 50  # target request rate (requests per second)

dir_movie_ids = Path(__file__).parent / "movie_ids"
if not dir_movie_ids.exists():
    print(f"missing directory: {dir_movie_ids}", file=sys.stderr)
    exit(1)

dir_chunks = Path(__file__).parent / "chunks"
dir_chunks.mkdir(exist_ok=True)


def get_credits_url(movie_id):
    return f"https://api.themoviedb.org/3/movie/{movie_id}/credits"


async def get(url, session):
    try:
        async with session.get(url=url, params={"api_key": tmdb_api_key}) as response:
            if response.status == 429:
                print("RATE LIMIT")
                exit(1)
            resp = await response.read()
            # print("Successfully got url {} with resp of length {}.".format(url, len(resp)))
            return resp
    except Exception as e:
        print("Unable to get url {} due to {}.".format(url, e.__class__))
        return b""  # keep the gather() result all-bytes so the join in main() does not fail


async def main(dir_save: Path, movie_ids: list[int], save_chunk_size: int = 1000):
    id_chunk_size = RQ_SEC  # one batch of RQ_SEC requests per second
    idx_start = 0
    idx_end = idx_start + id_chunk_size
    chunk_save = dir_save / f"{idx_start}:{idx_end}.json"
    time_end = 1
    time_start = 0
    mb_per_file = 1
    file_sizes_in_mb = list()
    async with aiohttp.ClientSession() as tmdb_api:
        # Note: ClientSession ignores this attribute; the key is passed per request in get().
        tmdb_api.params = {"api_key": tmdb_api_key}
        bar = tqdm(
            total=len(movie_ids), desc="Processing", unit="movies", dynamic_ncols=True
        )
        while idx_start < len(movie_ids):
            chunk_save = dir_save / f"{idx_start}:{idx_end}.json"
            id_chunk = movie_ids[idx_start:idx_end]
            urls = map(get_credits_url, id_chunk)
            # Throttle: each batch should take at least one second end to end.
            time_end = time.time()
            time_took = time_end - time_start
            time_sleep = max(1.0 - time_took, 0)
            await asyncio.sleep(time_sleep)
            ret = await asyncio.gather(*(get(url, tmdb_api) for url in urls))
            time_start = time.time()
            idx_start = idx_end
            idx_end = min(idx_start + id_chunk_size, len(movie_ids))
            # One chunk file = newline-separated raw JSON credit responses.
            ret_dump = b"\n".join(ret)
            file_sizes_in_mb.append(len(ret_dump) / 1000000.0)
            mb_per_file = sum(file_sizes_in_mb) / len(file_sizes_in_mb)
            # Rough projection of the full dump size in GB from the average chunk size.
            predicted_db_size = (len(movie_ids) / id_chunk_size * mb_per_file) / 1000.0
            db_size = sum(file_sizes_in_mb) / 1000.0
            bar.update(len(id_chunk))
            bar.set_postfix(
                {
                    "chunk": f"[{idx_start}:{idx_end}]",
                    "took": f"{time_took:.3f}s",
                    "sleep": f"{time_sleep:.2f}s",
                    "DB": f"{db_size:.2f}/{predicted_db_size:.2f}GB",
                }
            )
            with chunk_save.open("wb") as f:
                f.write(ret_dump)
        bar.close()
    print("Finalized all.")


if __name__ == "__main__":
    fns = list(dir_movie_ids.glob("**/*.json"))

    def parse_fn(fn):
        # Export file names are expected to end in _MM_DD_YYYY.
        month, day, year = fn.stem.split("_")[-3:]
        return datetime.date(day=int(day), month=int(month), year=int(year))

    # Pick the most recent movie-ID export.
    date = parse_fn(fns[0])
    fn = fns[0]
    for it in fns[1:]:
        if (curr := parse_fn(it)) > date:
            date = curr
            fn = it
    print("Using latest movie ids")
    print(fn)

    dir_save = dir_chunks / f"{date}"
    print(f"Dumping chunks to {dir_save.relative_to(Path(__file__).parent)}")
    if dir_save.exists():
        shutil.rmtree(dir_save)
    dir_save.mkdir()

    # Load the IDs (one JSON object per line) into a sorted list.
    movie_ids = list()
    with open(fn) as f:
        while line := f.readline():
            bisect.insort(movie_ids, json.loads(line)["id"])
    print(f"loaded {len(movie_ids)} movie ids")

    est_hours = len(movie_ids) / (RQ_SEC * 60 * 60)
    est_minutes = int((est_hours - int(est_hours)) * 60)
    print(
        f"estimated scrape time @ {RQ_SEC}req/s: {int(est_hours)} hours, {est_minutes} minutes"
    )

    start = time.time()
    asyncio.run(main(dir_save, movie_ids))
    end = time.time()
    print(f"done in {end - start:.1f}s")
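
For reference, a minimal sketch of reading the chunk files back, assuming they were produced by the script above (chunks/&lt;date&gt;/&lt;start&gt;:&lt;end&gt;.json, one raw credits response per line, empty lines for failed requests) and the usual id/cast/crew fields of a TMDB credits payload:

from pathlib import Path
import json

chunks_dir = Path("chunks")  # as created by the script above
for chunk_file in chunks_dir.glob("*/*.json"):
    with chunk_file.open("rb") as f:
        for line in f:
            if not line.strip():
                continue  # skip entries from failed requests
            credits = json.loads(line)
            print(credits.get("id"), len(credits.get("cast", [])))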