@manjaroman2
Created August 2, 2025 23:10
Tmdb dump
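A small asyncio scraper: it reads the most recent TMDB daily movie-ID export from a movie_ids/ directory next to the script, fetches /movie/{id}/credits for every ID at roughly RQ_SEC requests per second, and writes the raw responses as newline-separated JSON chunks under chunks/&lt;export-date&gt;/. Requires a TMDB_API_KEY environment variable.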
from pathlib import Path
from tqdm import tqdm
import json
import datetime
import os
import bisect
import time
import aiohttp
import asyncio
import shutil
import sys
tmdb_api_key = os.environ["TMDB_API_KEY"]
RQ_SEC = 50  # target request rate (requests per second)

dir_movie_ids = Path(__file__).parent / "movie_ids"
if not dir_movie_ids.exists():
    print(f"missing directory: {dir_movie_ids}", file=sys.stderr)
    exit(1)

dir_chunks = Path(__file__).parent / "chunks"
dir_chunks.mkdir(exist_ok=True)


def get_credits_url(movie_id):
    return f"https://api.themoviedb.org/3/movie/{movie_id}/credits"


async def get(url, session):
    try:
        async with session.get(url=url, params={"api_key": tmdb_api_key}) as response:
            if response.status == 429:
                print("RATE LIMIT")
                exit(1)
            resp = await response.read()
            # print("Successfully got url {} with resp of length {}.".format(url, len(resp)))
            return resp
    except Exception as e:
        print("Unable to get url {} due to {}.".format(url, e.__class__))
        return b""  # keep the gather() result all-bytes so the join in main() does not fail


async def main(dir_save: Path, movie_ids: list[int], save_chunk_size: int = 1000):
    id_chunk_size = RQ_SEC  # one batch of RQ_SEC requests per second
    idx_start = 0
    idx_end = idx_start + id_chunk_size
    chunk_save = dir_save / f"{idx_start}:{idx_end}.json"
    time_end = 1
    time_start = 0
    mb_per_file = 1
    file_sizes_in_mb = list()
    async with aiohttp.ClientSession() as tmdb_api:
        # Note: ClientSession ignores this attribute; the key is passed per request in get().
        tmdb_api.params = {"api_key": tmdb_api_key}
        bar = tqdm(
            total=len(movie_ids), desc="Processing", unit="movies", dynamic_ncols=True
        )
        while idx_start < len(movie_ids):
            chunk_save = dir_save / f"{idx_start}:{idx_end}.json"
            id_chunk = movie_ids[idx_start:idx_end]
            urls = map(get_credits_url, id_chunk)
            # Throttle: each batch should take at least one second end to end.
            time_end = time.time()
            time_took = time_end - time_start
            time_sleep = max(1.0 - time_took, 0)
            await asyncio.sleep(time_sleep)
            ret = await asyncio.gather(*(get(url, tmdb_api) for url in urls))
            time_start = time.time()
            idx_start = idx_end
            idx_end = min(idx_start + id_chunk_size, len(movie_ids))
            # One chunk file = newline-separated raw JSON credit responses.
            ret_dump = b"\n".join(ret)
            file_sizes_in_mb.append(len(ret_dump) / 1000000.0)
            mb_per_file = sum(file_sizes_in_mb) / len(file_sizes_in_mb)
            # Rough projection of the full dump size in GB from the average chunk size.
            predicted_db_size = (len(movie_ids) / id_chunk_size * mb_per_file) / 1000.0
            db_size = sum(file_sizes_in_mb) / 1000.0
            bar.update(len(id_chunk))
            bar.set_postfix(
                {
                    "chunk": f"[{idx_start}:{idx_end}]",
                    "took": f"{time_took:.3f}s",
                    "sleep": f"{time_sleep:.2f}s",
                    "DB": f"{db_size:.2f}/{predicted_db_size:.2f}GB",
                }
            )
            with chunk_save.open("wb") as f:
                f.write(ret_dump)
        bar.close()
    print("Finalized all.")


if __name__ == "__main__":
    fns = list(dir_movie_ids.glob("**/*.json"))

    def parse_fn(fn):
        # Export file names are expected to end in _MM_DD_YYYY.
        month, day, year = fn.stem.split("_")[-3:]
        return datetime.date(day=int(day), month=int(month), year=int(year))

    # Pick the most recent movie-ID export.
    date = parse_fn(fns[0])
    fn = fns[0]
    for it in fns[1:]:
        if (curr := parse_fn(it)) > date:
            date = curr
            fn = it
    print("Using latest movie ids")
    print(fn)

    dir_save = dir_chunks / f"{date}"
    print(f"Dumping chunks to {dir_save.relative_to(Path(__file__).parent)}")
    if dir_save.exists():
        shutil.rmtree(dir_save)
    dir_save.mkdir()

    # Load the IDs (one JSON object per line) into a sorted list.
    movie_ids = list()
    with open(fn) as f:
        while line := f.readline():
            bisect.insort(movie_ids, json.loads(line)["id"])
    print(f"loaded {len(movie_ids)} movie ids")

    est_hours = len(movie_ids) / (RQ_SEC * 60 * 60)
    est_minutes = int((est_hours - int(est_hours)) * 60)
    print(
        f"estimated scrape time @ {RQ_SEC}req/s: {int(est_hours)} hours, {est_minutes} minutes"
    )

    start = time.time()
    asyncio.run(main(dir_save, movie_ids))
    end = time.time()
    print(f"done in {end - start:.1f}s")
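
For reference, a minimal sketch of reading the chunk files back, assuming they were produced by the script above (chunks/&lt;date&gt;/&lt;start&gt;:&lt;end&gt;.json, one raw credits response per line, empty lines for failed requests) and the usual id/cast/crew fields of a TMDB credits payload:

from pathlib import Path
import json

chunks_dir = Path("chunks")  # as created by the script above
for chunk_file in chunks_dir.glob("*/*.json"):
    with chunk_file.open("rb") as f:
        for line in f:
            if not line.strip():
                continue  # skip entries from failed requests
            credits = json.loads(line)
            print(credits.get("id"), len(credits.get("cast", [])))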