Skip to content

Instantly share code, notes, and snippets.

@Strangemother
Created September 9, 2025 11:30
Show Gist options
  • Save Strangemother/0f3e5077b268b402ebfc8fc27aec4f8f to your computer and use it in GitHub Desktop.
Save Strangemother/0f3e5077b268b402ebfc8fc27aec4f8f to your computer and use it in GitHub Desktop.
cache tiktoken blobs
"""An example of caching tiktoken data, so I can count tokens offline.
"""
import tiktoken
import hashlib
from pathlib import Path
import shutil
import os
# from tiktoken_ext
blobs = (
# uri (blobpath), expected hash (unrequired)
("https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
"306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",),
("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
"94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",),
("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
"94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",),
("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
"223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",),
("https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
"446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",),
)
DIR = Path("./tiktoken_cache").absolute()
os.environ["TIKTOKEN_CACHE_DIR"] = str(DIR.absolute())
for blobpath, expected in blobs:
cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
filename = Path(blobpath).name
new_filename = cache_key
shutil.copyfile(dir_path / filename, DIR / new_filename)
print(filename, ': ', cache_key)
def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string."""
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment