Skip to content

Instantly share code, notes, and snippets.

@pedramamini
Created May 6, 2025 17:28
Show Gist options
  • Select an option

  • Save pedramamini/ae9a881b13d89faf0a46d43f0b30bc7d to your computer and use it in GitHub Desktop.

Select an option

Save pedramamini/ae9a881b13d89faf0a46d43f0b30bc7d to your computer and use it in GitHub Desktop.

Revisions

  1. pedramamini created this gist May 6, 2025.
    60 changes: 60 additions & 0 deletions ttok.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    #!/usr/bin/env python3
    import sys
    import tiktoken

    # Encoding name used by main() when no encoding is given on the command line.
    DEFAULT_ENCODING = "cl100k_base"

    def count_tokens(encoding_name, text):
        """Count the tokens in ``text`` using the named tiktoken encoding.

        Args:
            encoding_name: Encoding name handed to ``tiktoken.get_encoding``.
            text: The string to tokenize.

        Returns:
            The number of tokens produced by encoding ``text``.
        """
        enc = tiktoken.get_encoding(encoding_name)
        # tiktoken's default (disallowed_special="all") raises ValueError when
        # the input merely contains the literal text of a special token such
        # as "<|endoftext|>". This tool counts tokens in arbitrary input, so
        # disable that check and tokenize such text as ordinary text.
        return len(enc.encode(text, disallowed_special=()))

    def safe_read_file(path):
        """Return the contents of ``path`` decoded as UTF-8.

        Byte sequences that are not valid UTF-8 are dropped instead of
        raising a decoding error.
        """
        handle = open(path, encoding="utf-8", errors="ignore")
        with handle:
            contents = handle.read()
        return contents

    def safe_read_stdin():
        """Return everything available on stdin as text.

        The raw bytes are taken from ``sys.stdin.buffer`` and decoded as UTF-8
        with invalid sequences dropped. If that path raises AttributeError
        (for example, stdin was replaced by an object without ``buffer``),
        the text stream is read directly instead.
        """
        try:
            raw_stream = sys.stdin.buffer
            payload = raw_stream.read()
            decoded = payload.decode("utf-8", errors="ignore")
        except AttributeError:
            # No usable binary layer: fall back to the text-mode stream.
            decoded = sys.stdin.read()
        return decoded

    def main():
        """Print the token count of a file or of stdin.

        Command-line forms:

        * no arguments, or ``-`` / ``--`` as the first argument: read stdin
          and use the default encoding;
        * one argument: read it as a file with the default encoding; if no
          such file exists, treat the argument as an encoding name and read
          stdin;
        * two or more arguments: the first is the encoding name and the
          second is the file path (further arguments are ignored).

        Exits with status 1 and a message on stderr when the file named in
        the two-argument form is missing, or when a single argument is
        neither an existing file nor a known encoding name.
        """
        args = sys.argv[1:]
        encoding_name = DEFAULT_ENCODING

        if not args or args[0] in ("-", "--"):
            # Nothing usable on the command line: count tokens from stdin.
            text = safe_read_stdin()
        elif len(args) == 1:
            candidate = args[0]
            try:
                text = safe_read_file(candidate)
            except FileNotFoundError:
                # Validate before blocking on stdin: a mistyped file name
                # would otherwise wait for input and only then fail with an
                # unknown-encoding traceback.
                if candidate not in tiktoken.list_encoding_names():
                    print(
                        f"Error: '{candidate}' is neither an existing file "
                        "nor a known encoding.",
                        file=sys.stderr,
                    )
                    sys.exit(1)
                encoding_name = candidate
                text = safe_read_stdin()
        else:
            encoding_name, file_path = args[0], args[1]
            try:
                text = safe_read_file(file_path)
            except FileNotFoundError:
                print(f"Error: File '{file_path}' not found.", file=sys.stderr)
                sys.exit(1)

        print(count_tokens(encoding_name, text))

    # Run the command-line entry point only when executed as a script.
    if __name__ == "__main__":
    main()