Created
May 6, 2025 17:28
-
-
Save pedramamini/ae9a881b13d89faf0a46d43f0b30bc7d to your computer and use it in GitHub Desktop.
Revisions
-
pedramamini created this gist
May 6, 2025. There are no files selected for viewing.
#!/usr/bin/env python3
"""Count the tokens in a file or in stdin using a tiktoken encoding.

Usage:
    <script>                  read stdin, default encoding
    <script> - (or --)        read stdin, default encoding
    <script> FILE             read FILE, default encoding
    <script> ENCODING         read stdin with ENCODING (used when no file
                              named ENCODING exists)
    <script> ENCODING FILE    read FILE with ENCODING
"""
import sys

try:
    import tiktoken
except ImportError:
    # Deferred: the absence is reported when counting is attempted, so the
    # file-reading helpers stay importable without the package.
    tiktoken = None

DEFAULT_ENCODING = "cl100k_base"


def count_tokens(encoding_name, text):
    """Return the number of tokens in *text* under the named tiktoken encoding.

    Special-token markers that appear literally in the input (for example
    ``<|endoftext|>``) are tokenized as ordinary text. tiktoken's default is
    to raise ``ValueError`` on them, which would make the counter fail on
    arbitrary input.

    Raises:
        ImportError: if the ``tiktoken`` package is not installed.
        ValueError: if *encoding_name* is not an encoding tiktoken knows.
    """
    if tiktoken is None:
        raise ImportError(
            "the 'tiktoken' package is required (pip install tiktoken)"
        )
    enc = tiktoken.get_encoding(encoding_name)
    return len(enc.encode(text, disallowed_special=()))


def safe_read_file(path):
    """Read the file at *path* as UTF-8, dropping undecodable bytes."""
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def safe_read_stdin():
    """Read all of stdin as UTF-8, dropping undecodable bytes.

    The raw binary stream (``sys.stdin.buffer``) is decoded by hand so that
    the ``errors="ignore"`` policy applies regardless of how the text-mode
    stdin wrapper was configured.
    """
    try:
        return sys.stdin.buffer.read().decode("utf-8", errors="ignore")
    except AttributeError:
        # stdin was replaced by an object without a binary buffer
        # (e.g. io.StringIO); fall back to its text interface.
        return sys.stdin.read()


def _fail(message):
    """Print *message* to stderr and exit with status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


def main():
    """Parse ``sys.argv``, read the input text and print its token count.

    A lone argument is ambiguous: it is first tried as a file path and, only
    if no such file exists, reinterpreted as an encoding name with the text
    taken from stdin. Arguments beyond the second are ignored.
    """
    encoding_name = DEFAULT_ENCODING
    text = ""
    args = sys.argv[1:]
    # True when a lone argument was reinterpreted as an encoding name; used
    # to give a more helpful message if that guess turns out to be wrong.
    guessed_encoding = False

    if not args or args[0] in ("-", "--"):
        # No arguments (or "-" / "--"): stdin with the default encoding.
        text = safe_read_stdin()
    elif len(args) == 1:
        try:
            text = safe_read_file(args[0])
        except FileNotFoundError:
            # Not a file: treat the argument as an encoding name instead.
            encoding_name = args[0]
            guessed_encoding = True
            text = safe_read_stdin()
        except OSError as exc:
            # Exists but is unreadable (directory, permissions, ...).
            _fail(f"Error: Cannot read '{args[0]}': {exc}")
    else:
        encoding_name, file_path = args[0], args[1]
        try:
            text = safe_read_file(file_path)
        except FileNotFoundError:
            _fail(f"Error: File '{file_path}' not found.")
        except OSError as exc:
            _fail(f"Error: Cannot read '{file_path}': {exc}")

    try:
        total = count_tokens(encoding_name, text)
    except ImportError as exc:
        _fail(f"Error: {exc}")
    except ValueError as exc:
        # tiktoken signals an unknown encoding name with ValueError.
        hint = ""
        if guessed_encoding:
            hint = (
                f" ('{encoding_name}' is not an existing file,"
                " so it was treated as an encoding name)"
            )
        _fail(f"Error: {exc}{hint}")

    print(total)


if __name__ == "__main__":
    main()