{ "cells": [ { "cell_type": "markdown", "source": [ "### Install requirements" ], "metadata": { "id": "2krxXyYOEsAj" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qD4__VYRE9ep" }, "outputs": [], "source": [ "!pip install -q tiktoken transformers" ] }, { "cell_type": "markdown", "source": [ "### Setup" ], "metadata": { "id": "OcCezqFbEvVN" } }, { "cell_type": "code", "source": [ "\n", "# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n", "MODEL_INFO = {\n", " # GPT-2 and GPT-3 models (r50k_base)\n", " 'gpt2': {\n", " 'tokenizer_class': 'GPT2Tokenizer',\n", " 'model_max_length': 1024,\n", " },\n", " 'davinci': { # (gpt-3)\n", " 'tokenizer_class': 'GPT3Tokenizer',\n", " 'model_max_length': 2048,\n", " },\n", "\n", " # GPT-3.5 and GPT-4 models (cl100k_base)\n", " 'gpt-3.5-turbo': {\n", " 'tokenizer_class': 'GPT3_5Tokenizer',\n", " 'model_max_length': 4096,\n", " },\n", " 'gpt-3.5-turbo-16k': {\n", " 'tokenizer_class': 'GPT3_5Tokenizer',\n", " 'model_max_length': 16384,\n", " },\n", " 'gpt-4': {\n", " 'tokenizer_class': 'GPT4Tokenizer',\n", " 'model_max_length': 8192,\n", " },\n", " 'text-embedding-ada-002': {\n", " 'tokenizer_class': 'GPT4Tokenizer',\n", " 'model_max_length': 8192,\n", " },\n", "\n", " # Codex models (p50k_base)\n", " 'text-davinci-002': {\n", " 'tokenizer_class': 'CodexTokenizer',\n", " 'model_max_length': 4096,\n", " },\n", " 'text-davinci-003': {\n", " 'tokenizer_class': 'CodexTokenizer',\n", " 'model_max_length': 4096,\n", " },\n", "}\n" ], "metadata": { "id": "UuNt2kwgFWbN" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZypJVeIMFQGQ" }, "outputs": [], "source": [ "import json\n", "import os\n", "\n", "import tiktoken\n", "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode\n", "from typing import Dict, Optional\n", "\n", "byte_encoder = bytes_to_unicode()\n", "\n", "def 
token_bytes_to_string(b):\n", " return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])\n", "\n", "# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960\n", "# Re-runs byte-level BPE over token using only merges whose rank is below\n", "# max_rank; called with max_rank equal to the token's own rank, the result\n", "# is the pair of sub-tokens whose merge created that token.\n", "def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:\n", " parts = [bytes([b]) for b in token]\n", " while True:\n", " min_idx = None\n", " min_rank = None\n", " for i, pair in enumerate(zip(parts[:-1], parts[1:])):\n", " rank = mergeable_ranks.get(pair[0] + pair[1])\n", " if rank is not None and (min_rank is None or rank < min_rank):\n", " min_idx = i\n", " min_rank = rank\n", " if min_rank is None or (max_rank is not None and min_rank >= max_rank):\n", " break\n", " assert min_idx is not None\n", " parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]\n", " return parts\n", "\n", "# Build a GPT-2 style vocab (token string -> id) and the ordered merge list\n", "# from a tiktoken Encoding. NOTE(review): relies on private tiktoken\n", "# attributes _mergeable_ranks and _special_tokens.\n", "def generate_vocab_and_merges(encoder):\n", " mergeable_ranks = encoder._mergeable_ranks\n", "\n", " merges = []\n", " vocab = {}\n", " for token, rank in mergeable_ranks.items():\n", " vocab[token_bytes_to_string(token)] = rank\n", "\n", " if len(token) == 1:\n", " continue\n", " merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))\n", " assert len(merged) == 2\n", "\n", " merges.append(' '.join(map(token_bytes_to_string, merged)))\n", "\n", " # Also add special tokens\n", " vocab.update(encoder._special_tokens)\n", "\n", " return vocab, merges\n", "\n", "# Convert the tiktoken tokenizer for model_name into Hugging Face files\n", "# (vocab.json, merges.txt, tokenizer.json, tokenizer_config.json,\n", "# special_tokens_map.json), written to output_dir (defaults to model_name).\n", "def convert_tiktoken(model_name, output_dir=None):\n", " if output_dir is None:\n", " output_dir = model_name\n", "\n", " encoder = tiktoken.encoding_for_model(model_name)\n", "\n", " vocab, merges = generate_vocab_and_merges(encoder)\n", "\n", " # Special tokens, in the added_tokens entry format used by tokenizer.json.\n", " added_tokens = [\n", " {\n", " \"id\": id,\n", " \"content\": content,\n", " \"single_word\": False,\n", " \"lstrip\": False,\n", " \"rstrip\": False,\n", " \"normalized\": False,\n", " \"special\": True,\n", " }\n", " for content, id in encoder._special_tokens.items()\n", " ]\n", "\n", " # 
https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json\n", " tokenizer_config_template = {\n", " \"add_prefix_space\": False,\n", " \"bos_token\": \"<|endoftext|>\",\n", " \"clean_up_tokenization_spaces\": False,\n", " \"eos_token\": \"<|endoftext|>\",\n", " \"unk_token\": \"<|endoftext|>\",\n", " }\n", " tokenizer_config_template.update(MODEL_INFO[model_name]) # Adds `model_max_length` and `tokenizer_class`\n", " tokenizer_config_template = dict(sorted(tokenizer_config_template.items(), key=lambda x: x[0]))\n", "\n", " os.makedirs(output_dir, exist_ok=True)\n", "\n", " # cl100k_base models (GPT-3.5 / GPT-4) get an explicit Split + ByteLevel\n", " # pre-tokenizer sequence carrying their own split regex; other models use\n", " # the stock GPT-2 byte-level pre-tokenizer with its built-in regex.\n", " if MODEL_INFO[model_name]['tokenizer_class'] in ('GPT3_5Tokenizer', 'GPT4Tokenizer'):\n", " pre_tokenizer = {\n", " \"type\": \"Sequence\",\n", " \"pretokenizers\": [\n", " {\n", " \"type\": \"Split\",\n", " \"pattern\": {\n", " \"Regex\": \"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\\\r\\\\n\\\\p{L}\\\\p{N}]?\\\\p{L}+|\\\\p{N}{1,3}| ?[^\\\\s\\\\p{L}\\\\p{N}]+[\\\\r\\\\n]*|\\\\s*[\\\\r\\\\n]+|\\\\s+(?!\\\\S)|\\\\s+\"\n", " },\n", " \"behavior\": \"Removed\",\n", " \"invert\": True,\n", " },\n", " {\n", " \"type\": \"ByteLevel\",\n", " \"add_prefix_space\": False,\n", " \"trim_offsets\": True,\n", " \"use_regex\": False,\n", " }\n", " ]\n", " }\n", " else:\n", " pre_tokenizer = {\n", " \"type\": \"ByteLevel\",\n", " \"add_prefix_space\": False,\n", " \"trim_offsets\": True,\n", " \"use_regex\": True,\n", " }\n", "\n", " # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json\n", " tokenizer_template = {\n", " \"version\": \"1.0\",\n", " \"truncation\": None,\n", " \"padding\": None,\n", " \"added_tokens\": added_tokens,\n", " \"normalizer\": None,\n", " \"pre_tokenizer\": pre_tokenizer,\n", " \"post_processor\": None,\n", " \"decoder\": {\n", " \"type\": \"ByteLevel\",\n", " \"add_prefix_space\": True,\n", " \"trim_offsets\": True,\n", " \"use_regex\": True,\n", " },\n", " \"model\": {\n", " \"type\": \"BPE\",\n", " \"dropout\": None,\n", " \"unk_token\": None,\n", " 
\"continuing_subword_prefix\": \"\",\n", " \"end_of_word_suffix\": \"\",\n", " \"fuse_unk\": False,\n", " \"byte_fallback\": False,\n", " \"vocab\": vocab,\n", " \"merges\": merges,\n", " },\n", " }\n", "\n", "\n", " # Save to files\n", " with open(os.path.join(output_dir, 'vocab.json'), 'w', encoding='utf-8') as fp:\n", " json.dump(vocab, fp, indent=2, ensure_ascii=False)\n", "\n", " with open(os.path.join(output_dir, 'tokenizer.json'), 'w', encoding='utf-8') as fp:\n", " json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False)\n", "\n", " with open(os.path.join(output_dir, 'tokenizer_config.json'), 'w', encoding='utf-8') as fp:\n", " json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False)\n", "\n", " with open(os.path.join(output_dir, 'special_tokens_map.json'), 'w', encoding='utf-8') as fp:\n", " json.dump({\n", " \"bos_token\": \"<|endoftext|>\",\n", " \"eos_token\": \"<|endoftext|>\",\n", " \"unk_token\": \"<|endoftext|>\",\n", " }, fp, indent=2, ensure_ascii=False)\n", "\n", " with open(os.path.join(output_dir, 'merges.txt'), 'w', encoding='utf-8') as fp:\n", " fp.write('#version: 0.2\\n')\n", " fp.write('\\n'.join(merges))" ] }, { "cell_type": "markdown", "source": [ "### Run conversion" ], "metadata": { "id": "wfuFCZRbFMT_" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "O87Zz6Vzhb5C" }, "outputs": [], "source": [ "output = 'models'\n", "for model_name in MODEL_INFO:\n", " convert_tiktoken(model_name, os.path.join(output, model_name))" ] }, { "cell_type": "markdown", "metadata": { "id": "qx6tfE_UwFNB" }, "source": [ "### Validation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "oSRUBMLmwatB" }, "outputs": [], "source": [ "# Tests adapted from https://github.com/openai/tiktoken/blob/1b9faf2779855124f05174adf1383e53689ed94b/tests/test_encoding.py\n", "TESTS = [\n", " \"\\n\\n\\n\\n\\ns1232\", \"hello world\", \"hello <|endoftext|>\", \"hello world\", \"hello <|endoftext|>\", \"0\", 
\"00\", \"000\", \"0000\", \"00000\", \"000000\", \"0000000\", \"00000000\", \"000000000\", \"0000000000\", \"00000000000\", \"000000000000\", \"0000000000000\", \"00000000000000\", \"000000000000000\", \"0000000000000000\", \"00000000000000000\", \"rer\", \"'rer\", \"today\\n \", \"today\\n \\n\", \"today\\n \\n\", \"hello world\", \"hello world\", \"hello world\", \" \\x850\", \"\", \"👍\", \" .\",\n", "]" ] }, { "cell_type": "code", "source": [ "from transformers import GPT2TokenizerFast, logging\n", "\n", "# Hide warning messages\n", "logging.set_verbosity_error()\n", "\n", "output = 'models'\n", "for model_name in MODEL_INFO:\n", " print('Testing', model_name)\n", " og_tokenizer = tiktoken.encoding_for_model(model_name)\n", " hf_tokenizer = GPT2TokenizerFast.from_pretrained(os.path.join(output, model_name))\n", "\n", " for test in TESTS:\n", " # Test encoding\n", " og_tokens = og_tokenizer.encode(test, allowed_special={'<|endoftext|>'})\n", " hf_tokens = hf_tokenizer.encode(test)\n", " assert og_tokens == hf_tokens, f'ENCODE FAIL: \"{test}\". {og_tokens} != {hf_tokens}'\n", "\n", " # Test decoding\n", " og_decoded = og_tokenizer.decode(og_tokens)\n", " hf_decoded = hf_tokenizer.decode(hf_tokens)\n", " assert og_decoded == hf_decoded, f'DECODE FAIL: \"{og_tokens}\". {og_decoded} != {hf_decoded}'\n" ], "metadata": { "id": "ELyGSJM0-yA4" }, "execution_count": null, "outputs": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }