### Install requirements

In [None]:
!pip install -q tiktoken transformers

### Setup

In [None]:

# Per-model metadata merged into each generated `tokenizer_config.json`.
# Iteration order of this dict is the order in which models are converted and validated.
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
MODEL_INFO = {
  # GPT-2 and GPT-3 models (r50k_base)
  'gpt2': {'tokenizer_class': 'GPT2Tokenizer', 'model_max_length': 1024},
  'davinci': {'tokenizer_class': 'GPT3Tokenizer', 'model_max_length': 2048},  # (gpt-3)

  # GPT-3.5 and GPT-4 models (cl100k_base)
  'gpt-3.5-turbo': {'tokenizer_class': 'GPT3_5Tokenizer', 'model_max_length': 4096},
  'gpt-3.5-turbo-16k': {'tokenizer_class': 'GPT3_5Tokenizer', 'model_max_length': 16384},
  'gpt-4': {'tokenizer_class': 'GPT4Tokenizer', 'model_max_length': 8192},
  'text-embedding-ada-002': {'tokenizer_class': 'GPT4Tokenizer', 'model_max_length': 8192},

  # Codex models (p50k_base)
  'text-davinci-002': {'tokenizer_class': 'CodexTokenizer', 'model_max_length': 4096},
  'text-davinci-003': {'tokenizer_class': 'CodexTokenizer', 'model_max_length': 4096},
}


In [None]:
import json
import os

import tiktoken
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
from typing import Dict, Optional

# Lookup table built once by the GPT-2 tokenizer helper from `transformers`.
# `token_bytes_to_string` indexes it by integer byte value and joins the looked-up entries into a string.
byte_encoder = bytes_to_unicode()

def token_bytes_to_string(b):
  """Map every byte of `b` through the module-level `byte_encoder` table and join the results.

  Args:
    b: Token as a `bytes` object.

  Returns:
    A `str` with one `byte_encoder` entry per input byte, in order.
  """
  # Iterating a bytes object yields its integer byte values (0-255) directly.
  return ''.join(byte_encoder[byte_value] for byte_value in b)

# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
  """Split `token` into byte pieces by repeatedly merging the lowest-ranked adjacent pair.

  Args:
    mergeable_ranks: Maps a byte sequence to its merge rank; lower ranks are merged first.
    token: The byte string to segment.
    max_rank: If given, merging stops as soon as the best available merge has a rank
      greater than or equal to this value.

  Returns:
    The list of byte pieces left once no further merge is allowed.
  """
  pieces = [bytes([value]) for value in token]
  while True:
    best_pos = None
    best_rank = None
    # Scan every adjacent pair; a strict `<` keeps the leftmost pair on rank ties.
    for pos in range(len(pieces) - 1):
      candidate = mergeable_ranks.get(pieces[pos] + pieces[pos + 1])
      if candidate is None:
        continue
      if best_rank is None or candidate < best_rank:
        best_pos, best_rank = pos, candidate

    # Nothing left to merge.
    if best_rank is None:
      break
    # The best merge is at or beyond the cap.
    if max_rank is not None and best_rank >= max_rank:
      break

    assert best_pos is not None
    # Replace the two neighbouring pieces with their concatenation.
    pieces[best_pos:best_pos + 2] = [pieces[best_pos] + pieces[best_pos + 1]]
  return pieces

def generate_vocab_and_merges(encoder):
  """Build a vocabulary dict and a merge-rule list from a tiktoken encoder.

  Args:
    encoder: Object exposing `_mergeable_ranks` (token bytes -> rank) and
      `_special_tokens` (token string -> id).

  Returns:
    A `(vocab, merges)` tuple. `vocab` maps each token string to its rank/id, special
    tokens included. `merges` holds one `"left right"` string per multi-byte token, in
    the iteration order of `_mergeable_ranks`.
  """
  ranks = encoder._mergeable_ranks

  vocab = {}
  merges = []
  for token_bytes, token_rank in ranks.items():
    vocab[token_bytes_to_string(token_bytes)] = token_rank

    # Single-byte tokens are base symbols and have no merge rule.
    if len(token_bytes) > 1:
      # Capping BPE at the token's own rank stops right before the merge that creates it,
      # so exactly two pieces are expected to remain.
      halves = bpe(ranks, token_bytes, max_rank=token_rank)
      assert len(halves) == 2

      merges.append(' '.join(token_bytes_to_string(piece) for piece in halves))

  # Also add special tokens
  vocab.update(encoder._special_tokens)

  return vocab, merges

def convert_tiktoken(model_name, output_dir=None):
  """Export the tiktoken encoding of `model_name` as Hugging Face style tokenizer files.

  Writes `vocab.json`, `tokenizer.json`, `tokenizer_config.json`,
  `special_tokens_map.json` and `merges.txt` into the output directory, creating the
  directory if it does not exist.

  Args:
    model_name: Name passed to `tiktoken.encoding_for_model`; must also be a key of `MODEL_INFO`.
    output_dir: Destination directory. When None, `model_name` is used as the directory.
  """
  if output_dir is None:
    output_dir = model_name

  encoder = tiktoken.encoding_for_model(model_name)
  vocab, merges = generate_vocab_and_merges(encoder)

  end_of_text = "<|endoftext|>"

  # One `added_tokens` entry per special token of the encoding.
  added_tokens = []
  for content, token_id in encoder._special_tokens.items():
    added_tokens.append({
      "id": token_id,
      "content": content,
      "single_word": False,
      "lstrip": False,
      "rstrip": False,
      "normalized": False,
      "special": True,
    })

  # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json
  model_info = MODEL_INFO[model_name]
  tokenizer_config = {
    "add_prefix_space": False,
    "bos_token": end_of_text,
    "clean_up_tokenization_spaces": False,
    "eos_token": end_of_text,
    "unk_token": end_of_text,
    **model_info,  # Adds `model_max_length` and `tokenizer_class`
  }
  # Emit the config with its keys in alphabetical order.
  tokenizer_config = {key: tokenizer_config[key] for key in sorted(tokenizer_config)}

  os.makedirs(output_dir, exist_ok=True)

  # The GPT-3.5 / GPT-4 tokenizer classes get an explicit regex split ahead of the byte-level
  # step; every other class relies on the byte-level pre-tokenizer's own regex.
  needs_regex_split = model_info['tokenizer_class'] in ('GPT3_5Tokenizer', 'GPT4Tokenizer')
  byte_level = {
    "type": "ByteLevel",
    "add_prefix_space": False,
    "trim_offsets": True,
    "use_regex": not needs_regex_split,
  }
  if needs_regex_split:
    regex_split = {
      "type": "Split",
      "pattern": {
        "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
      },
      "behavior": "Removed",
      "invert": True,
    }
    pre_tokenizer = {
      "type": "Sequence",
      "pretokenizers": [regex_split, byte_level],
    }
  else:
    pre_tokenizer = byte_level

  # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json
  decoder = {
    "type": "ByteLevel",
    "add_prefix_space": True,
    "trim_offsets": True,
    "use_regex": True,
  }
  bpe_model = {
    "type": "BPE",
    "dropout": None,
    "unk_token": None,
    "continuing_subword_prefix": "",
    "end_of_word_suffix": "",
    "fuse_unk": False,
    "byte_fallback": False,
    "vocab": vocab,
    "merges": merges,
  }
  tokenizer_json = {
    "version": "1.0",
    "truncation": None,
    "padding": None,
    "added_tokens": added_tokens,
    "normalizer": None,
    "pre_tokenizer": pre_tokenizer,
    "post_processor": None,
    "decoder": decoder,
    "model": bpe_model,
  }

  def write_json(filename, payload):
    # All JSON outputs share the same location, encoding and formatting.
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as fp:
      json.dump(payload, fp, indent=2, ensure_ascii=False)

  # Save to files
  write_json('vocab.json', vocab)
  write_json('tokenizer.json', tokenizer_json)
  write_json('tokenizer_config.json', tokenizer_config)
  write_json('special_tokens_map.json', {
    "bos_token": end_of_text,
    "eos_token": end_of_text,
    "unk_token": end_of_text,
  })

  # Version header line followed by one merge rule per line (no trailing newline).
  with open(os.path.join(output_dir, 'merges.txt'), 'w', encoding='utf-8') as fp:
    fp.write('#version: 0.2\n' + '\n'.join(merges))

### Run conversion

In [None]:
# Convert every model listed in MODEL_INFO, writing each one to `models/<model_name>/`.
output = 'models'
for model_name in MODEL_INFO:
  convert_tiktoken(model_name, os.path.join(output, model_name))

### Validation

In [None]:
# Tests adapted from https://github.com/openai/tiktoken/blob/1b9faf2779855124f05174adf1383e53689ed94b/tests/test_encoding.py
# Each string is encoded and decoded by both tokenizers in the validation cell.
TESTS = [
  "\n\n\n\n\ns1232",
  "hello world", "hello <|endoftext|>", "hello world", "hello <|endoftext|>",
  # Runs of 1 to 17 zeros.
  *("0" * zero_count for zero_count in range(1, 18)),
  "rer", "'rer",
  "today\n ", "today\n \n", "today\n  \n",
  "hello world", "hello world", "hello world",
  " \x850",
  "",
  # Thumbs-up emoji (U+1F44D). Written as an escape so the character cannot be
  # corrupted when the notebook file is re-encoded.
  "\U0001F44D",
  " .",
]

In [None]:
from transformers import GPT2TokenizerFast, logging

# Hide warning messages
logging.set_verbosity_error()

# For every converted model, check that the exported tokenizer files reproduce tiktoken's
# token ids and decoded text on each string in TESTS. Any mismatch raises AssertionError.
output = 'models'
for model_name in MODEL_INFO:
  print('Testing', model_name)
  # Reference tokenizer (tiktoken) and the tokenizer loaded from the files under `models/<model_name>/`.
  og_tokenizer = tiktoken.encoding_for_model(model_name)
  hf_tokenizer = GPT2TokenizerFast.from_pretrained(os.path.join(output, model_name))

  for test in TESTS:
    # Test encoding
    # `<|endoftext|>` appears in TESTS, so it is explicitly allowed as a special token on the tiktoken side.
    og_tokens = og_tokenizer.encode(test, allowed_special={'<|endoftext|>'})
    hf_tokens = hf_tokenizer.encode(test)
    assert og_tokens == hf_tokens, f'ENCODE FAIL: "{test}". {og_tokens} != {hf_tokens}'

    # Test decoding
    # Each tokenizer decodes its own ids; the ids are already known to be equal at this point.
    og_decoded = og_tokenizer.decode(og_tokens)
    hf_decoded = hf_tokenizer.decode(hf_tokens)
    assert og_decoded == hf_decoded, f'DECODE FAIL: "{og_tokens}". {og_decoded} != {hf_decoded}'
