Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save aaaddress1/db8a40aee421617fe50cadf29ff7a228 to your computer and use it in GitHub Desktop.

Select an option

Save aaaddress1/db8a40aee421617fe50cadf29ff7a228 to your computer and use it in GitHub Desktop.

Revisions

  1. @p208p2002 p208p2002 revised this gist Aug 21, 2023. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions gpt2-bpe-tokenizer-token-to-utf8.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,6 @@
    # GPT-2 BPE tokenizer: converting tokens back to UTF-8
    # The conversion only targets tokens that are not in the vocabulary and are instead expressed in byte form (e.g. Chinese characters)

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    word = "台"
  2. @p208p2002 p208p2002 created this gist Aug 21, 2023.
    25 changes: 25 additions & 0 deletions gpt2-bpe-tokenizer-token-to-utf8.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,25 @@
    # GPT2 BPE-Tokenizer token 轉 utf-8 處理
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    word = "台"
    tokens = tokenizer.convert_ids_to_tokens(
        tokenizer(word, add_special_tokens=False)["input_ids"]
    )
    print("tokens:", tokens)

    # Convert to UTF-8.
    # GPT-2's byte-level BPE displays every raw byte as exactly one unicode
    # character. Bytes that are already printable (0x21-0x7E, 0xA1-0xAC,
    # 0xAE-0xFF) are shown as themselves; the remaining 68 bytes are assigned,
    # in ascending order, to the code points starting at U+0100.
    # The shift therefore depends on the byte VALUE, not on its position in the
    # token: it is 256 for 0x00-0x20, 162 for 0x7F-0xA0 and 150 for 0xAD.
    # A fixed "middle token offset" of 162 is only correct by coincidence
    # (e.g. for "台"), so build the real inverse table instead.
    printable_bytes = (
        list(range(0x21, 0x7F))
        + list(range(0xA1, 0xAD))
        + list(range(0xAE, 0x100))
    )
    printable_set = set(printable_bytes)
    remapped_bytes = [b for b in range(256) if b not in printable_set]
    char_to_byte = {chr(b): b for b in printable_bytes}
    char_to_byte.update(
        {chr(256 + n): b for n, b in enumerate(remapped_bytes)}
    )

    tokens = "".join(tokens)  # merge the token pieces into one string
    byte_values = [char_to_byte[ch] for ch in tokens]  # display char -> raw byte
    for byte_value in byte_values:
        # Two-digit hex, so the output lines up with Python's bytes repr.
        print(f"\\x{byte_value:02x}")
    print(word.encode("utf-8"))  # matches what the loop printed

    # tokens: ['åı', '°']
    # \xe5
    # \x8f
    # \xb0
    # b'\xe5\x8f\xb0'