# 文字轉 Unicode 碼位較為簡單,用 ord(x) 即可
import re
def word2unicode(x: str) -> str:
    """Return the Unicode code point of a single character as uppercase hex.

    The result carries no ``0x`` prefix and no zero padding, e.g. ``"字"``
    gives ``"5B57"`` and ``"A"`` gives ``"41"``.

    Args:
        x: Exactly one character.

    Returns:
        The code point of ``x`` in uppercase hexadecimal.

    Raises:
        TypeError: If ``x`` is not a single character (raised by ``ord``).
    """
    # The "X" format spec yields uppercase hex without the "0x" prefix, so
    # no regex clean-up or separate upper() pass is required.
    return format(ord(x), "X")
word2unicode("字") # 5B57
| # GPT2 BPE-Tokenizer token 轉 utf-8 處理 | |
| # 轉換僅針對不在詞表內,以bytes形式表達的token(如中文字) | |
| from transformers import AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained("gpt2") | |
| word = "台" | |
| tokens = tokenizer.convert_ids_to_tokens(tokenizer(word,add_special_tokens=False)["input_ids"]) | |
| print("tokens:",tokens) | |
| # 轉 utf-8 | 
| 一 | |
| 丁 | |
| 七 | |
| 三 | |
| 下 | |
| 丈 | |
| 上 | |
| 丑 | |
| 丐 | |
| 不 | 
| # https://huggingface.co/docs/transformers/perplexity | |
| from typing import Any | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| class PPL(): | |
| def __init__(self, model_id="gpt2") -> None: | |
| self.model = AutoModelForCausalLM.from_pretrained(model_id) | |
| self.tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| self.device = 'cpu' | 
| <s>[INST] <<SYS>>你是一位中文母語使用者,你只能用中文對話<</SYS>>hello [/INST] *你好* (nǐ hǎo) </s> | |
| <s>[INST] 你是誰 [/INST] *我是 líng* (wǒ shì líng) - I am Chinese. </s> | |
| <s>[INST] 說個笑話來聽聽 [/INST] *笑* (xì) - Sure, here's a Chinese joke for you </s> | 
| # $ pip install deepspeed>=0.9.3 | |
| # $ deepspeed deepspeed_inference.py | |
| import os | |
| import deepspeed | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| local_rank = int(os.getenv("LOCAL_RANK", "0")) |