# English pre-trained tokenizer applied to the same sentence in 3 languages (en, pt, fr).
# Prints each text, its blank-separated tokens, its tokens/ids from `tokenizer_en`,
# the decoded text, and finally draws a bar chart comparing the two token counts.
# NOTE: `tokenizer_en`, `TitledStr`, `np` and `plt` are expected to be defined earlier
# in the file (not visible from this section).

# text in 3 languages to be tokenized
text_en = 'Jacques-Germain Soufflot (Irancy, July 22, 1713 - Paris, August 29, 1780) was a French architect, initiator of the architectural style of Neoclassicism.'
text_pt = 'Jacques-Germain Soufflot (Irancy, 22 de julho de 1713 — Paris, 29 de agosto de 1780) foi um arquitecto francês, iniciador do estilo arquitectónico do Neoclassicismo.'
text_fr = 'Jacques-Germain Soufflot (Irancy, 22 juillet 1713 - Paris, 29 août 1780) était un architecte français, initiateur du style architectural du néoclassicisme.'

langs = ['en', 'pt', 'fr']
texts = [text_en, text_pt, text_fr]

for lang, text in zip(langs, texts):
    print(f'({lang}) {TitledStr(text)}\n')

# number and list of classical tokens (ie, tokens separated by a blank)
# The same split is used for the count and for the displayed list so the two
# can never disagree (the chart legend below also advertises split(" ")).
for lang, text in zip(langs, texts):
    words = text.split(" ")
    print(f'({lang} - {len(words)} tokens) {TitledStr(words)}\n')

# number and list of tokens
# after the text tokenization by imported BPE GPT2TokenizerFast (trained with an English corpus...)
for lang, text in zip(langs, texts):
    toks = tokenizer_en.tokenize(text)
    print(f'({lang} - {len(toks)} tokens) {TitledStr(toks)}\n')

# Encode every text once; the ids are reused for printing, decoding and the chart
# instead of re-running the tokenizer in each section.
encoded_texts = [tokenizer_en.encode(text) for text in texts]

# number and list of tokens ids
# after the text tokenization + numerization by imported BPE GPT2TokenizerFast (trained with an English corpus...)
for lang, toks_ids in zip(langs, encoded_texts):
    print(f'({lang} - {len(toks_ids)} tokens) {TitledStr(toks_ids)}\n')

# decode (back to the text)
for lang, toks_ids in zip(langs, encoded_texts):
    text_decoded = tokenizer_en.decode(toks_ids)
    print(f'({lang}) {TitledStr(text_decoded)}\n')

# graph
# source: https://matplotlib.org/3.2.1/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py
text_split = [len(text.split(" ")) for text in texts]
toks_split = [len(toks_ids) for toks_ids in encoded_texts]

labels = langs

# One group of two bars per language, centred on 1, 2, ..., len(labels).
# Positions are derived from the number of languages so adding a language to
# `langs`/`texts` does not require touching the plotting code.
bar_width = 0.4
x_ticks = np.arange(1, len(labels) + 1)
xy = x_ticks - bar_width / 2  # left bar of each group: blank-separated tokens
xz = x_ticks + bar_width / 2  # right bar of each group: tokenizer tokens
y = text_split
z = toks_split

ax = plt.subplot(111)
# Labels are attached to the bars themselves so the legend cannot get out of
# sync with the plotting order.
ax.bar(xy, y, width=bar_width, color='b', align='center', label='split(" ")')
ax.bar(xz, z, width=bar_width, color='g', align='center', label='GPT2TokenizerFast (en)')

ax.set_xlabel('languages')
ax.set_xticks(x_ticks)
ax.set_xticklabels(labels)

ax.set_ylabel('number of tokens')
ax.legend()
ax.set_title('Number of tokens by tokenization method and lang')

plt.show()