{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode a short Italian sentence with our tokenizer.\n",
    "# NOTE(review): `tokenizer` is built in an earlier cell (not shown here); the\n",
    "# Encoding repr below suggests a HuggingFace `tokenizers` object configured to\n",
    "# pad/truncate to a fixed length of 512 — confirm against the training cell.\n",
    "tokens = tokenizer.encode('ciao, come va?') # 'hi, how are you?'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])"
      ]
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "tokens # the Encoding object: 512 tokens with ids, attention_mask, offsets, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['[CLS]', 'ciao', ',', 'Ġcome', 'Ġva', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]']"
      ]
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "source": [
    "# First 10 string tokens: [CLS]/[SEP] wrap the sentence, [PAD] fills to 512.\n",
    "# The 'Ġ' prefix on 'Ġcome'/'Ġva' is how byte-level BPE (merges.txt) marks a\n",
    "# preceding space in the original text.\n",
    "tokens.tokens[:10] # we can view the tokens here (eg output of merges.txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[1, 16834, 16, 488, 611, 35, 2, 0, 0, 0]"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "# Matching integer ids from vocab.json, aligned position-by-position with the\n",
    "# tokens above: 1=[CLS], 2=[SEP], 0=[PAD].\n",
    "tokens.ids[:10] # and here are the token ids (output of vocab.json)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ML",
   "language": "python",
   "name": "ml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}