{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "9GmF1MAYFM8C" }, "source": [ "# Training a new tokenizer from an old one" ] }, { "cell_type": "markdown", "metadata": { "id": "i6Ckxh5KFM8E" }, "source": [ "Install the Transformers, Datasets, and Evaluate libraries to run this notebook." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "CNqoBxR5FM8F" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", "Requirement already satisfied: datasets in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (2.14.0)\n", "Requirement already satisfied: evaluate in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (0.4.0)\n", "Requirement already satisfied: transformers in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (4.31.0)\n", "Requirement already satisfied: sentencepiece in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (0.1.99)\n", "Requirement already satisfied: tokenizers in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (0.13.3)\n", "Requirement already satisfied: accelerate in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (0.19.0)\n", "Collecting accelerate\n", " Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)\n", " -------------------------------------- 244.2/244.2 kB 3.0 MB/s eta 0:00:00\n", "Requirement already satisfied: numpy>=1.17 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (1.23.5)\n", "Requirement already satisfied: packaging in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (22.0)\n", "Requirement already satisfied: requests>=2.19.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (2.25.1)\n", "Requirement already satisfied: huggingface-hub<1.0.0,>=0.14.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (0.14.1)\n", "Requirement already satisfied: dill<0.3.8,>=0.3.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (0.3.6)\n", "Requirement already satisfied: pyarrow>=8.0.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (12.0.0)\n", "Requirement already satisfied: xxhash in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (3.2.0)\n", "Requirement already satisfied: multiprocess in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (0.70.14)\n", "Requirement already satisfied: fsspec[http]>=2021.11.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (2023.5.0)\n", "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (6.0)\n", "Requirement already satisfied: tqdm>=4.62.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (4.65.0)\n", "Requirement already satisfied: pandas in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (2.0.1)\n", "Requirement already satisfied: aiohttp in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from datasets) (3.8.4)\n", "Requirement already satisfied: responses<0.19 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from evaluate) (0.18.0)\n", "Requirement already satisfied: safetensors>=0.3.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from transformers) (0.3.1)\n", "Requirement already satisfied: 
regex!=2019.12.17 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from transformers) (2023.5.5)\n", "Requirement already satisfied: filelock in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from transformers) (3.12.0)\n", "Requirement already satisfied: torch>=1.10.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from accelerate) (2.0.1+cu118)\n", "Requirement already satisfied: psutil in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from accelerate) (5.9.5)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already satisfied: attrs>=17.3.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from aiohttp->datasets) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from aiohttp->datasets) (2.0.4)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from huggingface-hub<1.0.0,>=0.14.0->datasets) (4.5.0)\n", "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from requests>=2.19.0->datasets) (2.10)\n", "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from requests>=2.19.0->datasets) (2023.5.7)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from requests>=2.19.0->datasets) (4.0.0)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from requests>=2.19.0->datasets) (1.26.14)\n", "Requirement already satisfied: jinja2 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from torch>=1.10.0->accelerate) (3.1.2)\n", "Requirement already satisfied: sympy in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from torch>=1.10.0->accelerate) (1.11.1)\n", "Requirement already satisfied: networkx in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from torch>=1.10.0->accelerate) (2.8.4)\n", "Requirement already satisfied: colorama in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from tqdm>=4.62.1->datasets) (0.4.6)\n", "Requirement already satisfied: tzdata>=2022.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from pandas->datasets) (2023.3)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from pandas->datasets) (2023.3)\n", "Requirement already satisfied: six>=1.5 in 
c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\doron\\appdata\\roaming\\python\\python310\\site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.2)\n", "Requirement already satisfied: mpmath>=0.19 in c:\\users\\doron\\anaconda3\\envs\\py310\\lib\\site-packages (from sympy->torch>=1.10.0->accelerate) (1.2.1)\n", "Installing collected packages: accelerate\n", " Attempting uninstall: accelerate\n", " Found existing installation: accelerate 0.19.0\n", " Uninstalling accelerate-0.19.0:\n", " Successfully uninstalled accelerate-0.19.0\n", "Successfully installed accelerate-0.21.0\n" ] } ], "source": [ "!pip install --upgrade datasets evaluate transformers sentencepiece tokenizers accelerate\n", "#!apt install git-lfs" ] }, { "cell_type": "markdown", "metadata": { "id": "TZxkkRZXFM8G" }, "source": [ "You will need to setup git, adapt your email and name in the following cell." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Ba_CYyHDFM8G" }, "outputs": [], "source": [ "!git config --global user.email \"doronadler@gmail.com\"\n", "!git config --global user.name \"Doron Adler\"" ] }, { "cell_type": "markdown", "metadata": { "id": "VBQxAoBYFM8H" }, "source": [ "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qR19bo_2FM8I" }, "outputs": [], "source": [ "from huggingface_hub import notebook_login\n", "\n", "notebook_login()" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "hUx3zfy7FM8I" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/miniconda3/envs/pytorch2/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "Downloading readme: 100%|██████████| 286/286 [00:00<00:00, 766kB/s]\n", "Downloading data: 100%|██████████| 663M/663M [00:33<00:00, 20.1MB/s] \n", "Downloading data: 100%|██████████| 323M/323M [00:16<00:00, 19.1MB/s] \n", "Downloading data: 100%|██████████| 386M/386M [00:23<00:00, 16.3MB/s] \n", "Downloading data: 100%|██████████| 190M/190M [00:09<00:00, 20.3MB/s] \n", "Downloading data: 100%|██████████| 147M/147M [00:08<00:00, 17.7MB/s] \n", "Downloading data: 100%|██████████| 147M/147M [00:07<00:00, 18.9MB/s] \n", "Downloading data: 100%|██████████| 144M/144M [00:07<00:00, 20.2MB/s] \n", "Downloading data: 100%|██████████| 147M/147M [00:07<00:00, 19.1MB/s] \n", "Downloading data: 100%|██████████| 147M/147M [00:08<00:00, 18.2MB/s] \n", "Downloading data: 100%|██████████| 146M/146M [00:07<00:00, 18.3MB/s] \n", "Downloading data: 100%|██████████| 326M/326M [00:18<00:00, 17.9MB/s] \n", "Generating train split: 100%|██████████| 2188612/2188612 [00:04<00:00, 536444.11 examples/s]\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "# This can take a few minutes to load, so grab a coffee or tea while you wait!\n", "#raw_datasets = load_dataset(\"Norod78/hewiki-20220901-articles-dataset\")\n", "raw_datasets = load_dataset(\"Norod78/Hebrew-corpus-other\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['text'],\n", " num_rows: 2188612\n", "})" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets['train']" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset({\n", " features: ['text'],\n", " num_rows: 43773\n", "})\n" ] } ], "source": [ "raw_datasets = raw_datasets['train'].train_test_split(test_size=0.02, seed=42)\n", "\n", "print(raw_datasets['test'])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "bpoiLcGpFM8J" }, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['text'],\n", " num_rows: 2144839\n", " })\n", " test: Dataset({\n", " features: ['text'],\n", " num_rows: 43773\n", " })\n", "})" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "1ZmzNR_PFM8K" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "כאן: פתחנו מלא פודקאסטים אז אנחנו המצאנו את הפודקאסטים בארץ \\n הוא ממציא הדאבל אלבו. מתי יוצא הפרק של TALKYO?\\n האם ידידי יודע שישנם פודקאסטים ותיקים באנגלית המזכירים מאוד כמה מהפודקאסטים של כאן?\\n\\nהמאבק הבא של הארץ. של הפרוגרסיבים. של האנטי-ציוניים במדינת ישראל הוא היותה של מדינת ישראל מדינת כל אזרחיה. מה שרחש מתחת לקרקע יעלה כעת בכל כוחו לפרונט.\\n ותודות לנפתלי בנט לגדעון סער ולאלקין\\n בהנהגת הרפורמי בנט ☠💀☠\\n\\nגליה רהב הבוקר ברדיו: קריטי לחסן עכשיו בחיסון שלישי את מדוכאי החיסון. מעריכה שנתחיל עוד השבוע. אין כמובן קשר לזה שבסוף החודש החיסונים פגי תוקף ולפיכך הילדים (שצריכים 2 זריקות). כבר לא קהל יעד. סוכנת מכירות על מלא מלא\\n שתהווה דוגמא ותתחסן ראשונה. 
היא בהחלט מדוכאת מוח וזה דיכוי חיסוני ממדרגה ראשונה\\n עזוב את פג התוקף, אפילו ה FDA צחק להם בפרצוף על הבקשה הזו.\\n פרזנטורית על מלא\\n *פרנסה*\\n חחחח\\n אתה גם נחשב למדוכא חיסון....\\n אפשר להיות רגועים https://t.co/KuXiobwP4r\\n היא שטן בלתי נסבלת\\n צריך לשלול מהאישה הזאת את רישיון הרפואה\\n ברורררררר\\n אנשים במדינה 'מדוכאי גליה רהב', לא 'מדוכאי חיסון'.\\n\n" ] } ], "source": [ "print(raw_datasets[\"train\"][656645][\"text\"])\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " הזיה, מה קורה איתו באמת?\\n אופיר לובל. הפך לבמאי שעבד בין היתר במערכונים של ארץ נהדרת, הסרט ״מה כבר יכול לקרות״ ובלקספייס\\n\\nאיזה מוזר זה בוקר חופשי בלי ילדים https://t.co/ZU7rVGlkJN\\n ירידה דראסטית באיכות התוכן\\n פרסום ראשון: בלי הילדים אבי אטיאס הוא בעצם דור פרץ\\n תמחק את החיוך לפני שמתקשרים אלייך כי הילדה העלתה חום. לא צועקים יש לפני הגול כמאמר הקלישאה\\n ילדים לא, כיווץ מצח למצלמה✅\\n\\nאתמול מישהו תקף אותי על שנתתי לייק לציוץ של פוליטיקאי. אז אני פה כדי להזכיר לכולם שאעשה לייקים לכל ציוץ שאני אוהב ולכל מי שאני רוצה. ואגב, לייקים בשונה מהמון דברים לא עולה כסף (שימו לב למספר הלייקים שעשיתי עד כה) ולכן, גם לציוץ הזה אתם מוזמנים ללייק בכיף. https://t.co/dI0Ic4c8y2\\n יש לך 165 שאתה עוקב אחריהם.\\n כל הכבוד לך, אל תתן להם לשנות אותך\\n לכל ציוץ שאתה אוהב ורוצה..ולי כמובן..\\n נראה לך שמאמינים לך?\\n חחחח, קמצן. https://t.co/G8D7glsm2h\\n\\n לא עזב. יחד עם מיקי חיימוביץ ורם שפע הצביע נגד הקואליציה במארב מתוכנן והפיל את הממשלה הפריטטית ששלושתם היו חלק ממנה. זו לא היתה החלטה קלה להפיל ממשלה ולגרור את המדינה לבחירות רביעיות תוך שנתים. זמיר הביע כבר זמן מה לפני כן את חוסר נוחותו מהתנהלות הממשלה וגם התפטר מתפקידו כשר.\\n\\n יותר חרא מברגיל? 🤣🤣🤣\\n רעיון טוב\\n\n" ] } ], "source": [ "print(raw_datasets[\"test\"][6546][\"text\"])\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "xkxxKLVdFM8L" }, "outputs": [], "source": [ "def get_training_corpus():\n", " dataset = raw_datasets[\"train\"]\n", " for start_idx in range(0, len(dataset), 1000):\n", " samples = dataset[start_idx : start_idx + 1000]\n", " yield samples[\"text\"]\n", "\n", "#def get_training_corpus():\n", "# for i in range(0, len(raw_datasets[\"test\"]), 1000):\n", " #yield raw_datasets[\"test\"][i : i + 1000][\"text\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "4ynOkDnVFM8O" }, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "old_tokenizer = AutoTokenizer.from_pretrained(\"openai-community/gpt2\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "78\n" ] } ], "source": [ "example = '''שלום לכולם:\n", " \"\"\"האיש האחרון עליי אדמות ישב לבד בחדרו כשלפתע.\"\"\"\n", " Hello world'''\n", "print(len(old_tokenizer.tokenize(example)))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "myM5RipKFM8P" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 14000) #50000\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "-DBFJk_eS6t1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29\n", "78\n" ] } ], "source": [ "example = '''שלום לכולם:\n", " \"\"\"האיש האחרון עליי אדמות ישב לבד בחדרו כשלפתע.\"\"\"\n", " Hello world'''\n", "tokens = tokenizer.tokenize(example)\n", "print(len(tokens))\n", 
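"# Compare token counts: the retrained tokenizer needs 29 tokens for this Hebrew example,\n", "# while the original GPT-2 tokenizer needs 78 (see the two printed counts below)\n",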
"print(len(old_tokenizer.tokenize(example)))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "6LYw9KL2FM8R" }, "outputs": [ { "data": { "text/plain": [ "('hebrew-14k/tokenizer_config.json',\n", " 'hebrew-14k/special_tokens_map.json',\n", " 'hebrew-14k/vocab.json',\n", " 'hebrew-14k/merges.txt',\n", " 'hebrew-14k/added_tokens.json',\n", " 'hebrew-14k/tokenizer.json')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#tokenizer.save_pretrained(\"gpt2-tokenizer-with-added-hebrew-14k\")\n", "tokenizer.save_pretrained(\"hebrew-14k\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "KNx5PaAgFM8S" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "232c4222c5434abe9b2bbaa90dff6959", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload 1 LFS files: 0%| | 0/1 [00:00