{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8be9502e-62ce-4556-8050-924b1e19f1e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "06635008-1386-4e48-b8a2-cf6609592ea9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the raw exam file into a list of question dicts.\n",
    "# Each block starts with 'no: ' and carries objektif, soalan and jawapan fields.\n",
    "with open('BM-A-pt3') as fopen:\n",
    "    text = fopen.read()\n",
    "\n",
    "questions = []\n",
    "for t in text.split('no: ')[1:]:\n",
    "    t = t.strip()\n",
    "    no = t.split('\\n')[0]\n",
    "    objektif = t.split('objektif: ')[1].split('\\n')[0]\n",
    "    soalan = t.split('soalan:')[1].split('jawapan:')[0].strip()\n",
    "    jawapan = t.split('jawapan: ')[1].split(',')[0].strip()\n",
    "    data = {\n",
    "        'no': no,\n",
    "        'objektif': objektif,\n",
    "        'soalan': soalan,\n",
    "        'jawapan': jawapan,\n",
    "    }\n",
    "    questions.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "13d82f1e-826f-4588-9a34-7b6f3a85b22b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'no': '1',\n",
       "  'objektif': 'Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.',\n",
       "  'soalan': 'Para ___ tanah air telah dihantar ke negara Jepun untuk membantu mangsa gempa bumi dan tsunami.\\nA. hartawan\\nB. dermawan\\nC. bangsawan\\nD. sukarelawan',\n",
       "  'jawapan': 'D'},\n",
       " {'no': '2',\n",
       "  'objektif': 'Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.',\n",
       "  'soalan': 'Kebanyakan barang yang disimpan di dalam stor itu telah rosak ____ tikus.\\nA. digerit\\nB. digigit\\nC. dikesip\\nD. diketip',\n",
       "  'jawapan': 'A'},\n",
       " {'no': '3',\n",
       "  'objektif': 'Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.',\n",
       "  'soalan': 'Shafiq cuba mengubah kedudukan meja kayu itu ___ ruang tamu ___ ruang dapur rumahnya.\\nA. dari ... ke\\nB. dari ... kepada\\nC. daripada ... ke\\nD. daripada ... kepada',\n",
       "  'jawapan': 'A'}]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "questions[:3]"
   ]
  },
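  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-sanity-check-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch (not part of the original run): a quick sanity check on the parsed\n",
    "# questions before evaluation. It assumes every 'jawapan' should be a single\n",
    "# letter A-D; adjust the check if the raw BM-A-pt3 file uses a different layout.\n",
    "bad = [q for q in questions if q['jawapan'] not in {'A', 'B', 'C', 'D'}]\n",
    "print('total questions:', len(questions))\n",
    "print('questions with unexpected jawapan:', bad)"
   ]
  },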
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fa9f2d54-84ba-4fb2-a95b-88111551dab8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "objektif: Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.\n",
      "soalan: Para ___ tanah air telah dihantar ke negara Jepun untuk membantu mangsa gempa bumi dan tsunami.\n",
      "A. hartawan\n",
      "B. dermawan\n",
      "C. bangsawan\n",
      "D. sukarelawan\n",
      "\n"
     ]
    }
   ],
   "source": [
    "row = questions[0]\n",
    "prompt = f\"\"\"\n",
    "objektif: {row['objektif']}\n",
    "soalan: {row['soalan']}\n",
    "\"\"\"\n",
    "print(prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2bc9f442-6291-4ac5-b174-79f7f16ecd51",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
    "import torch\n",
    "import logging\n",
    "\n",
    "logging.getLogger(\"transformers\").setLevel(logging.CRITICAL)\n",
    "\n",
    "# 4-bit NF4 quantization with double quantization; compute in bfloat16.\n",
    "nf4_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_quant_type='nf4',\n",
    "    bnb_4bit_use_double_quant=True,\n",
    "    bnb_4bit_compute_dtype=torch.bfloat16\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2a059d94-e484-4467-9032-b3aced07ac46",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "634d6c8164304545be048c3629fb2250",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "p = \"mesolitica/malaysian-llama2-7b-32k-instructions\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(p)\n",
    "model = AutoModelForCausalLM.from_pretrained(p, quantization_config=nf4_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f2ecaee7-b242-47d5-8e41-2b1cb5f73ae6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 54/54 [00:00<00:00, 562015.92it/s]\n"
     ]
    }
   ],
   "source": [
    "# Baseline: count how many questions have 'A' as the correct answer,\n",
    "# i.e. the score a model would get by always guessing 'A'.\n",
    "# The prompt built here only mirrors the evaluation loop below; it is not used.\n",
    "device = torch.device(\"cuda\")\n",
    "max_tok = 256\n",
    "correct_idx = []\n",
    "for i in tqdm(range(len(questions))):\n",
    "    row = questions[i]\n",
    "    prompt = f\"\"\"\n",
    "objektif: {row['objektif']}\n",
    "soalan: {row['soalan']}\n",
    "Jawapan:\n",
    "\n",
    "\"\"\".strip()\n",
    "    if 'A' == row['jawapan'][0]:\n",
    "        correct_idx.append(i)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0695b69b-86b4-4be9-90e1-4657cefe7e18",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('lowest score', 27.77777777777778)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"lowest score\", len(correct_idx) / len(questions) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "83171e99-ce62-4d8b-b258-b32d8c7f32a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/54 [00:00<?, ?it/s]/home/sani/miniconda3/envs/franken/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:381: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
      "  warnings.warn(\n",
      "/home/sani/miniconda3/envs/franken/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:386: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
      "  warnings.warn(\n",
      "100%|██████████| 54/54 [00:46<00:00,  1.16it/s]\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the model: greedy-decode an answer for every question and count it as\n",
    "# correct when the first character after the final [/INST] marker matches jawapan.\n",
    "device = torch.device(\"cuda\")\n",
    "max_tok = 256\n",
    "correct_idx = []\n",
    "for i in tqdm(range(len(questions))):\n",
    "    row = questions[i]\n",
    "    prompt = f\"\"\"\n",
    "objektif: {row['objektif']}\n",
    "soalan: {row['soalan']}\n",
    "Jawapan:\n",
    "\n",
    "\"\"\".strip()\n",
    "    try:\n",
    "        msg = [\n",
    "            {\"role\": \"system\", \"content\": \"Anda sebagai pakar menjawab soalan yang mahir berbahasa. Sila berfikir dgn bijak dan berhati-hati ketika menjawab. Sila menjawab dengan satu huruf sahaja.\"},\n",
    "            {\"role\": \"user\", \"content\": prompt},\n",
    "        ]\n",
    "        encodeds = tokenizer.apply_chat_template(msg, return_tensors=\"pt\")\n",
    "        model_inputs = encodeds.to(device)\n",
    "        generated_ids = model.generate(model_inputs, max_new_tokens=64, do_sample=False)\n",
    "        decoded = tokenizer.batch_decode(generated_ids)[0].split('[/INST]')[-1].replace(\"Jawapan: \", \"\").strip()\n",
    "        if decoded[0] == row['jawapan'][0]:\n",
    "            correct_idx.append(i)\n",
    "    except Exception as e:\n",
    "        print(e)"
   ]
  },
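  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-answer-extraction-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch (not part of the original run): a slightly more robust way to pull\n",
    "# the answer letter out of the decoded text than taking decoded[0]. It assumes the\n",
    "# model may answer with extra words (e.g. 'Jawapan: D. sukarelawan'), so it looks\n",
    "# for the first standalone A-D letter instead.\n",
    "import re\n",
    "\n",
    "def extract_answer_letter(decoded_text):\n",
    "    match = re.search(r'\\b([ABCD])\\b', decoded_text)\n",
    "    return match.group(1) if match else None\n",
    "\n",
    "print(extract_answer_letter('Jawapan: D. sukarelawan'))  # -> D"
   ]
  },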
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "eddafca2-9618-4fee-8d10-4faf292836cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "37.03703703703704"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(correct_idx) / len(questions) * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "400b777c-a15a-4deb-8697-0a79a3826b09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 2, 6, 13, 14, 15, 16, 19, 23, 24, 25, 33, 37, 41, 42, 44, 45, 47, 48, 51]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "correct_idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09e9f59b-fa72-408b-a5b7-f152ced730fe",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}