Skip to content

Instantly share code, notes, and snippets.

@khursani8
Last active November 11, 2023 12:50
Show Gist options
  • Save khursani8/984d19f6c3eb1644a6ec74a9a57d2a05 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "8be9502e-62ce-4556-8050-924b1e19f1e1",
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "06635008-1386-4e48-b8a2-cf6609592ea9",
"metadata": {},
"outputs": [],
"source": [
"# Parse the exam dump into question records. Each record begins at a\n",
"# 'no: ' marker and carries 'objektif:', 'soalan:' and 'jawapan:' fields.\n",
"with open('BM-A-pt3') as fopen:\n",
"    text = fopen.read()\n",
"\n",
"questions = []\n",
"for chunk in text.split('no: ')[1:]:\n",
"    chunk = chunk.strip()\n",
"    questions.append({\n",
"        'no': chunk.split('\\n')[0],\n",
"        'objektif': chunk.split('objektif: ')[1].split('\\n')[0],\n",
"        'soalan': chunk.split('soalan:')[1].split('jawapan:')[0].strip(),\n",
"        'jawapan': chunk.split('jawapan: ')[1].split(',')[0].strip(),\n",
"    })"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "13d82f1e-826f-4588-9a34-7b6f3a85b22b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'no': '1',\n",
" 'objektif': 'Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.',\n",
" 'soalan': 'Para ___ tanah air telah dihantar ke negara Jepun untuk membantu mangsa gempa bumi dan tsunami.\\nA. hartawan\\nB. dermawan\\nC. bangsawan\\nD. sukarelawan',\n",
" 'jawapan': 'D'},\n",
" {'no': '2',\n",
" 'objektif': 'Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.',\n",
" 'soalan': 'Kebanyakan barang yang disimpan di dalam stor itu telah rosak ____ tikus.\\nA. digerit\\nB. digigit\\nC. dikesip\\nD. diketip',\n",
" 'jawapan': 'A'},\n",
" {'no': '3',\n",
" 'objektif': 'Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.',\n",
" 'soalan': 'Shafiq cuba mengubah kedudukan meja kayu itu ___ ruang tamu ___ ruang dapur rumahnya.\\nA. dari ... ke\\nB. dari ... kepada\\nC. daripada ... ke\\nD. daripada ... kepada',\n",
" 'jawapan': 'A'}]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"questions[:3]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fa9f2d54-84ba-4fb2-a95b-88111551dab8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"objektif: Lengkapkan ayat-ayat yang berikut dengan memilih jawapan yang paling sesuai.\n",
"soalan: Para ___ tanah air telah dihantar ke negara Jepun untuk membantu mangsa gempa bumi dan tsunami.\n",
"A. hartawan\n",
"B. dermawan\n",
"C. bangsawan\n",
"D. sukarelawan\n",
"\n"
]
}
],
"source": [
"# Preview the prompt layout for one question before running the model.\n",
"example = questions[0]\n",
"prompt = f\"\"\"\n",
"objektif: {example['objektif']}\n",
"soalan: {example['soalan']}\n",
"\"\"\"\n",
"print(prompt)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2bc9f442-6291-4ac5-b174-79f7f16ecd51",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
"import torch\n",
"import logging\n",
"\n",
"# Silence transformers' verbose warnings so the eval loop output stays readable.\n",
"logging.getLogger(\"transformers\").setLevel(logging.CRITICAL)\n",
"\n",
"# 4-bit NF4 quantization with double quantization; compute in bfloat16.\n",
"nf4_config = BitsAndBytesConfig(\n",
"    load_in_4bit=True,\n",
"    bnb_4bit_quant_type='nf4',\n",
"    bnb_4bit_use_double_quant=True,\n",
"    bnb_4bit_compute_dtype=torch.bfloat16,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2a059d94-e484-4467-9032-b3aced07ac46",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "634d6c8164304545be048c3629fb2250",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%|          | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load tokenizer and 4-bit-quantized model.\n",
"model_id = \"mesolitica/malaysian-llama2-7b-32k-instructions\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f2ecaee7-b242-47d5-8e41-2b1cb5f73ae6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:00<00:00, 562015.92it/s]\n"
]
}
],
"source": [
"# Baseline: a constant predictor that always answers 'A'.\n",
"# Its accuracy is the floor any real model should beat.\n",
"# (The original cell also built `device`, `max_tok` and a per-question\n",
"# `prompt` here, none of which were used -- removed as dead code.)\n",
"correct_idx = []\n",
"for i in tqdm(range(len(questions))):\n",
"    if questions[i]['jawapan'][0] == 'A':\n",
"        correct_idx.append(i)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0695b69b-86b4-4be9-90e1-4657cefe7e18",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('lowest score', 27.77777777777778)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"lowest score\",len(correct_idx)/len(questions) * 100"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "83171e99-ce62-4d8b-b258-b32d8c7f32a6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"  0%|          | 0/54 [00:00<?, ?it/s]/home/sani/miniconda3/envs/franken/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:381: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
"  warnings.warn(\n",
"/home/sani/miniconda3/envs/franken/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:386: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
"  warnings.warn(\n",
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [00:46<00:00,  1.16it/s]\n"
]
}
],
"source": [
"# Score the model: greedily generate an answer per question and count\n",
"# matches on the first character (answers are single letters A-D).\n",
"device = torch.device(\"cuda\")\n",
"max_new_tok = 64  # generation budget per question; answers are one letter\n",
"correct_idx = []\n",
"for i in tqdm(range(len(questions))):\n",
"    row = questions[i]\n",
"    prompt = f\"\"\"\n",
"objektif: {row['objektif']}\n",
"soalan: {row['soalan']}\n",
"Jawapan:\n",
"\n",
"\"\"\".strip()\n",
"    try:\n",
"        msg = [\n",
"            {\"role\": \"system\", \"content\": \"Anda sebagai pakar menjawab soalan yang mahir berbahasa. Sila berfikir dgn bijak dan berhati-hati ketika menjawab. Sila menjawab dengan satu huruf sahaja.\"},\n",
"            {\"role\": \"user\", \"content\": prompt},\n",
"        ]\n",
"        encodeds = tokenizer.apply_chat_template(msg, return_tensors=\"pt\")\n",
"        model_inputs = encodeds.to(device)\n",
"        generated_ids = model.generate(model_inputs, max_new_tokens=max_new_tok, do_sample=False)\n",
"        decoded = tokenizer.batch_decode(generated_ids)[0].split('[/INST]')[-1].replace(\"Jawapan: \", \"\").strip()\n",
"        # Guard: generation can come back empty; indexing decoded[0] would\n",
"        # raise IndexError and the broad except below would hide it.\n",
"        if decoded and decoded[0] == row['jawapan'][0]:\n",
"            correct_idx.append(i)\n",
"    except Exception as e:\n",
"        # Best-effort eval: report which question failed and keep going.\n",
"        print(f'question {i} failed: {e}')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "eddafca2-9618-4fee-8d10-4faf292836cd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"37.03703703703704"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(correct_idx)/len(questions) * 100"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "400b777c-a15a-4deb-8697-0a79a3826b09",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1, 2, 6, 13, 14, 15, 16, 19, 23, 24, 25, 33, 37, 41, 42, 44, 45, 47, 48, 51]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"correct_idx"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09e9f59b-fa72-408b-a5b7-f152ced730fe",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment