# Imports

In [None]:
import datetime
import json
import os
import random
import re
import time
from datetime import date
from glob import glob
from pathlib import Path

import demoji
import numpy as np
import pandas as pd
import undetected_chromedriver as uc
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

**CSS classes updated on 23/11/2023**

In [None]:
# CSS selectors for the ChatGPT web UI. They are raw strings because the
# backslashes are CSS escapes (e.g. `\:`), not Python escapes; in a normal
# string literal they are invalid escape sequences and trigger a warning.
THREE_DOTS = r"#radix-\:r1p\:"
DELETE_BTN = r"#radix-\:r1q\: > div.flex.gap-2.m-1\.5.rounded.px-5.py-2\.5.text-sm.cursor-pointer.focus\:ring-0.hover\:bg-black\/5.dark\:hover\:bg-white\/5.radix-disabled\:pointer-events-none.radix-disabled\:opacity-50.group.text-red-500"
CONFIRM_DELETE_BTN = ".btn-danger"
RESPONSE_AREA = r".text-token-text-primary .p\-4.gizmo\:py\-2"
HOME_MODAL_CONFIRM_BTN = r"#radix\-\:rh\: button.btn.relative.btn-primary"

# Regex intended to match a JSON array spanning several lines.
JSON_PATTERN = r"\[\n*\s*{(?:\n*.*\n)*\]"

# Exact response texts that are treated as "hourly request limit reached".
REACH_REQUEST_LIMIT_MESSAGE = [
    "!\nYou've reached our limit of messages per hour. Please try again later.",
    "Too many requests in 1 hour. Try again later.",
    "!\nChatGPT\nYou've reached our limit of messages per hour. Please try again later.",
]

In [None]:
# All paths are resolved relative to the directory the notebook runs from.
cwd = Path.cwd()
save_path = cwd / "data/chatgpt"

# Seed data is used for few-shot prompting.
# Read through a context manager so the handle is closed, and decode as UTF-8
# explicitly instead of relying on the platform default encoding.
with open(cwd / "data/train.json", "r", encoding="utf-8") as seed_file:
    seed_instruction_data = json.load(seed_file)
num_prompt_instructions = 4  # Number of shots.

In [None]:
def init_browser(headless=True, version_main=119):
    """Create and return an undetected-chromedriver Chrome instance.

    Args:
        headless: Forwarded to ``uc.Chrome(headless=...)``.
        version_main: Forwarded to ``uc.Chrome(version_main=...)``. Defaults
            to 119, the value that used to be hard-coded here.
            NOTE(review): presumably this must match the major version of the
            locally installed Chrome -- confirm against the uc documentation.

    Returns:
        The driver object returned by ``uc.Chrome``.
    """
    return uc.Chrome(
        headless=headless,
        use_subprocess=False,
        version_main=version_main,
    )
 
def select_element_if_clickable(driver, css_class, t=5):
    """Wait until the element matching a CSS selector is clickable.

    Args:
        driver: Selenium driver to search in.
        css_class: CSS selector of the target element.
        t: Timeout handed to ``WebDriverWait`` (seconds).

    Returns:
        Whatever ``WebDriverWait.until`` yields for the
        ``element_to_be_clickable`` condition, i.e. the located element.
    """
    locator = (By.CSS_SELECTOR, css_class)
    waiter = WebDriverWait(driver, t)
    return waiter.until(EC.element_to_be_clickable(locator))
 
def go_to_home(br):
    """Load the ChatGPT home page and dismiss the start-up modal if present.

    Opens the module-level ``chatgpt_url``, waits 5 seconds, then tries to
    click the element matching ``HOME_MODAL_CONFIRM_BTN``.

    Args:
        br: Selenium driver controlling the browser.
    """
    br.get(chatgpt_url)
    time.sleep(5)
    try:
        br.find_element(By.CSS_SELECTOR, HOME_MODAL_CONFIRM_BTN).click()
    except WebDriverException:
        # Best effort: the modal is not always shown. Only Selenium errors are
        # ignored, so Ctrl-C / kernel interrupts are no longer swallowed.
        pass

def enter_prompt(br, prompt):
    """Put ``prompt`` into the chat box, submit it and wait for the reply.

    The text is injected through ``js_send_keys`` rather than typed, then a
    BACK_SPACE and an ENTER key press are sent to the text area, followed by a
    fixed 15 second pause.

    NOTE(review): the BACK_SPACE presumably exists to make the page register
    the injected text, but it may also remove the prompt's last character --
    confirm.

    Args:
        br: Selenium driver controlling the browser.
        prompt: Text to submit.
    """
    textarea = br.find_element(By.CSS_SELECTOR, "#prompt-textarea")
    textarea.clear()
    js_send_keys(br, textarea, prompt)
    time.sleep(1)
    for key in (Keys.BACK_SPACE, Keys.ENTER):
        textarea.send_keys(key)
    time.sleep(15)


def js_send_keys(br, el, text):
    """Append ``text`` to an element's value by running JavaScript.

    The script adds ``text`` to ``el.value`` and dispatches a ``change``
    event on the element. An empty ``text`` is reported on stdout, but the
    script is still executed.

    Args:
        br: Object exposing ``execute_script(script, *args)``.
        el: Element passed to the script as ``arguments[0]``.
        text: String passed to the script as ``arguments[1]``.
    """
    script = (
        "\n"
        " var elm = arguments[0], txt = arguments[1];\n"
        " elm.value += txt;\n"
        " elm.dispatchEvent(new Event('change'));\n"
        " "
    )
    if text == "":
        print("found null in text")
    br.execute_script(script, el, text)


def parse_response(br, response):
    """Collect labelled lines from a response into four parallel lists.

    Every line is stripped and assigned to the first label it contains, in the
    order "Task type", "Instruction", "Input", "Output". The matching
    "<label>: " prefix is removed from the stored text. All four lists are
    truncated to the length of the shortest one.

    Args:
        br: Unused.
        response: Response text to parse.

    Returns:
        Tuple ``(task_types, instructions, inputs, outputs)`` of equally long
        lists.
    """
    # (marker looked for in the line, prefix removed from it, collected values)
    fields = (
        ("Task type", "Task type: ", []),
        ("Instruction", "Instruction: ", []),
        ("Input", "Input: ", []),
        ("Output", "Output: ", []),
    )
    for raw_line in response.split("\n"):
        text = raw_line.strip()
        for marker, prefix, bucket in fields:
            if marker in text:
                bucket.append(text.replace(prefix, ""))
                break  # only the first matching label claims the line
    shortest = min(len(bucket) for _, _, bucket in fields)
    return tuple(bucket[:shortest] for _, _, bucket in fields)

def is_prompt_ok(br):
    """Report whether the user message shown on the page looks usable.

    Args:
        br: Selenium driver controlling the browser.

    Returns:
        False when the user-message element cannot be read, or when its text
        contains the sentence 'Give me the next set of diverse task
        instructions in Vietnamese'; otherwise True if the text is longer than
        one character.
    """
    try:
        content = br.find_element(
            By.CSS_SELECTOR,
            'div.flex.flex-grow.flex-col.gap-3.max-w-full > div[data-message-author-role="user"]',
        ).text
    except WebDriverException:
        # Only Selenium lookup failures mean "not ok"; a bare except would
        # also have hidden KeyboardInterrupt and programming errors.
        return False
    if 'Give me the next set of diverse task instructions in Vietnamese' in content:
        return False
    return len(content) > 1

def wait_when_limit_reached(br, t=60 * 60, navigate_home=True):
    """Sleep for ``t`` seconds, then optionally return to the home page.

    Args:
        br: Selenium driver, only used when ``navigate_home`` is true.
        t: Number of seconds to sleep (one hour by default).
        navigate_home: When true, call ``go_to_home(br)`` after sleeping.
    """
    time.sleep(t)
    if not navigate_home:
        return
    go_to_home(br)

def delete_current_chat(br):
    """Delete the current conversation through the page's menu.

    Clicks, in order, the elements matching ``THREE_DOTS``, ``DELETE_BTN``
    and ``CONFIRM_DELETE_BTN``. Each one is awaited with
    ``select_element_if_clickable``, and every click is followed by a one
    second pause.

    Args:
        br: Selenium driver controlling the browser.
    """
    # (selector, seconds to wait for the element to become clickable)
    steps = (
        (THREE_DOTS, 5),
        (DELETE_BTN, 20),
        (CONFIRM_DELETE_BTN, 5),
    )
    for selector, timeout in steps:
        select_element_if_clickable(br, selector, timeout).click()
        time.sleep(1)

def response_to_instructions(br, response):
    """Parse instructions from a response. You should create your own function.

    The response is split into tasks at every newline that is followed by a
    number. Each task is then split on the labels ``Question:``,
    ``Choices:``, ``Explanation:`` and ``Answer:``. Fields are looked up by
    label rather than by position, so a task that lists ``Answer`` before
    ``Explanation`` is still stored correctly. Tasks that do not contain each
    label exactly once are skipped.

    Args:
        br: Unused.
        response: Raw response text.

    Returns:
        List of dicts with the keys ``question``, ``choices`` (list of
        lines), ``explanation`` and ``answer``.
    """
    instructions = []
    for task in re.split(r'\n\n*[0-9]+\s*\n*', response):
        # With a capture group, re.split yields
        # [prefix, label1, text1, label2, text2, ...].
        parts = re.split(r'(Question|Choices|Answer|Explanation):\s+', task)
        if len(parts) != 9:
            continue
        fields = dict(zip(parts[1::2], parts[2::2]))
        if len(fields) != 4:
            # A label occurred twice, so another one is missing.
            continue
        instructions.append({
            "question": fields["Question"].strip(),
            "choices": fields["Choices"].strip().split('\n'),
            "explanation": fields["Explanation"].strip(),
            "answer": fields["Answer"].strip(),
        })
    return instructions
 
def prompt_with_fewshot(prompt_instructions, prompt_path=cwd / "data/prompt/math_alpaca_v2.txt"):
    """Build the few-shot prompt from a template file and example tasks.

    The result is the template text, a newline, one numbered section per
    example and a trailing ``###``. Each section has the form::

        ### <n>
        Question: <question>
        Choices: <choice 1>
        <choice 2>
        ...
        Explanation: <explanation>
        Answer: <answer>

    Args:
        prompt_instructions: Iterable of dicts with the keys ``question``,
            ``choices`` (iterable of strings), ``answer`` and optionally
            ``explanation`` (defaults to an empty string).
        prompt_path: Path of the template file, read as UTF-8.

    Returns:
        The assembled prompt string.
    """
    # The context manager closes the template file; the encoding is explicit
    # so the result does not depend on the platform default.
    with open(prompt_path, encoding="utf-8") as prompt_file:
        sections = [prompt_file.read() + "\n"]

    for number, task_dict in enumerate(prompt_instructions, start=1):
        question = task_dict['question']
        choices = task_dict['choices']
        explanation = task_dict.get('explanation', '')
        answer = task_dict['answer']

        sections.append(f'### {number}\n')
        sections.append(f'Question: {question}\n')
        # The first choice stays on the "Choices: " line; each choice ends
        # with a newline.
        sections.append('Choices: ')
        sections.extend(choice + '\n' for choice in choices)
        sections.append(f'Explanation: {explanation}\n')
        sections.append(f'Answer: {answer}\n')

    sections.append('###')
    return ''.join(sections)

# main

In [None]:
# Close the browser left over from a previous run of this cell, if any.
try:
    br.quit()
except NameError:
    # First run: no browser has been created yet.
    pass
except Exception as exc:
    # Best-effort cleanup: a browser that is already gone must not stop a new
    # one from starting. Unlike a bare except, interrupts still propagate.
    print("Could not close the previous browser:", exc)

chatgpt_url = "https://chat.openai.com"
br = init_browser(headless=False)
br.get(chatgpt_url)

**Notice:** after this cell has run, log in manually in the browser window that opened, or write a script to do it (this notebook does not automate the login).

In [None]:
# Every parsed instruction from this run; written to disk in the `finally`
# block below, even if the loop is interrupted or fails.
all_data = []

try:
    reach_limit = False  # Set when a rate-limit message is seen; not read in this cell.
    for _ in range(1000):
        new_chat = True
        last_response = None

        # Ask up to 5 times within the same conversation.
        # This reduces duplication but also reduces diversity.
        for _ in range(5):
            if new_chat:
                prompt_fewshot_examples = random.sample(
                    seed_instruction_data['data'], num_prompt_instructions
                )
                prompt = prompt_with_fewshot(prompt_fewshot_examples)

                enter_prompt(br, prompt)
                new_chat = False
            else:
                enter_prompt(br, "Increase the difficulty a little bit and generate 10 more questions. Make sure they are feasible for elementary students.")
            if not is_prompt_ok(br):
                break
            response = br.find_elements(By.CSS_SELECTOR, RESPONSE_AREA)[-1].text

            # The hourly request limit was reached:
            # wait for 30 minutes, then leave this conversation.
            if response in REACH_REQUEST_LIMIT_MESSAGE:
                print(response)
                reach_limit = True
                wait_when_limit_reached(
                    br, t=30 * 60, navigate_home=False
                )
                break
            elif response.startswith("I apologize") or response == last_response or 'network error' in response:
                # Refusal, unchanged response or network error: start over
                # from the home page with a fresh prompt.
                go_to_home(br)
                new_chat = True
            else:
                instructions = response_to_instructions(br, response)
                all_data.extend(instructions)
            last_response = response
        try:
            delete_current_chat(br)
        except WebDriverException:
            # The delete controls could not be found or clicked (this also
            # covers the wait timing out); fall back to the home page.
            go_to_home(br)
        print('Got', len(all_data), '.')
finally:
    if all_data:
        # Unique save file name.
        uid = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # Create the output directory first: failing to open the file here
        # would lose everything collected so far.
        raw_dir = save_path / "raw"
        raw_dir.mkdir(parents=True, exist_ok=True)
        # ensure_ascii=False writes non-ASCII text verbatim, so the file
        # encoding has to be UTF-8 regardless of the platform default.
        with open(raw_dir / f"{uid}.json", "w", encoding="utf-8") as f:
            json.dump(all_data, f, ensure_ascii=False)