"""Ask questions about a PDF from the terminal.

The script sends a PDF to the Mathpix PDF endpoint, downloads the converted
Markdown next to the input file, optionally simplifies it, and then starts an
interactive chat in which the document text is given to an OpenAI chat model.

Credentials are read at import time:
    ~/.openai   -- the OpenAI API key.
    ~/.mathpix  -- first line is the Mathpix app_id, second line the app_key.
"""
import json
import os
import re
import sys
import time

import openai
import requests
import tiktoken
from termcolor import colored

# Assumes you have a file called ~/.openai containing your OpenAI API key.
with open(os.path.expanduser('~/.openai')) as f:
    openai.api_key = f.read().strip()

USE_GPT_4_32K = True
DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
IS_SCIENTIFIC = True

# Assumes you have a file called ~/.mathpix with the first line containing
# your app_id and the second line containing your app_key.
with open(os.path.expanduser('~/.mathpix')) as f:
    APP_ID = f.readline().strip()
    APP_KEY = f.readline().strip()

MATHPIX_PDF_URL = 'https://api.mathpix.com/v3/pdf'

# Seconds to wait for Mathpix to connect / start answering before giving up,
# so a stalled connection cannot hang the script forever.
REQUEST_TIMEOUT = 60

# Matches a \section{...} command whose closing brace ends the line; the
# lazy group still spans nested braces because the final brace must be
# followed by end-of-line.
_SECTION_RE = re.compile(r'\\section\{(.*?)\}[ \t]*$', re.MULTILINE)


def _mathpix_headers():
    """Return the authentication headers sent with every Mathpix request."""
    return {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }


def _json_dict(response):
    """Return the response body parsed as a JSON object, or {} if it is not one."""
    try:
        payload = response.json()
    except ValueError:
        # Body is not JSON (e.g. an HTML error page).
        return {}
    return payload if isinstance(payload, dict) else {}


def send_pdf_to_mathpix(file_path, output_format='mmd'):
    """Upload a PDF to Mathpix and return the id assigned to it.

    Args:
        file_path: Path of the PDF file to upload.
        output_format: Conversion format to request (sent as a key of
            ``conversion_formats`` in ``options_json``).

    Returns:
        The ``pdf_id`` from the response, or None if the response does not
        contain one.
    """
    options = {
        # json.dumps keeps the payload valid JSON whatever output_format holds.
        'options_json': json.dumps({'conversion_formats': {output_format: True}})
    }
    print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
    with open(file_path, 'rb') as file:
        response = requests.post(
            MATHPIX_PDF_URL,
            headers=_mathpix_headers(),
            files={'file': file},
            data=options,
            timeout=REQUEST_TIMEOUT,
        )

    response_data = _json_dict(response)
    if 'pdf_id' in response_data:
        pdf_id = response_data['pdf_id']
        print(f"PDF ID: {pdf_id}")
        return pdf_id

    print("Error: Unable to send PDF to Mathpix")
    return None


def wait_for_processing(pdf_id, poll_interval=5, max_wait=None):
    """Poll Mathpix until the PDF has been processed.

    Args:
        pdf_id: Id returned by ``send_pdf_to_mathpix``.
        poll_interval: Seconds to sleep between status requests.
        max_wait: Optional upper bound, in seconds, on the total time spent
            waiting. None (the default) waits indefinitely.

    Returns:
        True when the reported status is 'completed'. False when the status
        is 'error', when the status request is rejected with an HTTP 4xx
        code, or when ``max_wait`` elapses.
    """
    url = f'{MATHPIX_PDF_URL}/{pdf_id}'
    deadline = None if max_wait is None else time.monotonic() + max_wait

    while True:
        response = requests.get(
            url, headers=_mathpix_headers(), timeout=REQUEST_TIMEOUT)

        # A client error (bad credentials, unknown id, ...) will not fix
        # itself on retry; without this check the loop would never end
        # because such responses carry no 'status' field.
        if 400 <= response.status_code < 500:
            print(f"Error: Mathpix rejected the status request "
                  f"(HTTP {response.status_code})")
            return False

        status = _json_dict(response).get('status', None)
        if status == 'completed':
            print("Processing complete")
            return True
        if status == 'error':
            print("Error: Unable to process PDF")
            return False

        if deadline is not None and time.monotonic() >= deadline:
            print("Error: Timed out waiting for Mathpix to process the PDF")
            return False

        print(f"Status: {status}, waiting for processing to complete")
        time.sleep(poll_interval)


def download_processed_file(pdf_id, file_format, output_path):
    """Download a converted document from Mathpix and write it to disk.

    Args:
        pdf_id: Id returned by ``send_pdf_to_mathpix``.
        file_format: Extension appended to the PDF URL (e.g. 'mmd').
        output_path: Destination file; written in binary mode.

    Raises:
        requests.HTTPError: If Mathpix answers with an error status. Nothing
            is written in that case, so an error body is never mistaken for
            a converted document on a later run.
    """
    url = f'{MATHPIX_PDF_URL}/{pdf_id}.{file_format}'
    response = requests.get(
        url, headers=_mathpix_headers(), timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    with open(output_path, 'wb') as output_file:
        output_file.write(response.content)
    print(f"File downloaded to {output_path}")


def clear_terminal():
    """Clear the terminal using the platform's shell command."""
    os.system('cls' if os.name == 'nt' else 'clear')


def print_messages(messages):
    """Print the conversation, assistant messages in blue and others in white.

    Args:
        messages: List of dicts with 'role' and 'content' keys.
    """
    for message in messages:
        color = 'blue' if message['role'] == 'assistant' else 'white'
        print(
            colored(f"{message['role'].capitalize()}: {message['content']}", color))


def chat_gpt(messages):
    """Send the conversation to the chat model and append its answer.

    Args:
        messages: List of chat messages; it is modified in place.

    Returns:
        The same list, with the assistant's reply appended.
    """
    result = openai.ChatCompletion.create(
        model=DEFAULT_MODEL,
        messages=messages
    )
    answer = result.choices[0].message.content
    messages.append({"role": "assistant", "content": answer})
    return messages


def start_question_answering(input_path):
    """Run an interactive question/answer loop over a Markdown file.

    The file content is truncated to TOKEN_LIMIT tokens and sent as the first
    user message. The loop ends when the user types 'exit' or closes stdin.

    Args:
        input_path: Path of the Markdown file to load.
    """
    print("Using model: %s" % DEFAULT_MODEL)
    with open(input_path, encoding='utf-8') as fh:
        text = fh.read().strip()

    # Truncate on token boundaries so the prompt fits the model's context.
    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
    text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
    text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
    ]
    messages = chat_gpt(messages)

    while True:
        clear_terminal()
        print_messages(messages)
        try:
            question = input("User: ")
        except EOFError:
            # stdin closed (Ctrl-D / piped input exhausted): leave quietly.
            break
        if question.strip().lower() == 'exit':
            break
        messages.append({"role": "user", "content": question})
        messages = chat_gpt(messages)


def _simplify_mmd(mmd):
    r"""Strip some LaTeX-style markup from the converted Markdown.

    Drops lines starting with ``![]``, rewrites ``\section{Title}`` (when the
    closing brace ends the line) as ``# Title``, and removes the backslash in
    front of ``$``, ``%``, ``(`` and ``)``. All other braces are left alone.
    """
    kept_lines = [line for line in mmd.split('\n')
                  if not line.startswith('![]')]
    mmd = '\n'.join(kept_lines)

    # replace \section{Title} with # Title
    mmd = _SECTION_RE.sub(r'# \1', mmd)

    # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
    for char in ('$', '%', '(', ')'):
        mmd = mmd.replace('\\' + char, char)
    return mmd


def main():
    """Convert the PDF named on the command line and start the chat.

    Converted files are cached next to the input as ``<name>.md`` and
    ``<name>.simple.md``; existing files are reused instead of being
    regenerated.
    """
    if len(sys.argv) < 2:
        print("Usage: python pdfvqa.py <input_pdf_path>")
        return

    input_pdf_path = sys.argv[1]
    # splitext only touches the final extension, whatever its case, so a
    # '.pdf' elsewhere in the path is preserved and the outputs never alias
    # the input PDF.
    base_path = os.path.splitext(input_pdf_path)[0]
    output_mmd_path = base_path + '.md'
    output_simplemd_path = base_path + '.simple.md'

    if not os.path.exists(output_mmd_path):
        pdf_id = send_pdf_to_mathpix(input_pdf_path)
        if not (pdf_id and wait_for_processing(pdf_id)):
            # Without a converted file there is nothing to chat about.
            print("Error: Could not convert the PDF; aborting")
            return
        download_processed_file(pdf_id, 'mmd', output_mmd_path)

    if not os.path.exists(output_simplemd_path):
        with open(output_mmd_path, 'r', encoding='utf-8') as mmd_file:
            mmd = mmd_file.read()

        if not IS_SCIENTIFIC:
            # There's too much LaTeX style escaping for most PDFs in my view,
            # so remove some of it. Keep it if the paper is a scientific paper.
            mmd = _simplify_mmd(mmd)

        with open(output_simplemd_path, 'w', encoding='utf-8') as simplemd_file:
            simplemd_file.write(mmd)

    start_question_answering(output_simplemd_path)


if __name__ == '__main__':
    main()