Last active
March 18, 2025 02:18
-
-
Save danielgross/3ab4104e14faccc12b49200843adab21 to your computer and use it in GitHub Desktop.
Revisions
-
danielgross revised this gist
Apr 27, 2023 . 1 changed file with 26 additions and 16 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,15 +10,16 @@ USE_GPT_4_32K = True DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301" TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096 # Leave some room for the chat. IS_SCIENTIFIC = True # Assumes you have a file called ~/.mathpix with the first line containing your app_id and # the second line containing your app_key with open(os.path.expanduser('~/.mathpix')) as f: APP_ID = f.readline().strip() APP_KEY = f.readline().strip() def send_pdf_to_mathpix(file_path, output_format='mmd'): url = 'https://api.mathpix.com/v3/pdf' headers = { @@ -32,7 +33,8 @@ def send_pdf_to_mathpix(file_path, output_format='mmd'): 'options_json': '{"conversion_formats": {"%s": true}}' % output_format } print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix") response = requests.post(url, headers=headers, files=files, data=options) response_data = response.json() if 'pdf_id' in response_data: @@ -43,6 +45,7 @@ def send_pdf_to_mathpix(file_path, output_format='mmd'): print("Error: Unable to send PDF to Mathpix") return None def wait_for_processing(pdf_id): url = f'https://api.mathpix.com/v3/pdf/{pdf_id}' headers = { @@ -65,6 +68,7 @@ def wait_for_processing(pdf_id): print(f"Status: {status}, waiting for processing to complete") time.sleep(5) def download_processed_file(pdf_id, file_format, output_path): url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}' headers = { @@ -77,14 +81,18 @@ def download_processed_file(pdf_id, file_format, output_path): output_file.write(response.content) print(f"File downloaded to {output_path}") def clear_terminal(): os.system('cls' if os.name == 'nt' else 'clear') def print_messages(messages): for index, 
message in enumerate(messages): color = 'blue' if message['role'] == 'assistant' else 'white' print( colored(f"{message['role'].capitalize()}: {message['content']}", color)) def chat_gpt(messages): result = openai.ChatCompletion.create( model=DEFAULT_MODEL, @@ -97,10 +105,10 @@ def chat_gpt(messages): def start_question_answering(input_path): print("Using model: %s" % DEFAULT_MODEL) with open(input_path) as fh: data = fh.read() text = data.strip() tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL) text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT]) @@ -117,21 +125,20 @@ def start_question_answering(input_path): question = input("User: ") if question.lower() == 'exit': break messages.append({"role": "user", "content": question}) messages = chat_gpt(messages) def main(): if len(sys.argv) < 2: print("Usage: python pdfvqa.py <input_pdf_path>") return input_pdf_path = sys.argv[1] output_mmd_path = input_pdf_path.replace('.pdf', '.md') output_simplemd_path = input_pdf_path.replace('.pdf', '.simple.md') if not os.path.exists(output_mmd_path): pdf_id = send_pdf_to_mathpix(input_pdf_path) if pdf_id and wait_for_processing(pdf_id): @@ -141,16 +148,19 @@ def main(): mmd = mmd_file.read() if not IS_SCIENTIFIC: # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it. # Keep it if the paper is a scientific paper. mmd = '\n'.join([line for line in mmd.split( '\n') if not line.startswith('![]')]) # replace \section{Title} with # Title mmd = mmd.replace('\\section{', '# ').replace('}', '') # replace the "\" slash that Mathpix adds to escape $, %, (, etc. mmd = mmd.replace('\$', '$').replace( '\%', '%').replace('\(', '(').replace('\)', ')') with open(output_simplemd_path, 'w') as simplemd_file: simplemd_file.write(mmd) start_question_answering(output_simplemd_path) if __name__ == '__main__': main() -
danielgross created this gist
Apr 27, 2023 .There are no files selected for viewing
"""Chat with a PDF: convert it to Markdown via Mathpix, then ask GPT about it.

Usage: python pdfvqa.py <input_pdf_path>

The Mathpix output is cached next to the PDF as ``<name>.md`` and the
(optionally simplified) text fed to the model as ``<name>.simple.md``, so a
second run on the same PDF skips the conversion.

NOTE(review): this file was reconstructed from a whitespace-flattened copy;
the nesting of the clean-up statements under ``if not IS_SCIENTIFIC`` is the
most plausible reading of the original comments -- confirm against the gist.
"""
import os
import sys
import time

import openai
import requests
import tiktoken
from termcolor import colored

# Assumes you have a file called ~/.openai containing your OpenAI API key.
with open(os.path.expanduser('~/.openai')) as f:
    openai.api_key = f.read().strip()

USE_GPT_4_32K = True
DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
IS_SCIENTIFIC = True

# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
# the second line containing your app_key
with open(os.path.expanduser('~/.mathpix')) as f:
    APP_ID = f.readline().strip()
    APP_KEY = f.readline().strip()


def send_pdf_to_mathpix(file_path, output_format='mmd'):
    """Upload the PDF at *file_path* to Mathpix for conversion.

    Args:
        file_path: Path of the PDF to upload.
        output_format: Conversion format requested from Mathpix.

    Returns:
        The ``pdf_id`` string from the response, or None if the response
        did not contain one.
    """
    url = 'https://api.mathpix.com/v3/pdf'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }

    with open(file_path, 'rb') as file:
        files = {'file': file}
        options = {
            'options_json': '{"conversion_formats": {"%s": true}}' % output_format
        }
        print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
        # The upload must happen while the file handle is still open.
        response = requests.post(url, headers=headers, files=files, data=options)

    response_data = response.json()
    if 'pdf_id' in response_data:
        pdf_id = response_data['pdf_id']
        print(f"PDF ID: {pdf_id}")
        return pdf_id

    print("Error: Unable to send PDF to Mathpix")
    return None


def wait_for_processing(pdf_id, poll_interval=5):
    """Poll Mathpix until the conversion of *pdf_id* finishes.

    Args:
        pdf_id: Identifier returned by send_pdf_to_mathpix.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        True when the status becomes 'completed', False when it becomes
        'error'. Any other status keeps polling.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }

    while True:
        response = requests.get(url, headers=headers)
        status = response.json().get('status', None)

        if status == 'completed':
            print("Processing complete")
            return True
        if status == 'error':
            print("Error: Unable to process PDF")
            return False

        print(f"Status: {status}, waiting for processing to complete")
        time.sleep(poll_interval)


def download_processed_file(pdf_id, file_format, output_path):
    """Download the converted document and write it to *output_path*.

    Args:
        pdf_id: Identifier returned by send_pdf_to_mathpix.
        file_format: Extension of the converted format to fetch (e.g. 'mmd').
        output_path: Destination file; written in binary mode.

    Returns:
        True if the file was written, False if the server answered with an
        error status (nothing is written in that case, so a failed download
        is never mistaken for a cached conversion on the next run).
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }

    response = requests.get(url, headers=headers)
    if not response.ok:
        print(f"Error: Unable to download processed file (HTTP {response.status_code})")
        return False

    with open(output_path, 'wb') as output_file:
        output_file.write(response.content)
    print(f"File downloaded to {output_path}")
    return True


def clear_terminal():
    """Clear the terminal using the platform's shell command."""
    os.system('cls' if os.name == 'nt' else 'clear')


def print_messages(messages):
    """Print the conversation; assistant turns in blue, all others in white."""
    for message in messages:
        color = 'blue' if message['role'] == 'assistant' else 'white'
        print(colored(f"{message['role'].capitalize()}: {message['content']}", color))


def chat_gpt(messages):
    """Send *messages* to the chat model and append its reply.

    The list is mutated in place (an assistant message is appended) and also
    returned for convenience.
    """
    result = openai.ChatCompletion.create(
        model=DEFAULT_MODEL,
        messages=messages
    )
    answer = result.choices[0].message.content
    messages.append({"role": "assistant", "content": answer})
    return messages


def start_question_answering(input_path):
    """Run an interactive Q&A loop over the Markdown file at *input_path*.

    The text is truncated to TOKEN_LIMIT tokens of the model's tokenizer
    before being sent. Typing 'exit' (any case) or sending end-of-file ends
    the session.
    """
    print("Using model: %s" % DEFAULT_MODEL)
    # The file originates from Mathpix bytes written verbatim; read it as
    # UTF-8 rather than the platform default encoding.
    with open(input_path, encoding='utf-8') as fh:
        text = fh.read().strip()

    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
    text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
    text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
    ]
    messages = chat_gpt(messages)

    while True:
        clear_terminal()
        print_messages(messages)
        try:
            question = input("User: ")
        except EOFError:
            # Ctrl-D / closed stdin: leave quietly instead of a traceback.
            break
        if question.lower() == 'exit':
            break
        messages.append({"role": "user", "content": question})
        messages = chat_gpt(messages)


def _simplify_mmd(mmd):
    """Strip Mathpix/LaTeX decoration that is noise for non-scientific PDFs."""
    # Drop image lines.
    mmd = '\n'.join(line for line in mmd.split('\n') if not line.startswith('![]'))
    # replace \section{Title} with # Title
    # (this removes every '}' in the document, not only the section's)
    mmd = mmd.replace('\\section{', '# ').replace('}', '')
    # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
    return (mmd.replace('\\$', '$')
               .replace('\\%', '%')
               .replace('\\(', '(')
               .replace('\\)', ')'))


def main():
    """Convert the PDF named on the command line (if needed) and start the chat."""
    if len(sys.argv) < 2:
        print("Usage: python pdfvqa.py <input_pdf_path>")
        return

    input_pdf_path = sys.argv[1]
    # Swap only the real extension: str.replace('.pdf', ...) would also
    # rewrite '.pdf' inside directory names and would leave 'X.PDF'
    # unchanged, making the "output" path point at the PDF itself.
    base_path, _ = os.path.splitext(input_pdf_path)
    output_mmd_path = base_path + '.md'
    output_simplemd_path = base_path + '.simple.md'

    if not os.path.exists(output_mmd_path):
        pdf_id = send_pdf_to_mathpix(input_pdf_path)
        # Each step prints its own error; stop here rather than failing
        # later on a Markdown file that was never created.
        if not (pdf_id and wait_for_processing(pdf_id)):
            return
        if not download_processed_file(pdf_id, 'mmd', output_mmd_path):
            return

    if not os.path.exists(output_simplemd_path):
        with open(output_mmd_path, 'r', encoding='utf-8') as mmd_file:
            mmd = mmd_file.read()
        if not IS_SCIENTIFIC:
            # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
            # Keep it if the paper is a scientific paper.
            mmd = _simplify_mmd(mmd)
        with open(output_simplemd_path, 'w', encoding='utf-8') as simplemd_file:
            simplemd_file.write(mmd)

    start_question_answering(output_simplemd_path)


if __name__ == '__main__':
    main()