mathpix2gpt.py

Sends a PDF to the Mathpix OCR API, converts the result to Markdown, and opens an interactive GPT chat over the extracted text (gpt-4-32k by default).

@danielgross
Last active March 18, 2025 02:18

Revisions

  1. danielgross revised this gist Apr 27, 2023. 1 changed file with 26 additions and 16 deletions.
    42 changes: 26 additions & 16 deletions mathpix2gpt.py
    @@ -10,15 +10,16 @@
     
     USE_GPT_4_32K = True
     DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
    -TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096 # Leave some room for the chat.
    +TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
     IS_SCIENTIFIC = True
     
    -# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
    +# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
     # the second line containing your app_key
     with open(os.path.expanduser('~/.mathpix')) as f:
         APP_ID = f.readline().strip()
         APP_KEY = f.readline().strip()
     
    +
     def send_pdf_to_mathpix(file_path, output_format='mmd'):
         url = 'https://api.mathpix.com/v3/pdf'
         headers = {
    @@ -32,7 +33,8 @@ def send_pdf_to_mathpix(file_path, output_format='mmd'):
                 'options_json': '{"conversion_formats": {"%s": true}}' % output_format
             }
             print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
    -        response = requests.post(url, headers=headers, files=files, data=options)
    +        response = requests.post(url, headers=headers,
    +                                 files=files, data=options)
             response_data = response.json()
     
         if 'pdf_id' in response_data:
    @@ -43,6 +45,7 @@ def send_pdf_to_mathpix(file_path, output_format='mmd'):
             print("Error: Unable to send PDF to Mathpix")
             return None
     
    +
     def wait_for_processing(pdf_id):
         url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
         headers = {
    @@ -65,6 +68,7 @@ def wait_for_processing(pdf_id):
                 print(f"Status: {status}, waiting for processing to complete")
                 time.sleep(5)
     
    +
     def download_processed_file(pdf_id, file_format, output_path):
         url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
         headers = {
    @@ -77,14 +81,18 @@ def download_processed_file(pdf_id, file_format, output_path):
             output_file.write(response.content)
         print(f"File downloaded to {output_path}")
     
    +
     def clear_terminal():
         os.system('cls' if os.name == 'nt' else 'clear')
     
    +
     def print_messages(messages):
         for index, message in enumerate(messages):
             color = 'blue' if message['role'] == 'assistant' else 'white'
    -        print(colored(f"{message['role'].capitalize()}: {message['content']}", color))
    +        print(
    +            colored(f"{message['role'].capitalize()}: {message['content']}", color))
     
    +
     def chat_gpt(messages):
         result = openai.ChatCompletion.create(
             model=DEFAULT_MODEL,
    @@ -97,10 +105,10 @@ def chat_gpt(messages):
     
     def start_question_answering(input_path):
         print("Using model: %s" % DEFAULT_MODEL)
     
         with open(input_path) as fh:
             data = fh.read()
     
         text = data.strip()
         tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
         text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
    @@ -117,21 +125,20 @@ def start_question_answering(input_path):
             question = input("User: ")
             if question.lower() == 'exit':
                 break
     
             messages.append({"role": "user", "content": question})
             messages = chat_gpt(messages)
     
    -
    -
    +
     def main():
    -    if len (sys.argv) < 2:
    +    if len(sys.argv) < 2:
             print("Usage: python pdfvqa.py <input_pdf_path>")
             return
     
         input_pdf_path = sys.argv[1]
         output_mmd_path = input_pdf_path.replace('.pdf', '.md')
         output_simplemd_path = input_pdf_path.replace('.pdf', '.simple.md')
     
         if not os.path.exists(output_mmd_path):
             pdf_id = send_pdf_to_mathpix(input_pdf_path)
             if pdf_id and wait_for_processing(pdf_id):
    @@ -141,16 +148,19 @@ def main():
                 mmd = mmd_file.read()
             if not IS_SCIENTIFIC:
                 # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
    -            # Keep it if the paper is a scientific paper.
    -            mmd = '\n'.join([line for line in mmd.split('\n') if not line.startswith('![]')])
    +            # Keep it if the paper is a scientific paper.
    +            mmd = '\n'.join([line for line in mmd.split(
    +                '\n') if not line.startswith('![]')])
             # replace \section{Title} with # Title
             mmd = mmd.replace('\\section{', '# ').replace('}', '')
             # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
    -        mmd = mmd.replace('\$', '$').replace('\%', '%').replace('\(', '(').replace('\)', ')')
    +        mmd = mmd.replace('\$', '$').replace(
    +            '\%', '%').replace('\(', '(').replace('\)', ')')
             with open(output_simplemd_path, 'w') as simplemd_file:
                 simplemd_file.write(mmd)
     
         start_question_answering(output_simplemd_path)
     
    +
     if __name__ == '__main__':
    -    main()
    +    main()
  2. danielgross created this gist Apr 27, 2023.
    156 changes: 156 additions & 0 deletions mathpix2gpt.py
    @@ -0,0 +1,156 @@
    import requests
    import time
    import os
    import sys
    import openai
    import tiktoken
    from termcolor import colored

    openai.api_key = open(os.path.expanduser('~/.openai')).read().strip()

    USE_GPT_4_32K = True
    DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
    TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096 # Leave some room for the chat.
    IS_SCIENTIFIC = True

    # Assumes you have a file called ~/.mathpix with the first line containing your app_id and
    # the second line containing your app_key
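    # For example (hypothetical placeholder values), ~/.mathpix would contain:
    #   my_mathpix_app_id
    #   my_mathpix_app_key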
    with open(os.path.expanduser('~/.mathpix')) as f:
        APP_ID = f.readline().strip()
        APP_KEY = f.readline().strip()

    def send_pdf_to_mathpix(file_path, output_format='mmd'):
        url = 'https://api.mathpix.com/v3/pdf'
        headers = {
            'app_id': APP_ID,
            'app_key': APP_KEY
        }

        with open(file_path, 'rb') as file:
            files = {'file': file}
            options = {
                'options_json': '{"conversion_formats": {"%s": true}}' % output_format
            }
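            # With the default output_format this renders to '{"conversion_formats": {"mmd": true}}'.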
    print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
    response = requests.post(url, headers=headers, files=files, data=options)
    response_data = response.json()

    if 'pdf_id' in response_data:
    pdf_id = response_data['pdf_id']
    print(f"PDF ID: {pdf_id}")
    return pdf_id
    else:
    print("Error: Unable to send PDF to Mathpix")
    return None

    def wait_for_processing(pdf_id):
        url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
        headers = {
            'app_id': APP_ID,
            'app_key': APP_KEY
        }

        while True:
            response = requests.get(url, headers=headers)
            response_data = response.json()
            status = response_data.get('status', None)

            if status == 'completed':
                print("Processing complete")
                return True
            elif status == 'error':
                print("Error: Unable to process PDF")
                return False
            else:
                print(f"Status: {status}, waiting for processing to complete")
                time.sleep(5)

    def download_processed_file(pdf_id, file_format, output_path):
        url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
        headers = {
            'app_id': APP_ID,
            'app_key': APP_KEY
        }

        response = requests.get(url, headers=headers)
        with open(output_path, 'wb') as output_file:
            output_file.write(response.content)
        print(f"File downloaded to {output_path}")

    def clear_terminal():
        os.system('cls' if os.name == 'nt' else 'clear')

    def print_messages(messages):
        for index, message in enumerate(messages):
            color = 'blue' if message['role'] == 'assistant' else 'white'
            print(colored(f"{message['role'].capitalize()}: {message['content']}", color))

    def chat_gpt(messages):
        result = openai.ChatCompletion.create(
            model=DEFAULT_MODEL,
            messages=messages
        )
        answer = result.choices[0].message.content
        messages.append({"role": "assistant", "content": answer})
        return messages


    def start_question_answering(input_path):
        print("Using model: %s" % DEFAULT_MODEL)

        with open(input_path) as fh:
            data = fh.read()

        text = data.strip()
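        # Encode, truncate to TOKEN_LIMIT tokens, and decode back so the prompt leaves room for the chat.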
        tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
        text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
        text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
        ]
        messages = chat_gpt(messages)
        while True:
            clear_terminal()
            print_messages(messages)

            question = input("User: ")
            if question.lower() == 'exit':
                break

            messages.append({"role": "user", "content": question})
            messages = chat_gpt(messages)



    def main():
        if len (sys.argv) < 2:
            print("Usage: python pdfvqa.py <input_pdf_path>")
            return

        input_pdf_path = sys.argv[1]
        output_mmd_path = input_pdf_path.replace('.pdf', '.md')
        output_simplemd_path = input_pdf_path.replace('.pdf', '.simple.md')

        if not os.path.exists(output_mmd_path):
            pdf_id = send_pdf_to_mathpix(input_pdf_path)
            if pdf_id and wait_for_processing(pdf_id):
                download_processed_file(pdf_id, 'mmd', output_mmd_path)
        if not os.path.exists(output_simplemd_path):
            with open(output_mmd_path, 'r') as mmd_file:
                mmd = mmd_file.read()
            if not IS_SCIENTIFIC:
                # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
                # Keep it if the paper is a scientific paper.
                mmd = '\n'.join([line for line in mmd.split('\n') if not line.startswith('![]')])
            # replace \section{Title} with # Title
            mmd = mmd.replace('\\section{', '# ').replace('}', '')
            # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
            mmd = mmd.replace('\$', '$').replace('\%', '%').replace('\(', '(').replace('\)', ')')
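            # e.g. a hypothetical line "\section{Intro} costs \$5" becomes "# Intro costs $5"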
            with open(output_simplemd_path, 'w') as simplemd_file:
                simplemd_file.write(mmd)

        start_question_answering(output_simplemd_path)

    if __name__ == '__main__':
        main()
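
To try the script (a sketch, not part of the gist; "paper.pdf" is a placeholder): put your OpenAI API key on the first line of ~/.openai, your Mathpix app_id and app_key on the first two lines of ~/.mathpix, install the four libraries the script imports, and point it at a PDF:

    pip install requests openai tiktoken termcolor
    python mathpix2gpt.py paper.pdf

Because main() checks for existing output files, the Mathpix results are cached next to the PDF (.md and .simple.md), so a second run skips the OCR step and goes straight to the chat.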