mathpix2gpt.py

Sends a PDF to the Mathpix OCR API, converts the result to Markdown, and opens an interactive GPT chat over the extracted text (gpt-4-32k by default).

@danielgross
Last active March 18, 2025 02:18

Revisions

  1. danielgross revised this gist Apr 27, 2023. 1 changed file with 26 additions and 16 deletions.
    42 changes: 26 additions & 16 deletions mathpix2gpt.py
    @@ -10,15 +10,16 @@
     
     USE_GPT_4_32K = True
     DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
    -TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096 # Leave some room for the chat.
    +TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
     IS_SCIENTIFIC = True
     
    -# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
    +# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
     # the second line containing your app_key
     with open(os.path.expanduser('~/.mathpix')) as f:
         APP_ID = f.readline().strip()
         APP_KEY = f.readline().strip()
     
    +
     def send_pdf_to_mathpix(file_path, output_format='mmd'):
         url = 'https://api.mathpix.com/v3/pdf'
         headers = {
    @@ -32,7 +33,8 @@ def send_pdf_to_mathpix(file_path, output_format='mmd'):
                 'options_json': '{"conversion_formats": {"%s": true}}' % output_format
             }
             print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
    -        response = requests.post(url, headers=headers, files=files, data=options)
    +        response = requests.post(url, headers=headers,
    +                                 files=files, data=options)
             response_data = response.json()
     
         if 'pdf_id' in response_data:
    @@ -43,6 +45,7 @@ def send_pdf_to_mathpix(file_path, output_format='mmd'):
             print("Error: Unable to send PDF to Mathpix")
             return None
     
    +
     def wait_for_processing(pdf_id):
         url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
         headers = {
    @@ -65,6 +68,7 @@ def wait_for_processing(pdf_id):
                 print(f"Status: {status}, waiting for processing to complete")
                 time.sleep(5)
     
    +
     def download_processed_file(pdf_id, file_format, output_path):
         url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
         headers = {
    @@ -77,14 +81,18 @@ def download_processed_file(pdf_id, file_format, output_path):
             output_file.write(response.content)
         print(f"File downloaded to {output_path}")
     
    +
     def clear_terminal():
         os.system('cls' if os.name == 'nt' else 'clear')
     
    +
     def print_messages(messages):
         for index, message in enumerate(messages):
             color = 'blue' if message['role'] == 'assistant' else 'white'
    -        print(colored(f"{message['role'].capitalize()}: {message['content']}", color))
    +        print(
    +            colored(f"{message['role'].capitalize()}: {message['content']}", color))
     
    +
     def chat_gpt(messages):
         result = openai.ChatCompletion.create(
             model=DEFAULT_MODEL,
    @@ -97,10 +105,10 @@ def chat_gpt(messages):
     
     def start_question_answering(input_path):
         print("Using model: %s" % DEFAULT_MODEL)
     
         with open(input_path) as fh:
             data = fh.read()
     
         text = data.strip()
         tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
         text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
    @@ -117,21 +125,20 @@ def start_question_answering(input_path):
             question = input("User: ")
             if question.lower() == 'exit':
                 break
     
             messages.append({"role": "user", "content": question})
             messages = chat_gpt(messages)
     
    -
    -
    +
     def main():
    -    if len (sys.argv) < 2:
    +    if len(sys.argv) < 2:
             print("Usage: python pdfvqa.py <input_pdf_path>")
             return
     
         input_pdf_path = sys.argv[1]
         output_mmd_path = input_pdf_path.replace('.pdf', '.md')
         output_simplemd_path = input_pdf_path.replace('.pdf', '.simple.md')
     
         if not os.path.exists(output_mmd_path):
             pdf_id = send_pdf_to_mathpix(input_pdf_path)
             if pdf_id and wait_for_processing(pdf_id):
    @@ -141,16 +148,19 @@ def main():
                 mmd = mmd_file.read()
             if not IS_SCIENTIFIC:
                 # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
    -            # Keep it if the paper is a scientific paper.
    -            mmd = '\n'.join([line for line in mmd.split('\n') if not line.startswith('![]')])
    +            # Keep it if the paper is a scientific paper.
    +            mmd = '\n'.join([line for line in mmd.split(
    +                '\n') if not line.startswith('![]')])
             # replace \section{Title} with # Title
             mmd = mmd.replace('\\section{', '# ').replace('}', '')
             # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
    -        mmd = mmd.replace('\$', '$').replace('\%', '%').replace('\(', '(').replace('\)', ')')
    +        mmd = mmd.replace('\$', '$').replace(
    +            '\%', '%').replace('\(', '(').replace('\)', ')')
             with open(output_simplemd_path, 'w') as simplemd_file:
                 simplemd_file.write(mmd)
     
         start_question_answering(output_simplemd_path)
     
    +
     if __name__ == '__main__':
    -    main()
    +    main()
  2. danielgross created this gist Apr 27, 2023.
    156 changes: 156 additions & 0 deletions mathpix2gpt.py
    @@ -0,0 +1,156 @@
    import requests
    import time
    import os
    import sys
    import openai
    import tiktoken
    from termcolor import colored

    openai.api_key = open(os.path.expanduser('~/.openai')).read().strip()

    USE_GPT_4_32K = True
    DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
    TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096 # Leave some room for the chat.
    IS_SCIENTIFIC = True

    # Assumes you have a file called ~/.mathpix with the first line containing your app_id and
    # the second line containing your app_key
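    # For example (hypothetical placeholder values), ~/.mathpix would contain:
    #   my_mathpix_app_id
    #   my_mathpix_app_key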
    with open(os.path.expanduser('~/.mathpix')) as f:
        APP_ID = f.readline().strip()
        APP_KEY = f.readline().strip()

    def send_pdf_to_mathpix(file_path, output_format='mmd'):
        url = 'https://api.mathpix.com/v3/pdf'
        headers = {
            'app_id': APP_ID,
            'app_key': APP_KEY
        }

        with open(file_path, 'rb') as file:
            files = {'file': file}
            options = {
                'options_json': '{"conversion_formats": {"%s": true}}' % output_format
            }
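            # With the default output_format this renders to '{"conversion_formats": {"mmd": true}}'.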
    print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
    response = requests.post(url, headers=headers, files=files, data=options)
    response_data = response.json()

    if 'pdf_id' in response_data:
    pdf_id = response_data['pdf_id']
    print(f"PDF ID: {pdf_id}")
    return pdf_id
    else:
    print("Error: Unable to send PDF to Mathpix")
    return None

    def wait_for_processing(pdf_id):
        url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
        headers = {
            'app_id': APP_ID,
            'app_key': APP_KEY
        }

        while True:
            response = requests.get(url, headers=headers)
            response_data = response.json()
            status = response_data.get('status', None)

            if status == 'completed':
                print("Processing complete")
                return True
            elif status == 'error':
                print("Error: Unable to process PDF")
                return False
            else:
                print(f"Status: {status}, waiting for processing to complete")
                time.sleep(5)

    def download_processed_file(pdf_id, file_format, output_path):
        url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
        headers = {
            'app_id': APP_ID,
            'app_key': APP_KEY
        }

        response = requests.get(url, headers=headers)
        with open(output_path, 'wb') as output_file:
            output_file.write(response.content)
        print(f"File downloaded to {output_path}")

    def clear_terminal():
        os.system('cls' if os.name == 'nt' else 'clear')

    def print_messages(messages):
        for index, message in enumerate(messages):
            color = 'blue' if message['role'] == 'assistant' else 'white'
            print(colored(f"{message['role'].capitalize()}: {message['content']}", color))

    def chat_gpt(messages):
        result = openai.ChatCompletion.create(
            model=DEFAULT_MODEL,
            messages=messages
        )
        answer = result.choices[0].message.content
        messages.append({"role": "assistant", "content": answer})
        return messages


    def start_question_answering(input_path):
        print("Using model: %s" % DEFAULT_MODEL)

        with open(input_path) as fh:
            data = fh.read()

        text = data.strip()
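        # Encode, truncate to TOKEN_LIMIT tokens, and decode back so the prompt leaves room for the chat.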
        tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
        text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
        text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
        ]
        messages = chat_gpt(messages)
        while True:
            clear_terminal()
            print_messages(messages)

            question = input("User: ")
            if question.lower() == 'exit':
                break

            messages.append({"role": "user", "content": question})
            messages = chat_gpt(messages)



    def main():
        if len (sys.argv) < 2:
            print("Usage: python pdfvqa.py <input_pdf_path>")
            return

        input_pdf_path = sys.argv[1]
        output_mmd_path = input_pdf_path.replace('.pdf', '.md')
        output_simplemd_path = input_pdf_path.replace('.pdf', '.simple.md')

        if not os.path.exists(output_mmd_path):
            pdf_id = send_pdf_to_mathpix(input_pdf_path)
            if pdf_id and wait_for_processing(pdf_id):
                download_processed_file(pdf_id, 'mmd', output_mmd_path)
        if not os.path.exists(output_simplemd_path):
            with open(output_mmd_path, 'r') as mmd_file:
                mmd = mmd_file.read()
            if not IS_SCIENTIFIC:
                # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
                # Keep it if the paper is a scientific paper.
                mmd = '\n'.join([line for line in mmd.split('\n') if not line.startswith('![]')])
            # replace \section{Title} with # Title
            mmd = mmd.replace('\\section{', '# ').replace('}', '')
            # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
            mmd = mmd.replace('\$', '$').replace('\%', '%').replace('\(', '(').replace('\)', ')')
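            # e.g. a hypothetical line "\section{Intro} costs \$5" becomes "# Intro costs $5"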
            with open(output_simplemd_path, 'w') as simplemd_file:
                simplemd_file.write(mmd)

        start_question_answering(output_simplemd_path)

    if __name__ == '__main__':
        main()
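
To try the script (a sketch, not part of the gist; "paper.pdf" is a placeholder): put your OpenAI API key on the first line of ~/.openai, your Mathpix app_id and app_key on the first two lines of ~/.mathpix, install the four libraries the script imports, and point it at a PDF:

    pip install requests openai tiktoken termcolor
    python mathpix2gpt.py paper.pdf

Because main() checks for existing output files, the Mathpix results are cached next to the PDF (.md and .simple.md), so a second run skips the OCR step and goes straight to the chat.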