import json
import os
import re
import sys
import time

import openai
import requests
import tiktoken
from termcolor import colored

# Assumes you have a file called ~/.openai whose contents are your OpenAI API key.
# Read it through a context manager so the file handle is closed right away.
with open(os.path.expanduser('~/.openai')) as f:
    openai.api_key = f.read().strip()

# Model selection: the 32k-context GPT-4 model, or gpt-3.5-turbo when disabled.
USE_GPT_4_32K = True
DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
# Maximum number of document tokens placed in the first prompt.
TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
# When False, main() strips image lines, \section markup and backslash escapes
# from the Mathpix Markdown before it is used; when True the text is kept as is.
IS_SCIENTIFIC = True

# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
# the second line containing your app_key
with open(os.path.expanduser('~/.mathpix')) as f:
    APP_ID = f.readline().strip()
    APP_KEY = f.readline().strip()


def send_pdf_to_mathpix(file_path, output_format='mmd'):
    """Upload a PDF to the Mathpix PDF endpoint and return its ``pdf_id``.

    Args:
        file_path: path of the PDF file to upload.
        output_format: conversion format to request; it becomes the single
            key of ``conversion_formats`` in the request options.

    Returns:
        The ``pdf_id`` from the JSON reply, or ``None`` when the reply has no
        ``pdf_id`` (including when the reply is not JSON at all).
    """
    url = 'https://api.mathpix.com/v3/pdf'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    # Serialise with json.dumps rather than string interpolation so the format
    # name is always quoted and escaped correctly.
    options = {
        'options_json': json.dumps({'conversion_formats': {output_format: True}})
    }

    with open(file_path, 'rb') as file:
        files = {'file': file}
        print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
        response = requests.post(url, headers=headers,
                                 files=files, data=options)

    try:
        response_data = response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page); treat it as a failure.
        response_data = {}

    if 'pdf_id' in response_data:
        pdf_id = response_data['pdf_id']
        print(f"PDF ID: {pdf_id}")
        return pdf_id

    print("Error: Unable to send PDF to Mathpix")
    # Show what the server said so the failure can be diagnosed.
    print(f"Mathpix response (HTTP {response.status_code}): {response.text[:500]}")
    return None


def wait_for_processing(pdf_id, poll_interval=5, timeout=None):
    """Poll Mathpix until the PDF identified by ``pdf_id`` has been processed.

    Args:
        pdf_id: identifier returned by the upload request.
        poll_interval: seconds to sleep between two status checks.
        timeout: maximum number of seconds to keep polling; ``None`` (the
            default) polls without an upper bound.

    Returns:
        ``True`` once the reported status is ``'completed'``. ``False`` when
        the status is ``'error'``, when the server rejects the credentials
        (HTTP 401/403), or when ``timeout`` elapses first.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        response = requests.get(url, headers=headers)
        try:
            response_data = response.json()
        except ValueError:
            # Non-JSON body (e.g. a gateway error page): no status this round.
            response_data = {}
        status = response_data.get('status') if isinstance(response_data, dict) else None

        if status == 'completed':
            print("Processing complete")
            return True
        if status == 'error':
            print("Error: Unable to process PDF")
            return False
        if response.status_code in (401, 403):
            # Repeating the same request with the same credentials cannot
            # succeed, so stop instead of polling forever.
            print(f"Error: Unable to process PDF (HTTP {response.status_code})")
            return False
        if deadline is not None and time.monotonic() >= deadline:
            print(f"Error: Timed out after {timeout} seconds waiting for processing")
            return False

        print(f"Status: {status}, waiting for processing to complete")
        time.sleep(poll_interval)


def download_processed_file(pdf_id, file_format, output_path):
    """Download a converted document from Mathpix and save it to disk.

    Args:
        pdf_id: identifier of the processed PDF.
        file_format: extension appended to the PDF URL (for example ``'mmd'``).
        output_path: file that receives the raw response body.

    Nothing is written when the server answers with an HTTP error status.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }

    response = requests.get(url, headers=headers)
    if not response.ok:
        # Never save an error payload: main() treats an existing output file
        # as a finished conversion and would reuse it on every later run.
        print(f"Error: Unable to download {file_format} file (HTTP {response.status_code})")
        return

    with open(output_path, 'wb') as output_file:
        output_file.write(response.content)
    print(f"File downloaded to {output_path}")


def clear_terminal():
    """Clear the terminal by running the platform's clear-screen command."""
    # Windows reports os.name == 'nt' and uses `cls`; everything else uses `clear`.
    if os.name == 'nt':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)


def print_messages(messages):
    """Print each chat message as ``Role: content``.

    Assistant messages are printed in blue, all other roles in white.

    Args:
        messages: iterable of dicts with ``'role'`` and ``'content'`` keys.
    """
    # Iterate directly: the position of a message is never used.
    for message in messages:
        color = 'blue' if message['role'] == 'assistant' else 'white'
        print(
            colored(f"{message['role'].capitalize()}: {message['content']}", color))


def chat_gpt(messages):
    """Send the conversation to the chat model and record its reply.

    The reply of the first returned choice is appended to ``messages`` in
    place as an ``assistant`` entry, and the same list is returned.
    """
    completion = openai.ChatCompletion.create(
        model=DEFAULT_MODEL,
        messages=messages
    )
    reply = {
        "role": "assistant",
        "content": completion.choices[0].message.content,
    }
    messages.append(reply)
    return messages


def start_question_answering(input_path):
    """Run an interactive question/answer session about a Markdown file.

    The file content is truncated to ``TOKEN_LIMIT`` tokens, sent to the model
    as the first user message, and then questions are read from standard
    input until the user types ``exit`` or closes the input (Ctrl-D / Ctrl-C).

    Args:
        input_path: path of the Markdown file to discuss.
    """
    print("Using model: %s" % DEFAULT_MODEL)

    # Assumes the Markdown file is UTF-8 (it originates from the bytes Mathpix
    # returned) rather than the platform's locale encoding -- TODO confirm.
    with open(input_path, encoding='utf-8') as fh:
        data = fh.read()

    text = data.strip()
    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
    # Keep only the first TOKEN_LIMIT tokens of the document.
    text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
    text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
    ]
    messages = chat_gpt(messages)
    while True:
        clear_terminal()
        print_messages(messages)

        try:
            question = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End the session quietly instead of showing a traceback.
            print()
            break
        if question.lower() == 'exit':
            break
        if not question:
            # Nothing was asked; do not spend an API call on an empty message.
            continue

        messages.append({"role": "user", "content": question})
        messages = chat_gpt(messages)


def _simplify_mmd(mmd):
    """Return ``mmd`` without image lines, \\section markup and backslash escapes."""
    # Drop lines that start with an image reference.
    mmd = '\n'.join(line for line in mmd.split('\n') if not line.startswith('![]'))
    # replace \section{Title} with # Title. Only the brace that closes the
    # section is removed (titles may contain one level of nested braces);
    # every other '}' in the document is left alone.
    mmd = re.sub(r'\\section\{((?:[^{}]|\{[^{}]*\})*)\}', r'# \1', mmd)
    # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
    for escaped, plain in (('\\$', '$'), ('\\%', '%'), ('\\(', '('), ('\\)', ')')):
        mmd = mmd.replace(escaped, plain)
    return mmd


def main():
    """Convert the PDF given on the command line and start a chat about it.

    The Mathpix Markdown is cached next to the PDF as ``<name>.md`` and the
    text actually sent to the model as ``<name>.simple.md``; existing files
    are reused. Exits with an error message when the Markdown cannot be
    obtained.
    """
    if len(sys.argv) < 2:
        print("Usage: python pdfvqa.py <input_pdf_path>")
        return

    input_pdf_path = sys.argv[1]
    # Swap only the final extension; str.replace would rewrite every '.pdf'
    # in the path and leave a path without '.pdf' unchanged.
    base_path, _ = os.path.splitext(input_pdf_path)
    output_mmd_path = base_path + '.md'
    output_simplemd_path = base_path + '.simple.md'

    if not os.path.exists(output_mmd_path):
        pdf_id = send_pdf_to_mathpix(input_pdf_path)
        if pdf_id and wait_for_processing(pdf_id):
            download_processed_file(pdf_id, 'mmd', output_mmd_path)
        if not os.path.exists(output_mmd_path):
            # Upload, processing or download failed; there is nothing to read.
            sys.exit(f"Error: could not obtain Markdown for {input_pdf_path}")

    if not os.path.exists(output_simplemd_path):
        # Assumes the Mathpix download is UTF-8 -- TODO confirm.
        with open(output_mmd_path, 'r', encoding='utf-8') as mmd_file:
            mmd = mmd_file.read()
        if not IS_SCIENTIFIC:
            # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
            # Keep it if the paper is a scientific paper.
            mmd = _simplify_mmd(mmd)
        with open(output_simplemd_path, 'w', encoding='utf-8') as simplemd_file:
            simplemd_file.write(mmd)

    start_question_answering(output_simplemd_path)


# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()