from datetime import datetime, timedelta
import concurrent.futures
import csv
import html
import os
import time

from bs4 import BeautifulSoup
from dotenv import load_dotenv
import nltk
import openai
import pandas as pd
from sec_api import ExtractorApi, QueryApi
import tiktoken

# Load environment variables for sensitive data and configuration
load_dotenv()

# Global configurations for tickers, API keys, and output settings

# Create a .env file with your sec and openai API keys
# SEC_API_KEY="..."
# OPENAI_API_KEY="sk-..."

# You can get your free api key here https://sec-api.io/signup/free
SEC_API_KEY = os.getenv("SEC_API_KEY")

# Your OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Add more tickers here
TICKERS = ["AAPL"]

FILING_URLS_FILE = "filing_urls.csv"
OUTPUT_BASE_DIR = datetime.now().strftime('%Y%m%d_%H%M%S')
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

# Initialize SEC and OpenAI API clients
queryApi = QueryApi(api_key=SEC_API_KEY)
extractorApi = ExtractorApi(api_key=SEC_API_KEY)
openai.api_key = OPENAI_API_KEY
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def get_date_range():
    """
    Returns a date range of one week from today's date.
    """
    end_date = datetime.now().strftime('%Y-%m-%d')
    # Іelect dataframe here
    start_date = (datetime.now() - timedelta(days=90)).strftime('%Y-%m-%d')
    return start_date, end_date


def fetch_filing_urls(start_date, end_date):
    """
    Fetches filing URLs from the SEC API for the specified
    tickers within the given date range.
    """

    base_query = {
        "query": {
            "query_string": {
                "query": "PLACEHOLDER",
                "time_zone": "America/New_York"
            }
        },
        "from": "0",
        "size": "200",
        "sort": [{"filedAt": {"order": "desc"}}]
    }

    with open(FILING_URLS_FILE, "w", newline='') as log_file:
        writer = csv.writer(log_file)
        writer.writerow(['Company', 'Ticker', 'Filed Date', 'Report Year', 'Report Type', 'URL', 'Report Date'])

        for ticker in TICKERS:
            print(f"Starting download for ticker {ticker}")

            universe_query = (
                f'formType:(\"10-K\" OR \"10-Q\") AND '
                f'filedAt:[{start_date} TO {end_date}] AND '
                f'ticker:{ticker}'
            )
            base_query["query"]["query_string"]["query"] = universe_query

            for from_batch in range(0, 9800, 200):
                base_query["from"] = str(from_batch)
                response = queryApi.get_filings(base_query)

                if len(response["filings"]) == 0:
                    break

                rows = [
                    [
                        x['companyName'],
                        ticker,
                        x['filedAt'],
                        int(x['filedAt'][:4]),
                        x['formType'],
                        x["linkToFilingDetails"],
                        x['filedAt']
                    ]
                    for x in response["filings"]
                ]
                writer.writerows(rows)

            print(f"Filing URLs downloaded for {ticker}")


def mark_tables_in_html(html_content):
    """
    Marks tables in the provided HTML content for easier processing later.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for table in soup.find_all('table'):
        table_str = '##TABLE_START\n' + str(table) + '\n##TABLE_END'
        table.replace_with(BeautifulSoup(table_str, 'html.parser'))
    return soup.get_text()


def split_text(input_text, token_limit=6000):
    """
    Splits the text into sections ensuring that each section
    is below the specified token limit so ChatGPT can process it.
    """
    sections = []
    current_section = ""
    current_count = 0
    table_flag = False

    sentences = nltk.sent_tokenize(input_text)

    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        if '##TABLE_START' in tokens:
            table_flag = True
        elif '##TABLE_END' in tokens:
            table_flag = False

        token_count = len(encoding.encode(sentence))
        if current_count + token_count <= token_limit or table_flag:
            current_section += sentence + " "
            current_count += token_count
        else:
            sections.append(current_section.strip())
            current_section = sentence + " "
            current_count = token_count

        if not table_flag and current_count + len(encoding.encode(current_section)) > token_limit:
            sections.append(current_section.strip())
            current_section = ""
            current_count = 0

    if current_section:
        sections.append(current_section.strip())

    return sections


def process_report(row):
    """
    Extracts the Management Discussion & Analysis section
    from a 10-K or 10-Q report and processes it.
    """
    report_type = row['Report Type']
    filing_url = row['URL']

    if report_type == "10-K":
        section_text = extractorApi.get_section(filing_url, '7', 'html')
    elif report_type == "10-Q":
        section_text = extractorApi.get_section(filing_url, 'part1item2', 'html')
    else:
        print(f"Unknown report type: {report_type} for company: {row['Company']}, year: {row['Report Year']}")
        return

    marked_text = mark_tables_in_html(section_text)
    decoded_text = html.unescape(marked_text)
    sections = split_text(decoded_text)

    for section in sections:
        with open(os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv'), 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([row['Company'], row['Report Year'], report_type, row['Report Date'], section])


def summarize_row(row, index):
    """
    Summarizes a row from the extracted report using gpt-3.5-16k
    """
    while True:
        try:
            # Use the OpenAI API to summarize the text
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-16k",
                messages = [
                    {
                        "role": "system",
                        "content": "You are an assistant."
                    },
                    {
                        "role": "user",
                        "content": (
                            f'This is a table/page from a Management Discussion & Analysis section '
                            f'of {row["Report Year"]} {row["Report Type"]} report from {row["Company"]} published. '
                            'Using only data provided below please write a short and structured executive summary, '
                            f'use numbers and relative metrics.\n\n Table/page:\n"{row["Section"]}"'
                        )
                    }
                ]
            )
            # Extract the assistant's reply
            summarized_text = response['choices'][0]['message']['content']
            return index, summarized_text

        except Exception as e:
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(5)


def generate_summaries_gpt35():
    """
    Uses gpt-3.5-16k to generate summaries for
    all the reports in 4 parallel streams.
    """
    input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv')
    df = pd.read_csv(input_file)

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(summarize_row, row, index) for index, row in df.iterrows()]

        for future in concurrent.futures.as_completed(futures):
            index, summarized_text = future.result()

            # Add the summarized text to the original dataframe
            df.loc[index, 'Summarized'] = summarized_text

            # Save the dataframe with the summarized text to a new csv file after each summary
            output_file = os.path.join(OUTPUT_BASE_DIR, os.path.basename(input_file).split('.')[0] + '_gpt35_summary.csv')
            df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False)
            print(f"Total lines processed: {len(df[df['Summarized'].notnull()])}")


def create_3_tweets(row, index):
    """
    Uses gpt-4 to generate 3 tweets per summary.
    """
    while True:
        try:
            # Use the OpenAI API to write funny tweets
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages = [
                    {
                        "role": "system",
                        "content": "You are one of the best comedians in the world writing hilarious jokes about the stocks."
                    },
                    {
                        "role": "user",
                        "content": (
                            f'Write 3 funny and sarcastic tweets about {row["Company"]} '
                            f'performance based on the summary of their {row["Report Type"]} '
                            f'financial report for {row["Report Year"]} below. '
                            'Make sure to use numbers and metrics, be insightful. '
                            'Try to be really creative, mix satire, sarcasm, unexpectedness, '
                            'exaggeration, provocation and risk to create the top jokes:'
                            f'\n"{row["Summarized"]}"'
                        )
                    }
                ]
            )
            summarized_text = response['choices'][0]['message']['content']
            return index, summarized_text

        except Exception as e:
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(5)


def generate_tweets_gpt4():
    """
    Uses gpt-4 to generate tweets for all the summaries in 2 parallel streams.
    """
    # Adjust this path to match where the summarized reports are stored
    input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports_gpt35_summary.csv')
    df = pd.read_csv(input_file, encoding='utf-8')

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(create_3_tweets, row, index) for index, row in df.iterrows()]

        for future in concurrent.futures.as_completed(futures):
            index, tweet_text = future.result()
            df.loc[index, 'Tweets'] = tweet_text

            output_file = input_file.split('.')[0] + '_gpt4_tweets.csv'
            df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False, encoding='utf-8')
            print(f"Total lines processed: {len(df[df['Tweets'].notnull()])}")


def main():
    # Downloading the required dataset for sentence tokenization
    nltk.download('punkt')

    # Fetch the date range and filing URLs
    start_date, end_date = get_date_range()
    fetch_filing_urls(start_date, end_date)

    # Initialize an empty DataFrame to store the filings
    filings_df = pd.read_csv(FILING_URLS_FILE)

    # Initialize the CSV file to store all reports
    with open(os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Company', 'Report Year', 'Report Type', 'Report Date', 'Section'])

    # Process each report directly
    for _, row in filings_df.iterrows():
        process_report(row)

    # Summarize the reports
    generate_summaries_gpt35()

    # Create funny tweets
    generate_tweets_gpt4()


if __name__ == '__main__':
    main()