from datetime import datetime, timedelta
import concurrent.futures
import csv
import html
import os
import time

from bs4 import BeautifulSoup
from dotenv import load_dotenv
import nltk
import openai
import pandas as pd
from sec_api import ExtractorApi, QueryApi
import tiktoken

# Load environment variables for sensitive data and configuration
load_dotenv()

# Global configuration for tickers, API keys, and output settings.
# Create a .env file with your SEC and OpenAI API keys:
# SEC_API_KEY="..."
# OPENAI_API_KEY="sk-..."

# You can get your free API key here: https://sec-api.io/signup/free
SEC_API_KEY = os.getenv("SEC_API_KEY")
# Your OpenAI API key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Add more tickers here
TICKERS = ["AAPL"]
FILING_URLS_FILE = "filing_urls.csv"
OUTPUT_BASE_DIR = datetime.now().strftime('%Y%m%d_%H%M%S')
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

# Initialize SEC and OpenAI API clients
queryApi = QueryApi(api_key=SEC_API_KEY)
extractorApi = ExtractorApi(api_key=SEC_API_KEY)
openai.api_key = OPENAI_API_KEY
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def get_date_range():
    """
    Returns a date range covering the last 90 days, ending today.
    """
    end_date = datetime.now().strftime('%Y-%m-%d')
    # Adjust the lookback window (in days) here
    start_date = (datetime.now() - timedelta(days=90)).strftime('%Y-%m-%d')
    return start_date, end_date


def fetch_filing_urls(start_date, end_date):
    """
    Fetches filing URLs from the SEC API for the specified tickers within the given date range.
    """
    base_query = {
        "query": {
            "query_string": {
                "query": "PLACEHOLDER",
                "time_zone": "America/New_York"
            }
        },
        "from": "0",
        "size": "200",
        "sort": [{"filedAt": {"order": "desc"}}]
    }

    with open(FILING_URLS_FILE, "w", newline='') as log_file:
        writer = csv.writer(log_file)
        writer.writerow(['Company', 'Ticker', 'Filed Date', 'Report Year', 'Report Type', 'URL', 'Report Date'])

        for ticker in TICKERS:
            print(f"Starting download for ticker {ticker}")

            universe_query = (
                f'formType:("10-K" OR "10-Q") AND '
                f'filedAt:[{start_date} TO {end_date}] AND '
                f'ticker:{ticker}'
            )
            base_query["query"]["query_string"]["query"] = universe_query

            for from_batch in range(0, 9800, 200):
                base_query["from"] = str(from_batch)
                response = queryApi.get_filings(base_query)

                if len(response["filings"]) == 0:
                    break

                rows = [
                    [
                        x['companyName'],
                        ticker,
                        x['filedAt'],
                        int(x['filedAt'][:4]),
                        x['formType'],
                        x["linkToFilingDetails"],
                        x['filedAt']
                    ]
                    for x in response["filings"]
                ]
                writer.writerows(rows)

            print(f"Filing URLs downloaded for {ticker}")


def mark_tables_in_html(html_content):
    """
    Marks tables in the provided HTML content for easier processing later.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for table in soup.find_all('table'):
        table_str = '##TABLE_START\n' + str(table) + '\n##TABLE_END'
        table.replace_with(BeautifulSoup(table_str, 'html.parser'))
    return soup.get_text()


def split_text(input_text, token_limit=6000):
    """
    Splits the text into sections, ensuring that each section is below the specified
    token limit so ChatGPT can process it.
    """
    sections = []
    current_section = ""
    current_count = 0
    table_flag = False

    sentences = nltk.sent_tokenize(input_text)

    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        if '##TABLE_START' in tokens:
            table_flag = True
        elif '##TABLE_END' in tokens:
            table_flag = False

        token_count = len(encoding.encode(sentence))

        if current_count + token_count <= token_limit or table_flag:
            current_section += sentence + " "
            current_count += token_count
        else:
            sections.append(current_section.strip())
            current_section = sentence + " "
            current_count = token_count

        if not table_flag and current_count + len(encoding.encode(current_section)) > token_limit:
            sections.append(current_section.strip())
            current_section = ""
            current_count = 0

    if current_section:
        sections.append(current_section.strip())

    return sections
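
# Illustrative sketch (not part of the original pipeline and never called by it): shows
# how split_text() chunks plain text against the tiktoken encoding defined above. The
# sample sentences and the 25-token limit are made up purely for demonstration.
def _demo_split_text():
    sample = (
        "Revenue grew 12% year over year. "
        "Operating margin contracted by 150 basis points. "
        "Management expects capital expenditures to decline next quarter."
    )
    nltk.download('punkt', quiet=True)  # sentence tokenizer used by split_text()
    print("total tokens:", len(encoding.encode(sample)))
    for i, chunk in enumerate(split_text(sample, token_limit=25)):
        print(f"chunk {i}: {chunk}")
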
""" sections = [] current_section = "" current_count = 0 table_flag = False sentences = nltk.sent_tokenize(input_text) for sentence in sentences: tokens = nltk.word_tokenize(sentence) if '##TABLE_START' in tokens: table_flag = True elif '##TABLE_END' in tokens: table_flag = False token_count = len(encoding.encode(sentence)) if current_count + token_count <= token_limit or table_flag: current_section += sentence + " " current_count += token_count else: sections.append(current_section.strip()) current_section = sentence + " " current_count = token_count if not table_flag and current_count + len(encoding.encode(current_section)) > token_limit: sections.append(current_section.strip()) current_section = "" current_count = 0 if current_section: sections.append(current_section.strip()) return sections def process_report(row): """ Extracts the Management Discussion & Analysis section from a 10-K or 10-Q report and processes it. """ report_type = row['Report Type'] filing_url = row['URL'] if report_type == "10-K": section_text = extractorApi.get_section(filing_url, '7', 'html') elif report_type == "10-Q": section_text = extractorApi.get_section(filing_url, 'part1item2', 'html') else: print(f"Unknown report type: {report_type} for company: {row['Company']}, year: {row['Report Year']}") return marked_text = mark_tables_in_html(section_text) decoded_text = html.unescape(marked_text) sections = split_text(decoded_text) for section in sections: with open(os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv'), 'a', newline='', encoding='utf-8') as f: writer = csv.writer(f) writer.writerow([row['Company'], row['Report Year'], report_type, row['Report Date'], section]) def summarize_row(row, index): """ Summarizes a row from the extracted report using gpt-3.5-16k """ while True: try: # Use the OpenAI API to summarize the text response = openai.ChatCompletion.create( model="gpt-3.5-turbo-16k", messages = [ { "role": "system", "content": "You are an assistant." }, { "role": "user", "content": ( f'This is a table/page from a Management Discussion & Analysis section ' f'of {row["Report Year"]} {row["Report Type"]} report from {row["Company"]} published. ' 'Using only data provided below please write a short and structured executive summary, ' f'use numbers and relative metrics.\n\n Table/page:\n"{row["Section"]}"' ) } ] ) # Extract the assistant's reply summarized_text = response['choices'][0]['message']['content'] return index, summarized_text except Exception as e: print(f"An error occurred: {e}. Retrying...") time.sleep(5) def generate_summaries_gpt35(): """ Uses gpt-3.5-16k to generate summaries for all the reports in 4 parallel streams. """ input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv') df = pd.read_csv(input_file) with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: futures = [executor.submit(summarize_row, row, index) for index, row in df.iterrows()] for future in concurrent.futures.as_completed(futures): index, summarized_text = future.result() # Add the summarized text to the original dataframe df.loc[index, 'Summarized'] = summarized_text # Save the dataframe with the summarized text to a new csv file after each summary output_file = os.path.join(OUTPUT_BASE_DIR, os.path.basename(input_file).split('.')[0] + '_gpt35_summary.csv') df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False) print(f"Total lines processed: {len(df[df['Summarized'].notnull()])}") def create_3_tweets(row, index): """ Uses gpt-4 to generate 3 tweets per summary. 
""" while True: try: # Use the OpenAI API to write funny tweets response = openai.ChatCompletion.create( model="gpt-4", messages = [ { "role": "system", "content": "You are one of the best comedians in the world writing hilarious jokes about the stocks." }, { "role": "user", "content": ( f'Write 3 funny and sarcastic tweets about {row["Company"]} ' f'performance based on the summary of their {row["Report Type"]} ' f'financial report for {row["Report Year"]} below. ' 'Make sure to use numbers and metrics, be insightful. ' 'Try to be really creative, mix satire, sarcasm, unexpectedness, ' 'exaggeration, provocation and risk to create the top jokes:' f'\n"{row["Summarized"]}"' ) } ] ) summarized_text = response['choices'][0]['message']['content'] return index, summarized_text except Exception as e: print(f"An error occurred: {e}. Retrying...") time.sleep(5) def generate_tweets_gpt4(): """ Uses gpt-4 to generate tweets for all the summaries in 2 parallel streams. """ # Adjust this path to match where the summarized reports are stored input_file = os.path.join(OUTPUT_BASE_DIR, 'all_reports_gpt35_summary.csv') df = pd.read_csv(input_file, encoding='utf-8') with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: futures = [executor.submit(create_3_tweets, row, index) for index, row in df.iterrows()] for future in concurrent.futures.as_completed(futures): index, tweet_text = future.result() df.loc[index, 'Tweets'] = tweet_text output_file = input_file.split('.')[0] + '_gpt4_tweets.csv' df.sort_index().to_csv(output_file, quoting=csv.QUOTE_NONNUMERIC, index=False, encoding='utf-8') print(f"Total lines processed: {len(df[df['Tweets'].notnull()])}") def main(): # Downloading the required dataset for sentence tokenization nltk.download('punkt') # Fetch the date range and filing URLs start_date, end_date = get_date_range() fetch_filing_urls(start_date, end_date) # Initialize an empty DataFrame to store the filings filings_df = pd.read_csv(FILING_URLS_FILE) # Initialize the CSV file to store all reports with open(os.path.join(OUTPUT_BASE_DIR, 'all_reports.csv'), 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(['Company', 'Report Year', 'Report Type', 'Report Date', 'Section']) # Process each report directly for _, row in filings_df.iterrows(): process_report(row) # Summarize the reports generate_summaries_gpt35() # Create funny tweets generate_tweets_gpt4() if __name__ == '__main__': main()