'''
Download Coffee Futures historical data, traded on the New York Board of Trade,
from Yahoo Finance.

author: Mauro Baraldi
email: mauro.baraldi@gmail.com
revision: 0.0.1
date: 13/05/2025
buy me a coffee: https://buymeacoffee.com/maurobaraldi

Contract        Settlement Date
KCK25.NYB       Coffee May 25
KCN25.NYB       Coffee Jul 25
KCU25.NYB       Coffee Sep 25
KCZ25.NYB       Coffee Dec 25
KCH26.NYB       Coffee Mar 26
KCK26.NYB       Coffee May 26
KCN26.NYB       Coffee Jul 26
KCU26.NYB       Coffee Sep 26
KCZ26.NYB       Coffee Dec 26
KCH27.NYB       Coffee Mar 27
KCK27.NYB       Coffee May 27
KCN27.NYB       Coffee Jul 27
KCU27.NYB       Coffee Sep 27
KCZ27.NYB       Coffee Dec 27

For an updated list of coffee contracts, see https://finance.yahoo.com/quote/KC%3DF/futures/
'''
from csv import DictWriter
from datetime import datetime
from os import remove
from time import sleep, time

from lxml import html
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

contracts = [
    "KCK25.NYB", "KCN25.NYB", "KCU25.NYB", "KCZ25.NYB",
    "KCH26.NYB", "KCK26.NYB", "KCN26.NYB", "KCU26.NYB", "KCZ26.NYB",
    "KCH27.NYB", "KCK27.NYB", "KCN27.NYB", "KCU27.NYB", "KCZ27.NYB",
]


def download():
    # Options for the web browser (Chromium)
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    for contract in contracts:
        print(f"Downloading contract {contract}...")

        # Web browser (Chromium) engine
        driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)

        # Filter data from August 1, 2022 (oldest available) until today
        driver.get(f"https://finance.yahoo.com/quote/{contract}/history/?filter=history&period1=1659326400&period2={int(time())}")

        # Wait for the cookie consent modal to load
        sleep(3)

        # Reject cookies
        modal = driver.find_element(
            "xpath",
            "//button[contains(@class, 'btn') and contains(@class, 'secondary') and contains(@class, 'reject-all')]"
        )
        driver.execute_script('arguments[0].click()', modal)

        # Wait for the historical data page to load
        sleep(3)

        # Scroll to the end of the page to load all the data
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Save the HTML data so it can be parsed into CSV later
        with open(f"./{contract}.html", "w") as _:
            _.write(driver.page_source)

        # End the browser session before starting the next contract
        driver.quit()
        print(f"Contract {contract} downloaded successfully.")

        # Wait 5 seconds before the next download to avoid being blocked
        sleep(5)


def convert_to_csv():
    # Iterate over each contract
    for contract in contracts:
        result = []
        with open(f"./{contract}.html", 'r', encoding='utf-8') as c:
            htmldoc = html.fromstring(c.read())

        # Locate the historical-data table, then extract its header cells and rows
        table = htmldoc.xpath("//table[contains(@class, 'table') and contains(@class, 'yf-1jecxey') and contains(@class, 'noDl') and contains(@class, 'hideOnPrint')]")
        columns = [i.text.strip() for i in table[0].xpath(".//th")]
        rows = table[0].xpath(".//tbody//tr")

        # Clean and prepare the data, oldest row first
        for row in rows[::-1]:
            day = [datetime.strptime(row.getchildren()[0].text, '%b %d, %Y')]  # .strftime("%Y-%m-%d")
            ohlc = [float(i.text[:-3].replace(",", ".")) for i in row.getchildren()[1:-2]]
            volume = [float(row.getchildren()[-1].text.replace(",", ".").replace("-", "0"))]
            result.append(dict(zip(columns, day + ohlc + volume)))

        # Save the data to a CSV file and remove the intermediate HTML
        with open(f'{contract}.csv', 'w', newline='') as csvfile:
            writer = DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()
            writer.writerows(result)

        remove(f"./{contract}.html")


if __name__ == "__main__":
    download()
    convert_to_csv()
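
# Illustrative sketch (not part of the original workflow): one way the generated
# CSV files could be read back for inspection with the standard library. The
# column names are whatever headers the Yahoo Finance history table exposes, so
# no specific keys are assumed; "KCK25.NYB.csv" is simply the first file this
# script produces.
#
# from csv import DictReader
#
# with open("KCK25.NYB.csv", newline="") as csvfile:
#     for record in DictReader(csvfile):
#         print(record)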