maurobaraldi · April 13, 2025 01:10 · Apr 13, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,10 @@
+# Base image: official CPython 3.11 on Alpine Linux.
+FROM python:3.11-alpine
+
+# Working directory; the README bind-mounts the gist files here at run time.
+WORKDIR /app
+
+# Compiler toolchain (used only if pip has to build a package from source)
+# plus Alpine's packaged builds of the Python libraries.
+RUN apk add --no-cache --upgrade \
+        build-base \
+        gcc \
+        py3-lxml \
+        py3-numpy \
+        py3-pandas \
+        py3-requests \
+        py3-scikit-learn
+
+# Python packages for download.py. webdriver-manager is imported at module
+# level by the script, so it must be installed or the script fails on import.
+RUN pip install --no-cache-dir \
+        lxml \
+        numpy \
+        pandas \
+        requests \
+        scikit-learn \
+        selenium \
+        webdriver-manager
+
+# chromium-chromedriver is installed from the Alpine edge repositories.
+# The repository switch and the install share one layer; `apk add --no-cache`
+# fetches a fresh index itself, so no separate `apk update` layer is needed.
+RUN echo "https://dl-cdn.alpinelinux.org/alpine/edge/community" > /etc/apk/repositories \
+    && echo "https://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories \
+    && apk add --no-cache --upgrade chromium-chromedriver
diff --git a/README.md b/README.md
@@ -0,0 +1,58 @@
+# Download Cocoa Futures Historical Data
+
+Contracts traded on the New York Board of Trade, downloaded from Yahoo Finance.
+
+## Running on machine
+
+To run it on your machine you will need the following libraries:
+
+**Python**
+
+* lxml
+* requests
+* selenium
+
+**External apps/lib**
+
+* Chromium Web Driver
+
+To run the app, save the files in a folder and run:
+
+```
+python download.py
+```
+
+## Running on Docker
+
+Save all the files of this Gist in the same folder and build the image using the following command:
+
+```
+docker build -t cocoa-futures .
+```
+
+Then run the container with the following command:
+
+```
+docker run --name cocoa-futures -v ${PWD}:/app cocoa-futures python download.py
+```
+
+It will export the historical data of all contracts to the current directory.
+
+## Source of data
+
+For an updated list of cocoa contracts see [https://finance.yahoo.com/quote/KC=F/futures/](https://finance.yahoo.com/quote/CCU25.NYB/futures/)
+
+If you just want the data already exported look here [https://github.com/maurobaraldi/financial_data/tree/main/cocoa_futures_nyb](https://github.com/maurobaraldi/financial_data/tree/main/cocoa_futures_nyb)
+
+## Thanks
+
+If you are considering buying me a coffee or a beer: [https://buymeacoffee.com/maurobaraldi](https://buymeacoffee.com/maurobaraldi)
+
+**For Crypto**
+
+Solana: FQsA6ZiAMeWvKzeX4P4h6wCAhbYKafZJF3UNYf3qm7b6
+
+Bitcoin: bc1qvkhwcjuhap2pxlr2cnnj6jxvlsw9a4y899srpd
+
+Ethereum: 0xD585b77B46f3B533FdE12F0062cdA159d99a42CC
+
diff --git a/download.py b/download.py
@@ -0,0 +1,116 @@
+'''
+Download Cocoa Futures Historical Data, traded in New York Board of Trade, from Yahoo Finance.
+
+author: Mauro Baraldi
+email: [email protected]
+revision: 0.0.1
+date: 13/05/2025
+buy me a coffee: https://buymeacoffee.com/maurobaraldi
+
+Contracts   Settlement Date
+KCK25.NYB   Coffee May 25
+KCN25.NYB   Coffee Jul 25
+KCU25.NYB   Coffee Sep 25
+KCZ25.NYB   Coffee Dec 25
+KCH26.NYB   Coffee Mar 26
+KCK26.NYB   Coffee May 26
+KCN26.NYB   Coffee Jul 26
+KCU26.NYB   Coffee Sep 26
+KCZ26.NYB   Coffee Dec 26
+KCH27.NYB   Coffee Mar 27
+KCK27.NYB   Coffee May 27
+KCN27.NYB   Coffee Jul 27
+KCU27.NYB   Coffee Sep 27
+KCZ27.NYB   Coffee Dec 27
+
+For an updated list of cocoa contracts look for https://finance.yahoo.com/quote/KC%3DF/futures/
+'''
+
+from csv import DictWriter
+from datetime import datetime
+from os import remove
+from time import sleep, time
+
+from lxml import html
+from requests import get
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+
+
+contracts = ["KCK25.NYB", "KCN25.NYB","KCU25.NYB","KCZ25.NYB","KCH26.NYB","KCK26.NYB","KCN26.NYB","KCU26.NYB","KCZ26.NYB","KCH27.NYB","KCK27.NYB","KCN27.NYB","KCU27.NYB","KCZ27.NYB"]
+
+def download():
+
+    # Options for web browser (Chromium)
+    options = Options()
+    options.add_argument('--headless')
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+
+    for contract in contracts:
+
+        print(f"Downloading contract {contract}...")
+
+        # Web browser (Chromium) engine
+        driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
+
+        # Filter data since August 1 2022 (max) until today,
+        driver.get(f"https://finance.yahoo.com/quote/{contract}/history/?filter=history&period1=1659326400&period2={int(time())}")
+
+        # Wait to load modal for accepting or rejecting cookies
+        sleep(3)
+
+        # Reject cookies.
+        modal = driver.find_element("xpath", "//button[contains(@class, 'btn') and contains(@class, 'secondary') and contains(@class, 'reject-all')]")
+        driver.execute_script('arguments[0].click()', modal)
+
+        # Wait to load historical data page
+        sleep(3)
+
+        # Scroll page until the end to load all data
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+
+        # Save the HTML data to parse it to CSV
+        with open(f"./{contract}.html", "w") as _:
+            _.write(driver.page_source)
+
+        driver.close()
+        print(f"Contract {contract} downloaded successfully.")
+
+        # Wait 5 seconds for the next download to avoid block
+        sleep(5)
+
+def convert_to_csv():
+
+    # Iterate over each contract
+    for contract in contracts:
+
+        result = []
+
+        with open(f"./{contract}.html", 'r', encoding='utf-8') as c: 
+            htmldoc = html.fromstring(c.read())
+
+        # Filter rows and columns to extract
+        table = htmldoc.xpath("//table[contains(@class, 'table') and contains(@class, 'yf-1jecxey') and contains(@class, 'noDl') and contains(@class, 'hideOnPrint')]")
+        columns = [i.text.strip() for i in table[0].xpath("//th")]
+        rows = table[0].xpath("//tbody//tr")
+
+        # Clean and prepare data
+        for row in rows[::-1]:
+            day = [datetime.strptime(row.getchildren()[0].text, '%b %d, %Y')]#.strftime("%Y-%m-%d")]
+            ohlc = [float(i.text[:-3].replace(",", ".")) for i in row.getchildren()[1:-2]]
+            volume = [float(row.getchildren()[-1].text.replace(",", ".").replace("-", "0"))]
+            result.append(dict(zip(columns, day + ohlc + volume)))
+
+        # Save data to CSV file
+        with open(f'{contract}.csv', 'w', newline='') as csvfile:
+            writer = DictWriter(csvfile, fieldnames=columns)
+            writer.writeheader()
+            writer.writerows(result)
+        remove(f"./{contract}.html")
+
+if __name__ == "__main__":
+    #download()
+    convert_to_csv()