Skip to content

Instantly share code, notes, and snippets.

@maurobaraldi
Created April 13, 2025 01:10
Show Gist options
  • Save maurobaraldi/809533dfda1fb7bfa23a017eabfd238b to your computer and use it in GitHub Desktop.
Save maurobaraldi/809533dfda1fb7bfa23a017eabfd238b to your computer and use it in GitHub Desktop.

Revisions

  1. maurobaraldi created this gist Apr 13, 2025.
    10 changes: 10 additions & 0 deletions Dockerfile
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,10 @@
    # Image used to run download.py: Python 3.11 on Alpine plus the Chromium WebDriver.
    FROM python:3.11-alpine

    WORKDIR /app

    # System-level Python packages and a C toolchain (gcc, build-base);
    # presumably the toolchain lets pip compile packages that lack prebuilt wheels.
    RUN apk add --no-cache --upgrade py3-numpy py3-pandas py3-lxml py3-requests gcc build-base py3-scikit-learn
    # webdriver-manager is required because download.py imports webdriver_manager.chrome;
    # without it the script fails at import time inside the container.
    RUN pip install numpy pandas lxml scikit-learn requests selenium webdriver-manager
    # Replace the apk repository list with the Alpine edge repositories
    # before installing the Chromium driver.
    RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" > /etc/apk/repositories \
    && echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories
    RUN apk update
    # download.py starts the driver from /usr/bin/chromedriver.
    RUN apk add --no-cache --upgrade chromium-chromedriver
    58 changes: 58 additions & 0 deletions README.md
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,58 @@
    # Download Cocoa Futures Historical Data

    Contracts traded on the New York Board of Trade, downloaded from Yahoo Finance.

    ## Running on machine

    To run it on your machine you will need the following libraries:

    **Python**

    * lxml
    * requests
    * selenium
    * webdriver-manager

    **External apps/lib**

    * Chromium Web Driver

    To run the app, save the file in a folder and run:

    ```
    python download.py
    ```

    ## Running on Docker

    Save all the files of this Gist in the same folder and build the image using the following command:

    ```
    docker build -t cocoa-futures .
    ```

    Then run the container with the following command:

    ```
    docker run --name cocoa-futures -v ${PWD}:/app cocoa-futures python download.py
    ```

    It will export the historical data of all contracts to the current directory.

    ## Source of data

    For an updated list of cocoa contracts see [https://finance.yahoo.com/quote/KC=F/futures/](https://finance.yahoo.com/quote/CCU25.NYB/futures/)

    If you just want the data already exported look here [https://github.com/maurobaraldi/financial_data/tree/main/cocoa_futures_nyb](https://github.com/maurobaraldi/financial_data/tree/main/cocoa_futures_nyb)

    ## Thanks

    If you are considering buying me a coffee or a beer: [https://buymeacoffee.com/maurobaraldi](https://buymeacoffee.com/maurobaraldi)

    **For Crypto**

    Solana: FQsA6ZiAMeWvKzeX4P4h6wCAhbYKafZJF3UNYf3qm7b6

    Bitcoin: bc1qvkhwcjuhap2pxlr2cnnj6jxvlsw9a4y899srpd

    Ethereum: 0xD585b77B46f3B533FdE12F0062cdA159d99a42CC

    116 changes: 116 additions & 0 deletions download.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,116 @@
    '''
    Download Coffee Futures Historical Data, traded on the New York Board of Trade, from Yahoo Finance.
    author: Mauro Baraldi
    email: [email protected]
    revision: 0.0.1
    date: 13/05/2025
    buy me a coffee: https://buymeacoffee.com/maurobaraldi
    Contracts Settlement Date
    KCK25.NYB Coffee May 25
    KCN25.NYB Coffee Jul 25
    KCU25.NYB Coffee Sep 25
    KCZ25.NYB Coffee Dec 25
    KCH26.NYB Coffee Mar 26
    KCK26.NYB Coffee May 26
    KCN26.NYB Coffee Jul 26
    KCU26.NYB Coffee Sep 26
    KCZ26.NYB Coffee Dec 26
    KCH27.NYB Coffee Mar 27
    KCK27.NYB Coffee May 27
    KCN27.NYB Coffee Jul 27
    KCU27.NYB Coffee Sep 27
    KCZ27.NYB Coffee Dec 27
    For an updated list of coffee contracts see https://finance.yahoo.com/quote/KC%3DF/futures/
    '''

    from csv import DictWriter
    from datetime import datetime
    from os import remove
    from time import sleep, time

    from lxml import html
    from requests import get
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager


    # Yahoo Finance symbols of the futures contracts to process, in expiry order
    # (one row per expiry year).
    contracts = [
        "KCK25.NYB", "KCN25.NYB", "KCU25.NYB", "KCZ25.NYB",
        "KCH26.NYB", "KCK26.NYB", "KCN26.NYB", "KCU26.NYB", "KCZ26.NYB",
        "KCH27.NYB", "KCK27.NYB", "KCN27.NYB", "KCU27.NYB", "KCZ27.NYB",
    ]

    def download():
        """Save each contract's Yahoo Finance history page as ``./<contract>.html``.

        For every symbol in ``contracts`` a headless Chromium session is started
        through the driver at ``/usr/bin/chromedriver``, the history page is
        opened, the cookie banner is rejected when it is shown, the page is
        scrolled to the bottom and its source is written to disk as UTF-8 (the
        encoding ``convert_to_csv`` reads the file with).

        The browser session is always shut down with ``quit()``, even when a
        step fails, so no chromedriver process is left behind per contract.
        """
        # Options for web browser (Chromium)
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        for contract in contracts:

            print(f"Downloading contract {contract}...")

            # Web browser (Chromium) engine
            driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
            try:
                # Filter data since August 1 2022 (max) until today.
                driver.get(f"https://finance.yahoo.com/quote/{contract}/history/?filter=history&period1=1659326400&period2={int(time())}")

                # Wait to load modal for accepting or rejecting cookies
                sleep(3)

                # Reject cookies. find_elements returns an empty list instead of
                # raising when the consent modal is not displayed.
                reject_buttons = driver.find_elements("xpath", "//button[contains(@class, 'btn') and contains(@class, 'secondary') and contains(@class, 'reject-all')]")
                if reject_buttons:
                    driver.execute_script('arguments[0].click()', reject_buttons[0])

                # Wait to load historical data page
                sleep(3)

                # Scroll page until the end to load all data
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

                # Save the HTML data to parse it to CSV
                with open(f"./{contract}.html", "w", encoding="utf-8") as page_file:
                    page_file.write(driver.page_source)
            finally:
                # quit() ends the whole session; close() would only close the window.
                driver.quit()

            print(f"Contract {contract} downloaded successfully.")

            # Wait 5 seconds for the next download to avoid block
            sleep(5)

    def _parse_number(text):
        """Convert a table cell such as ``"1,234.50"`` to a float (commas are thousands separators)."""
        return float(text.strip().replace(",", ""))


    def convert_to_csv():
        """Convert every ``./<contract>.html`` page into ``<contract>.csv``.

        For each symbol in ``contracts`` the saved page is parsed, the history
        table is located, and its rows are written oldest-first to a CSV file
        whose header is taken from the table's ``<th>`` cells. The first cell of
        a row is parsed as a date (``'%b %d, %Y'``), the last cell as the volume
        (``"-"`` becomes ``0``) and every cell in between as a price. The HTML
        file is deleted once the CSV has been written.

        Raises:
            IndexError: if the history table is not found in a page.
            FileNotFoundError: if a contract's HTML file does not exist.
        """
        # Iterate over each contract
        for contract in contracts:

            with open(f"./{contract}.html", 'r', encoding='utf-8') as page_file:
                htmldoc = html.fromstring(page_file.read())

            # Filter rows and columns to extract
            tables = htmldoc.xpath("//table[contains(@class, 'table') and contains(@class, 'yf-1jecxey') and contains(@class, 'noDl') and contains(@class, 'hideOnPrint')]")
            if not tables:
                raise IndexError(f"No historical data table found in ./{contract}.html")
            table = tables[0]

            # The leading "." keeps the search inside this table; a bare "//"
            # would match nodes anywhere in the document.
            columns = [(th.text or "").strip() for th in table.xpath(".//th")]
            rows = table.xpath(".//tbody//tr")

            # Clean and prepare data (the page lists newest first; emit oldest first)
            result = []
            for row in reversed(rows):
                cells = row.getchildren()
                # Need at least a date, one price and a volume cell.
                if len(cells) < 3:
                    continue
                day = [datetime.strptime(cells[0].text_content().strip(), '%b %d, %Y')]
                # Every cell between the date and the volume is a price, so the
                # values line up one-to-one with the header columns.
                prices = [_parse_number(cell.text_content()) for cell in cells[1:-1]]
                volume_text = cells[-1].text_content().strip()
                volume = [0.0 if volume_text == "-" else _parse_number(volume_text)]
                result.append(dict(zip(columns, day + prices + volume)))

            # Save data to CSV file
            with open(f'{contract}.csv', 'w', newline='') as csvfile:
                writer = DictWriter(csvfile, fieldnames=columns)
                writer.writeheader()
                writer.writerows(result)

            remove(f"./{contract}.html")

    if __name__ == "__main__":
        # Fetch the pages first: convert_to_csv() reads the ./<contract>.html
        # files that download() writes, and fails if they are missing.
        download()
        convert_to_csv()