Created
April 13, 2025 01:10
-
-
Save maurobaraldi/809533dfda1fb7bfa23a017eabfd238b to your computer and use it in GitHub Desktop.
Revisions
-
maurobaraldi created this gist
Apr 13, 2025 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,10 @@ FROM python:3.11-alpine WORKDIR /app RUN apk add --no-cache --upgrade py3-numpy py3-pandas py3-lxml py3-requests gcc build-base py3-scikit-learn RUN pip install numpy pandas lxml scikit-learn requests selenium RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" > /etc/apk/repositories \ && echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories RUN apk update RUN apk add --no-cache --upgrade chromium-chromedriver This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,58 @@ # Download Cocoa Futures Historical Data Contracts traded on the New York Board of Trade, downloaded from Yahoo Finance. ## Running on machine To run on your machine you will need the following libraries: **Python** * lxml * requests * selenium **External apps/lib** * Chromium Web Driver To run the app, save the file in a folder and run: ``` python download.py ``` ## Running on Docker Save all files of this Gist in the same folder and build the image contained in this Gist using the following command: ``` docker build -t cocoa-futures . ``` Then run the container with the following command: ``` docker run --name cocoa-futures -v ${PWD}:/app cocoa-futures python download.py ``` It will export all the contracts' history data to the current directory. 
## Source of data For an updated list of cocoa contracts, see [https://finance.yahoo.com/quote/KC=F/futures/](https://finance.yahoo.com/quote/CCU25.NYB/futures/) If you just want the data already exported, look here: [https://github.com/maurobaraldi/financial_data/tree/main/cocoa_futures_nyb](https://github.com/maurobaraldi/financial_data/tree/main/cocoa_futures_nyb) ## Thanks If you are considering buying me a coffee or a beer: [https://buymeacoffee.com/maurobaraldi](https://buymeacoffee.com/maurobaraldi) **For Crypto** Solana: FQsA6ZiAMeWvKzeX4P4h6wCAhbYKafZJF3UNYf3qm7b6 Bitcoin: bc1qvkhwcjuhap2pxlr2cnnj6jxvlsw9a4y899srpd Ethereum: 0xD585b77B46f3B533FdE12F0062cdA159d99a42CC This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,116 @@ ''' Download Cocoa Futures Historical Data, traded in New York Board of Trade, from Yahoo Finance. 
author: Mauro Baraldi email: [email protected] revision: 0.0.1 date: 13/05/2025 buy me a coffee: https://buymeacoffee.com/maurobaraldi Contracts Settlement Date KCK25.NYB Coffee May 25 KCN25.NYB Coffee Jul 25 KCU25.NYB Coffee Sep 25 KCZ25.NYB Coffee Dec 25 KCH26.NYB Coffee Mar 26 KCK26.NYB Coffee May 26 KCN26.NYB Coffee Jul 26 KCU26.NYB Coffee Sep 26 KCZ26.NYB Coffee Dec 26 KCH27.NYB Coffee Mar 27 KCK27.NYB Coffee May 27 KCN27.NYB Coffee Jul 27 KCU27.NYB Coffee Sep 27 KCZ27.NYB Coffee Dec 27 For an updated list of cocoa contracts look for https://finance.yahoo.com/quote/KC%3DF/futures/ ''' from csv import DictWriter from datetime import datetime from os import remove from time import sleep, time from lxml import html from requests import get from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.service import Service from webdriver_manager.chrome import ChromeDriverManager contracts = ["KCK25.NYB", "KCN25.NYB","KCU25.NYB","KCZ25.NYB","KCH26.NYB","KCK26.NYB","KCN26.NYB","KCU26.NYB","KCZ26.NYB","KCH27.NYB","KCK27.NYB","KCN27.NYB","KCU27.NYB","KCZ27.NYB"] def download(): # Options for web browser (Chromium) options = Options() options.add_argument('--headless') options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') for contract in contracts: print(f"Downloading contract {contract}...") # Web browser (Chromium) engine driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options) # Filter data since August 1 2022 (max) until today, driver.get(f"https://finance.yahoo.com/quote/{contract}/history/?filter=history&period1=1659326400&period2={int(time())}") # Wait to load modal for accepting or rejecting cookies sleep(3) # Reject cookies. 
modal = driver.find_element("xpath", "//button[contains(@class, 'btn') and contains(@class, 'secondary') and contains(@class, 'reject-all')]") driver.execute_script('arguments[0].click()', modal) # Wait to load historical data page sleep(3) # Scroll page until the end to load all data driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # Save the HTML data to parse it to CSV with open(f"./{contract}.html", "w") as _: _.write(driver.page_source) driver.close() print(f"Contract {contract} downloaded successfully.") # Wait 5 seconds for the next download to avoid block sleep(5) def convert_to_csv(): # Iterate over each contract for contract in contracts: result = [] with open(f"./{contract}.html", 'r', encoding='utf-8') as c: htmldoc = html.fromstring(c.read()) # Filter rows and columns to extract table = htmldoc.xpath("//table[contains(@class, 'table') and contains(@class, 'yf-1jecxey') and contains(@class, 'noDl') and contains(@class, 'hideOnPrint')]") columns = [i.text.strip() for i in table[0].xpath("//th")] rows = table[0].xpath("//tbody//tr") # Clean and prepare data for row in rows[::-1]: day = [datetime.strptime(row.getchildren()[0].text, '%b %d, %Y')]#.strftime("%Y-%m-%d")] ohlc = [float(i.text[:-3].replace(",", ".")) for i in row.getchildren()[1:-2]] volume = [float(row.getchildren()[-1].text.replace(",", ".").replace("-", "0"))] result.append(dict(zip(columns, day + ohlc + volume))) # Save data to CSV file with open(f'{contract}.csv', 'w', newline='') as csvfile: writer = DictWriter(csvfile, fieldnames=columns) writer.writeheader() writer.writerows(result) remove(f"./{contract}.html") if __name__ == "__main__": #download() convert_to_csv()