Created
November 9, 2023 10:13
-
-
Save cboulanger/dc369a8f29ec0c46f4e2ef764d749807 to your computer and use it in GitHub Desktop.
Revisions
-
cboulanger created this gist
Nov 9, 2023. There are no files selected for viewing.
"""Scrape an EasyChair "smart program" conference site and build an HTML
overview of all sessions grouped by track.

Pipeline:
  1. download_session_data() parses the program page into conference_data.csv
  2. create_track_overview() joins that CSV with the track-title page and
     writes sessions_by_track.html

Requires third-party packages: requests, beautifulsoup4, dateparser.
"""

import csv
import re
from collections import defaultdict
from datetime import datetime

import dateparser
import requests
from bs4 import BeautifulSoup

# Replace <conference_name> with the actual EasyChair conference identifier.
url = "https://easychair.org/smart-program/<conference_name>/program.html"
track_url = "https://easychair.org/smart-program/<conference_name>/de_tracks.html"
css_url = "https://easychair.org/smart-program/<conference_name>/program.css"
page_title = "Conference Name - Tracks und Sessions"

# Single source of truth for the date format written to / read from the CSV.
DATE_FORMAT = '%d.%m.%Y'


def download_session_data(url):
    """Download the program page and write one CSV row per session.

    Writes conference_data.csv with columns
    (session_id, session, date, interval, track, title).

    Raises:
        requests.HTTPError: if the program page cannot be fetched.
    """
    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    with open('conference_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["session_id", "session", "date", "interval", "track", "title"])

        # The most recent <div class="date"> applies to all sessions after it.
        date = None
        for div in soup.find_all('div'):
            css_classes = div.get('class')
            if css_classes == ['date']:
                parsed = dateparser.parse(div.text)
                if parsed is not None:  # keep previous date on unparsable text
                    date = parsed.strftime(DATE_FORMAT)
            elif css_classes and 'session' in css_classes:
                anchor = div.find('a')
                heading_div = div.find('div', class_='heading')
                if anchor is None or heading_div is None:
                    continue  # malformed session div: skip instead of crashing
                session_id = (anchor.get('name') or '').replace('session:', '')
                # Heading looks like: "<interval> Session <id>: Track <n>: <title>"
                matches = re.search(
                    r'(.+)\s*Session\s*(\w+)\s*:\s*Track\s*(\w+):\s*(.+)',
                    heading_div.text)
                if matches:
                    interval, session, track, title = matches.groups()
                    writer.writerow(
                        [session_id, session, date, interval.strip(), track, title])


def create_track_overview(track_url, program_url, page_title, css_url):
    """Build sessions_by_track.html from the track page and conference_data.csv.

    Args:
        track_url: EasyChair page listing one <h3><a>title</a></h3> per track.
        program_url: program page URL used for per-session deep links.
        page_title: HTML <title> of the generated overview.
        css_url: stylesheet linked from the generated overview.

    Raises:
        requests.HTTPError: if the track page cannot be fetched.
    """
    response = requests.get(track_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # Map track number (as string) -> track title, in page order.
    tracks = {}
    track_number = 1
    for h3 in soup.find_all('h3'):
        a = h3.find('a')
        if a:
            tracks[str(track_number)] = a.text.strip()
            track_number += 1

    # Group CSV rows by track, keeping only complete rows.
    sessions = defaultdict(list)
    with open('conference_data.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # skip header row
        for session_id, session, date, interval, track, title in reader:
            if session_id and track:
                sessions[track].append((date, interval, title, session_id, session))

    def chronological_key(row):
        # Sort by the real date, then interval. A plain string sort on
        # '%d.%m.%Y' would compare the day first (01.12. before 02.11.).
        date, interval = row[0], row[1]
        try:
            return (datetime.strptime(date, DATE_FORMAT), interval)
        except (TypeError, ValueError):
            return (datetime.max, interval)  # undated rows sort last

    for track in sessions:
        sessions[track].sort(key=chronological_key)

    def track_order(track):
        # Numeric tracks in numeric order (2 before 10); others after, A-Z.
        return (0, int(track), '') if track.isdigit() else (1, 0, track)

    # Assemble the page as a list of fragments; join once at the end.
    parts = [
        '<html><head>',
        '<meta charset="UTF-8">',
        f'<title>{page_title}</title>',
        f'<link rel="stylesheet" type="text/css" href="{css_url}">',
        '<style>td { padding-right: 20px; }</style>',
        '</head><body>',
    ]
    for track in sorted(sessions.keys(), key=track_order):
        # Fall back to a generic heading when the track page lacks this number.
        parts.append(f'<h2>{tracks.get(track, f"Track {track}")}</h2>')
        parts.append('<table style="">')
        for date, interval, title, session_id, session in sessions[track]:
            parts.append(
                f'<tr><td>{date}</td><td>{interval}</td><td>{session}</td>'
                f'<td><a href="{program_url}#session:{session_id}">{title}</a></td></tr>')
        parts.append('</table>')
    parts.append('</body></html>')

    with open('sessions_by_track.html', 'w', encoding='utf-8') as file:
        file.write(''.join(parts))


if __name__ == "__main__":
    download_session_data(url)
    create_track_overview(track_url, url, page_title=page_title, css_url=css_url)
#%%