@cboulanger
Created November 9, 2023 10:13
parse-easychair-program.py
import dateparser
import re
import requests
from bs4 import BeautifulSoup
import csv
from collections import defaultdict
from datetime import datetime

url = "https://easychair.org/smart-program/<conference_name>/program.html"
track_url = "https://easychair.org/smart-program/<conference_name>/de_tracks.html"
css_url = "https://easychair.org/smart-program/<conference_name>/program.css"
page_title = "Conference Name - Tracks and Sessions"
def download_session_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # Initialize CSV writer
    with open('conference_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["session_id", "session", "date", "interval", "track", "title"])

        # Initialize date
        date = None

        for div in soup.find_all('div'):
            # Check if div contains date
            if div.get('class') == ['date']:
                date = dateparser.parse(div.text)
                date = date.strftime('%d.%m.%Y')

            # Check if div is a session
            elif div.get('class') and 'session' in div.get('class'):
                session_id = div.find('a').get('name').replace('session:', '')
                heading = div.find('div', class_='heading').text
                # Expected heading shape: "<interval> Session <id>: Track <n>: <title>"
                matches = re.search(r'(.+)\s*Session\s*(\w+)\s*:\s*Track\s*(\w+):\s*(.+)', heading)
                if matches:
                    interval, session, track, title = matches.groups()
                    writer.writerow([session_id, session, date, interval.strip(), track, title])
def create_track_overview(track_url, program_url, page_title, css_url):
    # Download and parse the track page
    response = requests.get(track_url)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # Extract track titles, numbered in the order they appear on the page
    tracks = {}
    i = 1
    for h3 in soup.find_all('h3'):
        a = h3.find('a')
        if a:
            tracks[str(i)] = a.text.strip()
            i += 1

    # Parse the CSV file
    sessions = defaultdict(list)
    with open('conference_data.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header
        for row in reader:
            session_id, session, date, interval, track, title = row
            if session_id != "" and track != "":
                sessions[track].append((date, interval, title, session_id, session))

    # Sort sessions chronologically by date, then by interval
    # (a plain string sort would order the DD.MM.YYYY dates by day first)
    for track in sessions:
        sessions[track].sort(
            key=lambda s: (datetime.strptime(s[0], '%d.%m.%Y') if s[0] else datetime.min, s[1])
        )

    # Create the HTML page
    html = '<html><head>'
    html += '<meta charset="UTF-8">'
    html += f'<title>{page_title}</title>'
    html += f'<link rel="stylesheet" type="text/css" href="{css_url}">'
    html += '<style>td { padding-right: 20px; }</style>'
    html += '</head><body>'

    for track in sorted(sessions.keys()):
        session_list = sessions[track]
        # Fall back to the track number if no title was found on the track page
        track_title = tracks.get(track, f"Track {track}")
        html += f'<h2>{track_title}</h2>'
        html += '<table>'
        for date, interval, title, session_id, session in session_list:
            html += f'<tr><td>{date}</td><td>{interval}</td><td>{session}</td><td><a href="{program_url}#session:{session_id}">{title}</a></td></tr>'
        html += '</table>'
    html += '</body></html>'

    # Write the HTML to a file
    with open('sessions_by_track.html', 'w', encoding='utf-8') as file:
        file.write(html)



download_session_data(url)
create_track_overview(track_url, url, page_title=page_title, css_url=css_url)
#%%
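
The script depends on the dateparser, requests and beautifulsoup4 packages. The regular expression in download_session_data assumes session headings shaped like "<interval> Session <id>: Track <n>: <title>"; the snippet below is a minimal sketch of how that pattern splits such a heading, using a made-up example string (the actual EasyChair markup may differ):

import re

# Hypothetical heading text, for illustration only
heading = "09:00-10:30 Session 2B: Track 3: Digital Research Infrastructures"
matches = re.search(r'(.+)\s*Session\s*(\w+)\s*:\s*Track\s*(\w+):\s*(.+)', heading)
if matches:
    interval, session, track, title = matches.groups()
    print(interval.strip(), session, track, title, sep=" | ")
    # prints: 09:00-10:30 | 2B | 3 | Digital Research Infrastructures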