Skip to content

Instantly share code, notes, and snippets.

@justyn-clark
Last active October 24, 2025 02:25
Show Gist options
  • Select an option

  • Save justyn-clark/f1b499b3722f0c03e8939d74061e88a7 to your computer and use it in GitHub Desktop.

Select an option

Save justyn-clark/f1b499b3722f0c03e8939d74061e88a7 to your computer and use it in GitHub Desktop.

Revisions

  1. justyn-clark revised this gist Jul 9, 2024. 1 changed file with 0 additions and 8 deletions.
    8 changes: 0 additions & 8 deletions Google_Takeout_Email_Extractor.py
    Original file line number Diff line number Diff line change
    @@ -28,14 +28,6 @@ def extract_email_info(msg):
    'to': decode_text(msg['to']),
    'Description': '',
    'Style': '',
    'Era': '',
    'Tempo': '',
    'Format': '',
    'Rights': '',
    'Term': '',
    'Fee Available': '',
    'Submission Guidelines': '',
    'Deadline': ''
    }

    def extract_text(content, label):
  2. justyn-clark renamed this gist Jul 9, 2024. 1 changed file with 0 additions and 0 deletions.
  3. justyn-clark created this gist Jul 9, 2024.
    134 changes: 134 additions & 0 deletions Google Takeout Email Extractor
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,134 @@
    import csv
    import json
    import mailbox
    import os
    from datetime import datetime
    from email.header import decode_header
    from email.utils import parsedate_tz, mktime_tz

    import pandas as pd
    from bs4 import BeautifulSoup


    def decode_text(text):
        """Decode an RFC 2047 encoded header value into a plain string.

        Every chunk returned by ``decode_header`` is decoded and joined, so
        headers that mix plain and encoded words (``Re: =?utf-8?q?...?=``) or
        contain several encoded words are returned in full rather than being
        cut off after the first chunk.

        Args:
            text: The raw header value. ``str``, ``bytes`` and
                ``email.header.Header`` values are decoded; anything else
                (including ``None`` for a missing header) yields ``''``.

        Returns:
            The decoded header as ``str``. Never ``bytes``, and never raises
            for an unknown charset or undecodable bytes (those are replaced).
        """
        # Imported locally: the file-level import only brings in decode_header.
        from email.header import Header

        if isinstance(text, bytes):
            # decode_header() applies a str regex and rejects bytes input.
            text = text.decode('utf-8', errors='replace')
        elif not isinstance(text, (str, Header)):
            return ''

        pieces = []
        for chunk, charset in decode_header(text):
            if isinstance(chunk, str):
                # Header contained no encoded words; returned unchanged.
                pieces.append(chunk)
                continue
            try:
                # Chunks without a charset are the unencoded runs of a mixed
                # header; latin-1 maps each byte back to the same code point.
                pieces.append(chunk.decode(charset or 'latin-1', errors='replace'))
            except LookupError:
                # Charset name Python does not know (e.g. "unknown-8bit").
                pieces.append(chunk.decode('utf-8', errors='replace'))
        return ''.join(pieces)


    def extract_email_info(msg):
        """Collect header fields and labelled body fields from one message.

        The subject/from/to headers are decoded with ``decode_text``; the Date
        header is converted to a naive ISO-8601 string in the machine's local
        time zone (``None`` when it is missing or cannot be parsed). The body
        is parsed as HTML and, for each known label, the text of the element
        enclosing the matching ``<strong>`` tag is stored with the label
        removed.

        Args:
            msg: An ``email.message.Message`` (e.g. an item yielded by
                ``mailbox.mbox``).

        Returns:
            dict with keys ``subject``, ``date``, ``from``, ``to`` followed by
            the labelled fields, in that insertion order. Labels that are not
            found are left as ``''``.
        """
        email_info = {
            'subject': decode_text(msg['subject']),
            'date': None,
            'from': decode_text(msg['from']),
            'to': decode_text(msg['to']),
            'Description': '',
            'Style': '',
            'Era': '',
            'Tempo': '',
            'Format': '',
            'Rights': '',
            'Term': '',
            'Fee Available': '',
            'Submission Guidelines': '',
            'Deadline': '',
        }
        # Every body field except Deadline is looked up as "<Label>:" inside
        # the <strong> tag; Deadline is looked up without the colon.
        colon_fields = [
            key for key in email_info
            if key not in ('subject', 'date', 'from', 'to', 'Deadline')
        ]

        def extract_text(soup, label):
            """Return the text next to ``<strong>label</strong>``, or ''."""
            element = soup.find('strong', string=label)
            if element is None:
                return ''
            parent = element.find_parent()
            if parent is None:
                return ''
            value_text = parent.get_text().replace(label, '').strip()
            if label == 'Deadline':
                # The colon sits outside the <strong> tag for this label.
                value_text = value_text.lstrip(':').strip()
            return value_text

        def decode_body(part):
            """Return the part's payload as text, honouring its charset."""
            payload = part.get_payload(decode=True)
            if not payload:
                return ''
            charset = part.get_content_charset() or 'utf-8'
            try:
                return payload.decode(charset, errors='replace')
            except LookupError:
                # Declared charset is unknown to Python; fall back to UTF-8.
                return payload.decode('utf-8', errors='replace')

        def fill_fields(html_content):
            """Parse the HTML once and populate every labelled field."""
            soup = BeautifulSoup(html_content, 'html.parser')
            for field in colon_fields:
                email_info[field] = extract_text(soup, field + ':')
            email_info['Deadline'] = extract_text(soup, 'Deadline')

        raw_date = msg['date']
        # str() because a header with raw 8-bit bytes may come back as a
        # Header object rather than a plain string.
        date_tuple = parsedate_tz(str(raw_date)) if raw_date is not None else None
        if date_tuple:
            try:
                email_info['date'] = datetime.fromtimestamp(
                    mktime_tz(date_tuple)
                ).isoformat()
            except (OverflowError, ValueError, OSError):
                # Out-of-range timestamp: treat as unparseable, keep None.
                email_info['date'] = None

        if msg.is_multipart():
            # As before, each text/html part overwrites the fields, so the
            # last HTML part in the message wins.
            for part in msg.walk():
                if part.get_content_type() == 'text/html':
                    fill_fields(decode_body(part))
        else:
            # Single-part bodies are parsed as HTML regardless of their
            # declared content type; Deadline is now extracted here too.
            html_content = decode_body(msg)
            if html_content:
                fill_fields(html_content)

        return email_info


    def main(mbox_path='/Users/<user>/Documents/DOWNLOADED.mbox',
             dest_folder='/Users/<user>/Documents/extracted-emails'):
        """Export every message of an mbox file to ``emails.json`` and ``emails.csv``.

        Messages are sorted newest first by their parsed date; messages without
        a parseable date are placed at the end with ``date`` set to ``None``
        (``null`` in JSON, an empty cell in CSV).

        Args:
            mbox_path: Path of the mbox file to read. Must already exist.
            dest_folder: Directory for the output files; created if missing.

        Raises:
            mailbox.NoSuchMailboxError: If ``mbox_path`` does not exist.
        """
        # create=False: a mistyped path should fail loudly instead of silently
        # creating a new, empty mailbox file.
        mbox = mailbox.mbox(mbox_path, create=False)
        try:
            emails = [extract_email_info(message) for message in mbox]
        finally:
            mbox.close()

        os.makedirs(dest_folder, exist_ok=True)

        # Dates are fixed-width ISO-8601 strings, so plain string ordering is
        # chronological ordering.
        dated = sorted(
            (email for email in emails if email['date']),
            key=lambda email: email['date'],
            reverse=True,
        )
        undated = [email for email in emails if not email['date']]
        emails_sorted = dated + undated

        json_file_path = os.path.join(dest_folder, 'emails.json')
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(emails_sorted, f, ensure_ascii=False, indent=4)

        # Derive the header from the record keys so it always has exactly one
        # name per column; only the first letter is upper-cased, which keeps
        # the existing 'Subject', 'Date', 'From', 'To' column names.
        if emails_sorted:
            fieldnames = list(emails_sorted[0])
        else:
            fieldnames = ['subject', 'date', 'from', 'to', 'Description', 'Style']
        header = [name[:1].upper() + name[1:] for name in fieldnames]

        csv_file_path = os.path.join(dest_folder, 'emails.csv')
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(
                [email[name] for name in fieldnames] for email in emails_sorted
            )


    # Run the export only when this file is executed as a script, not when it
    # is imported as a module.
    if __name__ == "__main__":
    main()