Skip to content

Instantly share code, notes, and snippets.

@justyn-clark
Last active October 24, 2025 02:25
Show Gist options
  • Select an option

  • Save justyn-clark/f1b499b3722f0c03e8939d74061e88a7 to your computer and use it in GitHub Desktop.

Select an option

Save justyn-clark/f1b499b3722f0c03e8939d74061e88a7 to your computer and use it in GitHub Desktop.
import csv
import json
import mailbox
import os
from datetime import datetime
from email.header import decode_header
from email.utils import parsedate_tz, mktime_tz
import pandas as pd
from bs4 import BeautifulSoup
def decode_text(text):
    """Decode an RFC 2047 encoded header value into a plain string.

    Args:
        text: Raw header value. ``str`` and ``bytes`` values are decoded;
            any other type (including ``None``) yields an empty string.

    Returns:
        str: The header with every fragment reported by ``decode_header``
        decoded and concatenated in order. Undecodable bytes are replaced
        rather than raising.
    """
    if not isinstance(text, (str, bytes)):
        return ''
    if isinstance(text, bytes):
        # decode_header() works on text, so convert raw bytes up front.
        text = text.decode('utf-8', errors='replace')

    pieces = []
    # A header may mix plain text with several encoded words, each with its
    # own charset, so every fragment has to be decoded, not just the first.
    for fragment, charset in decode_header(text):
        if isinstance(fragment, str):
            # Headers without any encoded word are handed back unchanged.
            pieces.append(fragment)
            continue
        # Fragments without a charset are the unencoded runs of the header;
        # CPython's decode_header() builds them with raw-unicode-escape.
        codec = charset or 'raw-unicode-escape'
        try:
            pieces.append(fragment.decode(codec, errors='replace'))
        except LookupError:
            # Unknown or misspelled charset label: fall back instead of
            # aborting the whole export.
            pieces.append(fragment.decode('utf-8', errors='replace'))
    return ''.join(pieces)
def extract_email_info(msg):
    """Collect header fields and labelled HTML values from one message.

    Args:
        msg: A message object as yielded by ``mailbox.mbox`` (anything with
            the ``email.message.Message`` interface used below).

    Returns:
        dict: Keys ``'subject'``, ``'date'``, ``'from'``, ``'to'``,
        ``'Description'`` and ``'Style'``. ``'date'`` is an ISO-8601 string
        built from a naive local-time ``datetime``, or ``None`` when the
        Date header is missing or unusable. ``'Deadline'`` is added only
        when a multipart message contains a ``text/html`` part.
    """
    # Labels looked up in the HTML body as "<strong>Label:</strong>".
    html_labels = ('Description', 'Style')

    email_info = {
        'subject': decode_text(msg['subject']),
        'date': None,
        'from': decode_text(msg['from']),
        'to': decode_text(msg['to']),
        'Description': '',
        'Style': '',
    }

    def decode_body(part):
        """Return the part's payload as text, or '' when there is none."""
        raw = part.get_payload(decode=True)
        if not raw:
            return ''
        # Honour the charset declared on the part; UTF-8 is only a fallback.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return raw.decode(charset, errors='replace')
        except LookupError:
            # Unknown charset label in the Content-Type header.
            return raw.decode('utf-8', errors='replace')

    def extract_text(soup, label):
        """Return the text that accompanies ``<strong>label</strong>``."""
        element = soup.find('strong', string=label)
        parent = element.find_parent() if element else None
        if not parent:
            return ''
        value_text = parent.get_text().replace(label, '').strip()
        if label == 'Deadline':
            # This label is searched without its colon, so drop the
            # separator that is left in front of the value.
            value_text = value_text.lstrip(':').strip()
        return value_text

    raw_date = msg['date']
    date_tuple = parsedate_tz(raw_date) if isinstance(raw_date, str) else None
    if date_tuple:
        try:
            email_info['date'] = datetime.fromtimestamp(
                mktime_tz(date_tuple)
            ).isoformat()
        except (OverflowError, ValueError, OSError):
            # A syntactically valid but out-of-range date must not abort
            # the export; treat it like a missing date.
            email_info['date'] = None

    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() != 'text/html':
                continue
            # Parse once per part and reuse the tree for every label.
            soup = BeautifulSoup(decode_body(part), 'html.parser')
            for label in html_labels:
                email_info[label] = extract_text(soup, label + ':')
            email_info['Deadline'] = extract_text(soup, 'Deadline')
    else:
        # Single-part bodies are scanned as HTML whatever their content
        # type is; 'Deadline' is not extracted on this path.
        body = decode_body(msg)
        if body:
            soup = BeautifulSoup(body, 'html.parser')
            for label in html_labels:
                email_info[label] = extract_text(soup, label + ':')

    return email_info
def main(mbox_path='/Users/<user>/Documents/DOWNLOADED.mbox',
         dest_folder='/Users/<user>/Documents/extracted-emails'):
    """Export every message of an mbox file to ``emails.json`` and ``emails.csv``.

    Messages are ordered newest first by their ``'date'`` value, which is
    expected in ``%Y-%m-%dT%H:%M:%S`` form; records whose date is missing
    or does not match that form are placed last.

    Args:
        mbox_path: Path of the mbox file to read.
        dest_folder: Directory that receives the two output files; it is
            created when it does not exist.
    """
    date_format = '%Y-%m-%dT%H:%M:%S'
    # (CSV header, record key) pairs. Rows are built from these keys
    # explicitly so an extra key on a record cannot shift the columns.
    csv_fields = (
        ('Subject', 'subject'),
        ('Date', 'date'),
        ('From', 'from'),
        ('To', 'to'),
        ('Description', 'Description'),
        ('Style', 'Style'),
    )

    def newest_first_key(record):
        """Sort key: dated records by timestamp, undated ones below them."""
        try:
            return (1, datetime.strptime(record.get('date'), date_format))
        except (TypeError, ValueError):
            return (0, datetime.min)

    os.makedirs(dest_folder, exist_ok=True)

    mbox = mailbox.mbox(mbox_path)
    try:
        records = [extract_email_info(message) for message in mbox]
    finally:
        # Release the file handle even when a message fails to parse.
        mbox.close()

    # With reverse=True the dated records come out newest first and the
    # undated ones (key 0) end up at the bottom.
    emails_json_sorted = sorted(records, key=newest_first_key, reverse=True)

    json_file_path = os.path.join(dest_folder, 'emails.json')
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(emails_json_sorted, f, ensure_ascii=False, indent=4)

    csv_file_path = os.path.join(dest_folder, 'emails.csv')
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([header for header, _ in csv_fields])
        writer.writerows(
            [record.get(key, '') for _, key in csv_fields]
            for record in emails_json_sorted
        )
# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment