Last active
October 24, 2025 02:25
-
-
Save justyn-clark/f1b499b3722f0c03e8939d74061e88a7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import mailbox
import os
from datetime import datetime
from email.errors import HeaderParseError
from email.header import decode_header
from email.utils import parsedate_tz, mktime_tz

import pandas as pd
from bs4 import BeautifulSoup


def decode_text(text):
    """Decode an RFC 2047 encoded header value into a plain ``str``.

    Args:
        text: Raw header value. ``None`` and anything that is not ``str`` or
            ``bytes`` yields an empty string.

    Returns:
        The decoded header as ``str``. Every chunk reported by
        ``decode_header`` is decoded and concatenated, so headers that mix
        plain text with encoded words (or use several charsets) are returned
        in full rather than truncated to their first chunk.
    """
    if text is None or not isinstance(text, (str, bytes)):
        return ''
    if isinstance(text, bytes):
        # decode_header() matches a str pattern against its argument, so raw
        # bytes have to be turned into text first.
        text = text.decode('utf-8', errors='replace')

    try:
        chunks = decode_header(text)
    except HeaderParseError:
        # Malformed encoded word (e.g. broken base64): keep the raw text
        # instead of aborting the whole export over one bad header.
        return text

    pieces = []
    for chunk, charset in chunks:
        if isinstance(chunk, str):
            # Headers without any encoded word come back as a single str.
            pieces.append(chunk)
            continue
        try:
            # Chunks without a charset are the unencoded runs of a mixed
            # header; latin-1 maps every byte to a character and never fails.
            pieces.append(chunk.decode(charset or 'latin-1', errors='replace'))
        except LookupError:
            # Charset name unknown to Python: best-effort UTF-8.
            pieces.append(chunk.decode('utf-8', errors='replace'))
    return ''.join(pieces)
def extract_email_info(msg):
    """Build a flat record of header fields and labelled body values.

    Args:
        msg: An ``email.message.Message`` (for example an item yielded by
            iterating a ``mailbox.mbox``).

    Returns:
        dict with the keys ``subject``, ``date``, ``from``, ``to``,
        ``Description``, ``Style`` and ``Deadline``. ``date`` is an ISO 8601
        string, or ``None`` when the Date header is missing or cannot be
        converted. The body fields are empty strings when the label is not
        found. Every record carries the same keys, whether or not the
        message is multipart.
    """
    # Labels that appear in the HTML body as "<strong>Label:</strong> value".
    body_labels = ('Description', 'Style')

    email_info = {
        'subject': decode_text(msg['subject']),
        'date': None,
        'from': decode_text(msg['from']),
        'to': decode_text(msg['to']),
        'Description': '',
        'Style': '',
        'Deadline': '',
    }

    def extract_text(soup, label):
        """Return the text next to the ``<strong>`` tag whose string is *label*."""
        element = soup.find('strong', string=label)
        if element is None:
            return ''
        parent = element.find_parent()
        if parent is None:
            return ''
        value_text = parent.get_text().replace(label, '').strip()
        if label == 'Deadline':
            # The Deadline label is matched without its colon, so the colon
            # is still at the front of the remaining text.
            value_text = value_text.lstrip(':').strip()
        return value_text

    def decode_payload(part):
        """Return the part's body as text, or '' when it has no payload."""
        payload = part.get_payload(decode=True)
        if not payload:
            return ''
        # Honour the declared charset; fall back to UTF-8 when it is absent
        # or not a codec Python knows.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='replace')
        except LookupError:
            return payload.decode('utf-8', errors='replace')

    raw_date = msg['date']
    date_tuple = parsedate_tz(str(raw_date)) if raw_date is not None else None
    if date_tuple:
        try:
            # NOTE: fromtimestamp() without a tz yields naive local time, so
            # the stored value depends on the machine's timezone.
            email_info['date'] = datetime.fromtimestamp(
                mktime_tz(date_tuple)
            ).isoformat()
        except (OverflowError, ValueError, OSError):
            # Out-of-range date: treat it like an unparseable header.
            email_info['date'] = None

    if msg.is_multipart():
        html_parts = [
            part for part in msg.walk()
            if part.get_content_type() == 'text/html'
        ]
    else:
        # A single-part message is scanned whatever its content type.
        html_parts = [msg]

    for part in html_parts:
        html_content = decode_payload(part)
        if not html_content:
            continue
        # Parse once per part and reuse the tree for every label.
        soup = BeautifulSoup(html_content, 'html.parser')
        for label in body_labels:
            email_info[label] = extract_text(soup, label + ':')
        email_info['Deadline'] = extract_text(soup, 'Deadline')

    return email_info
def main(mbox_path='/Users/<user>/Documents/DOWNLOADED.mbox',
         dest_folder='/Users/<user>/Documents/extracted-emails'):
    """Export every message of an mbox file to ``emails.json`` and ``emails.csv``.

    Args:
        mbox_path: Path of the mbox file to read. It must already exist;
            ``mailbox.NoSuchMailboxError`` is raised otherwise, instead of
            silently creating an empty mailbox.
        dest_folder: Directory that receives the two output files. It is
            created when missing.

    Records are written newest first. Records without a usable date go last
    and are stored with ``null`` in JSON and an empty cell in CSV.
    """
    mbox = mailbox.mbox(mbox_path, create=False)
    try:
        records = [extract_email_info(message) for message in mbox]
    finally:
        mbox.close()

    os.makedirs(dest_folder, exist_ok=True)

    # Column order: the six historical columns first, then any further key
    # that shows up in a record (in order of first appearance), so the CSV
    # header always matches the row width.
    fieldnames = ['subject', 'date', 'from', 'to', 'Description', 'Style']
    for record in records:
        for key in record:
            if key not in fieldnames:
                fieldnames.append(key)

    # Give every row the same keys; fields a record lacks become ''.
    rows = [
        {name: record.get(name, '') for name in fieldnames}
        for record in records
    ]

    # Dates are ISO 8601 strings, which order correctly as plain text.
    # Missing dates compare as '' and therefore end up last when descending.
    rows.sort(key=lambda row: row['date'] or '', reverse=True)

    json_file_path = os.path.join(dest_folder, 'emails.json')
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, ensure_ascii=False, indent=4)

    csv_file_path = os.path.join(dest_folder, 'emails.csv')
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Header cells are the field names with an upper-cased first letter
        # ('subject' -> 'Subject'), matching the original header row.
        writer.writerow([name[:1].upper() + name[1:] for name in fieldnames])
        writer.writerows([row[name] for name in fieldnames] for row in rows)
# Script entry point: run the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment