Last active
October 24, 2025 02:25
-
-
Save justyn-clark/f1b499b3722f0c03e8939d74061e88a7 to your computer and use it in GitHub Desktop.
Revisions
-
justyn-clark revised this gist
Jul 9, 2024 · 1 changed file with 0 additions and 8 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -28,14 +28,6 @@ def extract_email_info(msg): 'to': decode_text(msg['to']), 'Description': '', 'Style': '', } def extract_text(content, label): -
justyn-clark renamed this gist
Jul 9, 2024 · 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing.
File renamed without changes. -
justyn-clark created this gist
Jul 9, 2024 · There are no files selected for viewing.
"""Export the messages of an mbox archive to JSON and CSV.

For every message the script records the decoded ``Subject``/``From``/``To``
headers, the ``Date`` header as an ISO timestamp, and a set of labelled values
(``Description:``, ``Style:``, ... ``Deadline``) looked up in ``<strong>``
elements of the message's HTML body.  Records are written newest first to
``emails.json`` and ``emails.csv`` inside the destination folder.
"""
import csv
import json
import mailbox
import os
from datetime import datetime
from email.errors import HeaderParseError
from email.header import Header, decode_header
from email.utils import parsedate_tz, mktime_tz

import pandas as pd

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Keep the module importable (header decoding, sorting, export) without
    # bs4; _extract_labelled_fields raises a clear error if a body must be
    # parsed while the package is missing.
    BeautifulSoup = None

DEFAULT_MBOX_PATH = '/Users/<user>/Documents/DOWNLOADED.mbox'
DEFAULT_DEST_FOLDER = '/Users/<user>/Documents/extracted-emails'

# Timestamp layout used in the exported files.
DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'

# Record keys taken from message headers, in output order.
HEADER_FIELDS = ('subject', 'date', 'from', 'to')

# Record keys looked up as "<strong>Label:</strong>" in the HTML body.
# 'Deadline' is matched without the trailing colon (see
# _extract_labelled_fields).
LABEL_FIELDS = (
    'Description',
    'Style',
    'Era',
    'Tempo',
    'Format',
    'Rights',
    'Term',
    'Fee Available',
    'Submission Guidelines',
    'Deadline',
)

# CSV column titles, one per record key, so the header row always has the
# same width as the data rows.
CSV_HEADER = ('Subject', 'Date', 'From', 'To') + LABEL_FIELDS


def _decode_bytes(data, charset):
    """Decode *data* with *charset*, never raising on bad or unknown input."""
    if not charset:
        # Unencoded fragments are plain ASCII in well-formed headers; latin-1
        # maps every byte to a character, so malformed input cannot fail here.
        return data.decode('latin-1')
    try:
        return data.decode(charset, errors='replace')
    except LookupError:
        # Charset name Python does not know (e.g. 'unknown-8bit').
        return data.decode('utf-8', errors='replace')


def decode_text(text):
    """Return a header value as a plain ``str``.

    RFC 2047 encoded words are decoded and all fragments of the header are
    joined, not only the first one.

    Args:
        text: A header value as ``str``, ``bytes`` or ``email.header.Header``
            (the compat32 policy returns ``Header`` objects for headers that
            contain raw 8-bit data).  Anything else, including ``None``,
            yields ``''``.

    Returns:
        The decoded text.  Undecodable bytes become U+FFFD instead of raising.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='replace')
    elif not isinstance(text, (str, Header)):
        return ''

    try:
        fragments = decode_header(text)
    except HeaderParseError:
        # Malformed base64 in an encoded word: keep the raw text rather than
        # losing the header altogether.
        return str(text)

    pieces = []
    for value, charset in fragments:
        if isinstance(value, bytes):
            value = _decode_bytes(value, charset)
        pieces.append(value)
    return ''.join(pieces)


def _parse_date(raw_date):
    """Convert a ``Date`` header to an ISO string, or ``None`` if unusable.

    The timestamp is rendered with ``datetime.fromtimestamp``, i.e. as a naive
    datetime in the local timezone of the machine running the script.
    """
    if raw_date is None:
        return None
    try:
        date_tuple = parsedate_tz(str(raw_date))
        if not date_tuple:
            return None
        return datetime.fromtimestamp(mktime_tz(date_tuple)).isoformat()
    except (TypeError, ValueError, OverflowError, OSError):
        # Syntactically odd or out-of-range dates are treated as missing.
        return None


def _decode_payload(part):
    """Return the transfer-decoded body of *part* as text ('' when empty)."""
    payload = part.get_payload(decode=True)
    if not payload:
        return ''
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='replace')
    except LookupError:
        return payload.decode('utf-8', errors='replace')


def _find_label_value(soup, label):
    """Return the text that shares a parent with ``<strong>label</strong>``."""
    element = soup.find('strong', string=label)
    if element is None:
        return ''
    parent = element.find_parent()
    if parent is None:
        return ''
    return parent.get_text().replace(label, '').strip()


def _extract_labelled_fields(html_content):
    """Parse *html_content* once and look up every entry of LABEL_FIELDS.

    Returns:
        A dict mapping each label field to the text found ('' if absent).

    Raises:
        ImportError: if beautifulsoup4 is not installed.
    """
    if BeautifulSoup is None:
        raise ImportError(
            'beautifulsoup4 is required to parse message bodies '
            '(pip install beautifulsoup4)'
        )
    soup = BeautifulSoup(html_content, 'html.parser')
    found = {}
    for field in LABEL_FIELDS:
        if field == 'Deadline':
            # The colon sits outside the <strong> tag for this label, so match
            # the bare word and strip the colon from the remaining text.
            value = _find_label_value(soup, 'Deadline').lstrip(':').strip()
        else:
            value = _find_label_value(soup, field + ':')
        found[field] = value
    return found


def extract_email_info(msg):
    """Build the export record for one message.

    Args:
        msg: An ``email.message.Message`` (for example a message yielded by
            ``mailbox.mbox``).

    Returns:
        A dict with the keys of HEADER_FIELDS followed by LABEL_FIELDS.
        ``'date'`` is an ISO string or ``None``; every other value is a
        ``str`` ('' when nothing was found).
    """
    email_info = {
        'subject': decode_text(msg['subject']),
        'date': _parse_date(msg['date']),
        'from': decode_text(msg['from']),
        'to': decode_text(msg['to']),
    }
    email_info.update(dict.fromkeys(LABEL_FIELDS, ''))

    if msg.is_multipart():
        body_parts = [
            part for part in msg.walk()
            if part.get_content_type() == 'text/html'
        ]
    else:
        # A single-part message is scanned whatever its content type.
        body_parts = [msg]

    for part in body_parts:
        html_content = _decode_payload(part)
        if not html_content:
            continue
        for field, value in _extract_labelled_fields(html_content).items():
            # Never let an HTML part lacking a label wipe out a value that an
            # earlier part supplied.
            if value:
                email_info[field] = value
    return email_info


def _sort_newest_first(emails):
    """Return *emails* ordered by date descending, undated records last."""
    if not emails:
        return []
    frame = pd.DataFrame(emails)
    # Real datetimes are needed for a correct chronological sort.
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
    frame = frame.sort_values(by='date', ascending=False, kind='mergesort')
    frame['date'] = frame['date'].dt.strftime(DATE_FORMAT)
    records = frame.to_dict(orient='records')
    for record in records:
        # Missing dates come back from pandas as NaN, which json.dump would
        # write as the non-standard token NaN; export them as null instead.
        if not isinstance(record['date'], str):
            record['date'] = None
    return records


def main(mbox_path=DEFAULT_MBOX_PATH, dest_folder=DEFAULT_DEST_FOLDER):
    """Read *mbox_path* and write ``emails.json``/``emails.csv`` to *dest_folder*.

    Args:
        mbox_path: Path of the mbox archive to read.
        dest_folder: Output directory; created if it does not exist.

    Raises:
        mailbox.NoSuchMailboxError: if *mbox_path* does not exist.
    """
    os.makedirs(dest_folder, exist_ok=True)

    # create=False: a missing input file is an error, not an empty mailbox to
    # be created on the spot.
    mbox = mailbox.mbox(mbox_path, create=False)
    try:
        emails = [extract_email_info(message) for message in mbox]
    finally:
        mbox.close()

    records = _sort_newest_first(emails)

    json_file_path = os.path.join(dest_folder, 'emails.json')
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    columns = HEADER_FIELDS + LABEL_FIELDS
    csv_file_path = os.path.join(dest_folder, 'emails.csv')
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        writer.writerows([record[key] for key in columns] for record in records)


if __name__ == "__main__":
    main()