justyn-clark · October 24, 2025 02:25 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/Google_Takeout_Email_Extractor.py b/Google_Takeout_Email_Extractor.py
@@ -28,14 +28,6 @@ def extract_email_info(msg):
         'to': decode_text(msg['to']),
         'Description': '',
         'Style': '',
-        'Era': '',
-        'Tempo': '',
-        'Format': '',
-        'Rights': '',
-        'Term': '',
-        'Fee Available': '',
-        'Submission Guidelines': '',
-        'Deadline': ''
     }
 
     def extract_text(content, label):

diff --git a/Google Takeout Email Extractor → Google_Takeout_Email_Extractor.py b/Google Takeout Email Extractor → Google_Takeout_Email_Extractor.py
diff --git a/Google Takeout Email Extractor b/Google Takeout Email Extractor
@@ -0,0 +1,134 @@
+import csv
+import json
+import mailbox
+import os
+from datetime import datetime
+from email.header import decode_header
+from email.utils import parsedate_tz, mktime_tz
+
+import pandas as pd
+from bs4 import BeautifulSoup
+
+
+def decode_text(text):
+    """Decode an RFC 2047 encoded header value into a plain string.
+
+    Args:
+        text: Raw header value. ``str``, ``bytes`` and
+            ``email.header.Header`` are accepted; anything else
+            (including ``None``) yields an empty string.
+
+    Returns:
+        The decoded header, with every segment reported by
+        ``decode_header`` joined in order. Bytes that cannot be decoded
+        are replaced with U+FFFD instead of raising.
+    """
+    from email.errors import HeaderParseError
+    from email.header import Header
+
+    if isinstance(text, Header):
+        # The legacy email policy returns a Header object for headers
+        # containing raw 8-bit bytes; str() renders it as readable text.
+        return str(text)
+    if isinstance(text, bytes):
+        # decode_header() only works on str/Header input.
+        text = text.decode('utf-8', errors='replace')
+    if not isinstance(text, str):
+        return ''
+
+    try:
+        segments = decode_header(text)
+    except HeaderParseError:
+        # Malformed encoded-word: keep the raw text rather than abort.
+        return text
+
+    pieces = []
+    for chunk, charset in segments:
+        if isinstance(chunk, str):
+            # Headers without any encoded-word come back as str already.
+            pieces.append(chunk)
+            continue
+        try:
+            # Un-encoded segments have no charset; latin-1 never fails.
+            pieces.append(chunk.decode(charset or 'latin-1', errors='replace'))
+        except LookupError:
+            # Charset name unknown to Python's codec registry.
+            pieces.append(chunk.decode('utf-8', errors='replace'))
+    return ''.join(pieces)
+
+
+def extract_email_info(msg):
+    """Collect header fields and labelled HTML values from one message.
+
+    Args:
+        msg: An ``email.message.Message`` (for example one yielded by
+            iterating a ``mailbox.mbox``).
+
+    Returns:
+        A dict with the keys ``subject``, ``date``, ``from``, ``to``
+        followed by the labelled fields (``Description`` ... ``Deadline``).
+        ``date`` is a naive local-time ISO 8601 string, or ``None`` when
+        the Date header is missing or cannot be converted. Labelled fields
+        that are not found stay as empty strings.
+    """
+    labelled_fields = (
+        'Description', 'Style', 'Era', 'Tempo', 'Format', 'Rights', 'Term',
+        'Fee Available', 'Submission Guidelines', 'Deadline',
+    )
+
+    # Key order is significant: main() writes the values in this order.
+    email_info = {
+        'subject': decode_text(msg['subject']),
+        'date': None,
+        'from': decode_text(msg['from']),
+        'to': decode_text(msg['to']),
+    }
+    email_info.update(dict.fromkeys(labelled_fields, ''))
+
+    def extract_text(soup, label):
+        """Return the text that follows a ``<strong>label</strong>`` tag."""
+        element = soup.find('strong', string=label)
+        if not element:
+            return ''
+        parent = element.find_parent()
+        if not parent:
+            return ''
+        value_text = parent.get_text().replace(label, '').strip()
+        if label == 'Deadline':
+            # 'Deadline' is matched without its colon, so drop it here.
+            value_text = value_text.lstrip(':').strip()
+        return value_text
+
+    def decode_part(part):
+        """Return the part's body as text, or '' when it has no payload."""
+        payload = part.get_payload(decode=True)
+        if not payload:
+            return ''
+        charset = part.get_content_charset() or 'utf-8'
+        try:
+            return payload.decode(charset, errors='replace')
+        except LookupError:
+            # Declared charset is not a codec Python knows.
+            return payload.decode('utf-8', errors='replace')
+
+    def fill_fields(html_content):
+        """Parse the HTML once and copy every label found into the dict."""
+        if not html_content:
+            return
+        soup = BeautifulSoup(html_content, 'html.parser')
+        for field in labelled_fields:
+            label = field if field == 'Deadline' else field + ':'
+            value = extract_text(soup, label)
+            if value:
+                # Never overwrite a value found in an earlier HTML part
+                # with an empty result from a later one.
+                email_info[field] = value
+
+    raw_date = msg['date']
+    date_tuple = parsedate_tz(str(raw_date)) if raw_date else None
+    if date_tuple:
+        try:
+            email_info['date'] = datetime.fromtimestamp(
+                mktime_tz(date_tuple)
+            ).isoformat()
+        except (ValueError, OverflowError, OSError):
+            # Out-of-range date in the header: treat it as missing.
+            email_info['date'] = None
+
+    if msg.is_multipart():
+        for part in msg.walk():
+            if part.get_content_type() == 'text/html':
+                fill_fields(decode_part(part))
+    else:
+        # Single-part messages are scanned for every label too,
+        # including 'Deadline'.
+        fill_fields(decode_part(msg))
+
+    return email_info
+
+
+def main(mbox_path='/Users/<user>/Documents/DOWNLOADED.mbox',
+         dest_folder='/Users/<user>/Documents/extracted-emails'):
+    """Export every message of an mbox file to ``emails.json``/``emails.csv``.
+
+    Messages are written newest first; messages without a usable date
+    come last.
+
+    Args:
+        mbox_path: Path of the mbox file to read.
+        dest_folder: Directory that receives the two output files. It is
+            created when missing.
+
+    Raises:
+        mailbox.NoSuchMailboxError: If ``mbox_path`` does not exist.
+    """
+    # create=False: a mistyped path should fail loudly instead of silently
+    # creating an empty mailbox file.
+    mbox = mailbox.mbox(mbox_path, create=False)
+    try:
+        emails = [extract_email_info(message) for message in mbox]
+    finally:
+        mbox.close()
+
+    os.makedirs(dest_folder, exist_ok=True)
+
+    # Dates are fixed-width ISO strings, so text order is chronological
+    # order. Missing dates sort as '' and therefore end up last.
+    emails_sorted = sorted(
+        emails, key=lambda info: info['date'] or '', reverse=True
+    )
+
+    json_file_path = os.path.join(dest_folder, 'emails.json')
+    with open(json_file_path, 'w', encoding='utf-8') as f:
+        json.dump(emails_sorted, f, ensure_ascii=False, indent=4)
+
+    # Build the header from the actual keys so it always lines up with the
+    # row values; the first four keep their capitalised titles.
+    header_titles = {
+        'subject': 'Subject', 'date': 'Date', 'from': 'From', 'to': 'To',
+    }
+    if emails_sorted:
+        columns = list(emails_sorted[0])
+    else:
+        columns = ['subject', 'date', 'from', 'to', 'Description', 'Style']
+
+    csv_file_path = os.path.join(dest_folder, 'emails.csv')
+    with open(csv_file_path, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow([header_titles.get(col, col) for col in columns])
+        writer.writerows(
+            [info.get(col, '') for col in columns] for info in emails_sorted
+        )
+
+
+# Run the extraction only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
No results found