Last active
October 29, 2025 07:45
-
-
Save andyg2/3dc237826268f8046fb4fd6bda40a3e8 to your computer and use it in GitHub Desktop.
Converts a .mbox to separate .eml files and attempts to create a useful folder structure based on labels.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import mailbox | |
| import os | |
| import argparse | |
| import re | |
| from email.utils import parsedate_to_datetime | |
| from email.header import decode_header | |
def decode_subject(message):
    """
    Return the message's Subject header as a single readable string.

    The header is split into its (possibly encoded) chunks, each chunk is
    turned into text, and the pieces are concatenated. A missing subject
    yields "No Subject".
    """
    raw_subject = message.get('Subject', 'No Subject')
    if raw_subject is None:
        return "No Subject"

    def _chunk_to_text(chunk, charset):
        # Chunks that are already text are passed through untouched.
        if not isinstance(chunk, bytes):
            return chunk
        # Prefer the declared charset (UTF-8 when none is given); an unknown
        # charset name falls back to latin-1, which can decode any byte.
        try:
            return chunk.decode(charset or 'utf-8', errors='replace')
        except (UnicodeDecodeError, LookupError):
            return chunk.decode('latin-1', errors='replace')

    return "".join(
        _chunk_to_text(chunk, charset)
        for chunk, charset in decode_header(raw_subject)
    )
def sanitize_filename(filename):
    """
    Make a string safe to use as a file or folder name on Windows and
    Unix-like systems.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space; the characters < > : " / \\ | ? * and all remaining ASCII
    control characters (0x00-0x1F) are each replaced with an underscore;
    leading and trailing whitespace is stripped.

    Returns the sanitized string (possibly empty).
    """
    # Replace all whitespace sequences with a single space first, so that
    # newlines and tabs become spaces rather than underscores.
    filename = re.sub(r'\s+', ' ', filename)
    # Then replace the invalid characters. Control characters are included
    # because an embedded NUL makes open() raise ValueError, and the rest of
    # the 0x01-0x1F range is not allowed in Windows file names.
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename).strip()
def get_folder_from_message(message):
    """
    Determine a folder name for a message from its headers.

    Checks the Thunderbird header ('X-Mozilla-Folder') first, then the Gmail
    header ('X-Gmail-Labels'), from which the first non-empty comma-separated
    label is used. Falls back to 'Inbox' when neither header yields a usable
    name.

    Returns the folder name as a str; it is not sanitized for file system use.
    """
    folder = message.get('X-Mozilla-Folder')
    if folder is not None:
        # Header values are not guaranteed to be plain str (the email package
        # may hand back a Header object), so normalise before using it.
        folder = str(folder)
        if folder.strip():
            return folder

    labels = message.get('X-Gmail-Labels')
    if labels is not None:
        # Skip empty entries such as the one produced by a leading comma, so
        # the result is never an empty folder name.
        for label in str(labels).split(','):
            label = label.strip()
            if label:
                return label

    return "Inbox"
def _date_prefix(message, index):
    """Return a sortable timestamp prefix from the Date header, or a numbered fallback."""
    fallback = f"message_{index:05d}"  # Padded number
    date_str = message.get('Date', '')
    if not date_str:
        return fallback
    try:
        return parsedate_to_datetime(date_str).strftime('%Y-%m-%d_%H-%M-%S')
    except Exception:
        # Date headers in real-world mail are frequently malformed; any
        # parsing or formatting failure deliberately degrades to the
        # numbered prefix instead of aborting the export.
        return fallback


def _reserve_unique_path(folder_path, stem, used_paths):
    """Return a not-yet-used .eml path for `stem` in `folder_path` and record it in `used_paths`."""
    candidate = os.path.join(folder_path, f"{stem}.eml")
    counter = 1
    # Paths are compared case-folded so that names differing only in case do
    # not overwrite each other on case-insensitive file systems.
    while candidate.casefold() in used_paths:
        candidate = os.path.join(folder_path, f"{stem}_{counter}.eml")
        counter += 1
    used_paths.add(candidate.casefold())
    return candidate


def _write_eml(message, eml_path, fallback_path, number):
    """Write `message` to `eml_path`, retrying at `fallback_path`; return True on success."""
    # Serialize before opening any file so a message that cannot be
    # serialized does not leave an empty .eml file behind.
    try:
        payload = message.as_bytes()
    except Exception as e:
        print(f"An unexpected error occurred writing message {number}: {e}")
        return False

    try:
        with open(eml_path, 'wb') as eml_file:
            eml_file.write(payload)
        return True
    except (OSError, ValueError) as e:
        # OSError covers file system errors like "File name too long";
        # ValueError is what open() raises for a path with an embedded NUL.
        print(f"Could not write message {number} to {eml_path}. Error: {e}")

    # Try writing with a shorter, generic name
    try:
        with open(fallback_path, 'wb') as eml_file:
            eml_file.write(payload)
    except (OSError, ValueError) as fallback_e:
        print(f"Fallback write also failed for message {number}. Error: {fallback_e}")
        return False
    print(f"Successfully wrote message {number} to fallback file: {fallback_path}")
    return True


def mbox_to_eml(mbox_file, output_dir_base):
    """
    Export every message of an mbox file to its own .eml file.

    Messages are placed in a sub-folder of `output_dir_base` named after the
    folder/label reported by get_folder_from_message(). Each file is named
    "<date>_<subject>.eml"; a numeric suffix is appended when that name is
    already taken during this run. If a file cannot be written under its
    descriptive name, a short generic fallback name is tried instead.
    Progress and errors are reported with print().

    Args:
        mbox_file: Path to the input mbox file. It must already exist.
        output_dir_base: Base output directory; created if missing.

    Raises:
        mailbox.NoSuchMailboxError: if `mbox_file` does not exist.
    """
    # create=False: a mistyped path must fail loudly instead of silently
    # creating an empty mbox file and "converting" zero messages.
    mbox = mailbox.mbox(mbox_file, create=False)
    used_paths = set()
    processed = 0  # Stays 0 for an empty mailbox, so the summary never fails.
    failed = 0

    try:
        os.makedirs(output_dir_base, exist_ok=True)

        print(f"Starting conversion of {mbox_file}...")
        total_messages = len(mbox)
        print(f"Found {total_messages} messages to process.")

        for i, message in enumerate(mbox):
            processed = i + 1

            # 1. Determine the folder (sanitized and truncated for safety)
            folder_name = str(get_folder_from_message(message))
            sanitized_folder_name = sanitize_filename(folder_name)[:150].strip()
            # Labels come from untrusted mail: '.' and '..' would resolve to
            # the base directory or its parent, so use the default instead.
            if sanitized_folder_name in ('', '.', '..'):
                sanitized_folder_name = 'Inbox'
            output_folder_path = os.path.join(output_dir_base, sanitized_folder_name)
            if not os.path.exists(output_folder_path):
                os.makedirs(output_folder_path, exist_ok=True)
                print(f"Created folder: {output_folder_path}")

            # 2. Build a descriptive, collision-free filename
            date_prefix = _date_prefix(message, processed)
            sanitized_subject = sanitize_filename(decode_subject(message))[:100]
            eml_path = _reserve_unique_path(
                output_folder_path, f"{date_prefix}_{sanitized_subject}", used_paths
            )
            fallback_path = os.path.join(
                output_folder_path, f"{date_prefix}_fallback_{processed:05d}.eml"
            )

            # 3. Write the eml content
            if not _write_eml(message, eml_path, fallback_path, processed):
                failed += 1

            if processed % 100 == 0:
                print(f"Processed {processed} / {total_messages} messages...")
    finally:
        # Release the file handle held by the mailbox even if the export aborts.
        mbox.close()

    print(f"\nConversion complete. Total messages processed: {processed}")
    if failed:
        print(f"{failed} message(s) could not be written.")
if __name__ == "__main__":
    # Command-line entry point: both options are mandatory string paths.
    cli = argparse.ArgumentParser(
        description="Convert a large mbox file to individual .eml files, preserving folder structure."
    )
    option_specs = (
        (('--file', '-f'),
         'Path to the input mbox file (e.g., "Takeout/Mail/All mail.mbox")'),
        (('--output_dir', '-o'),
         'Path to the base output directory (e.g., "./eml_export")'),
    )
    for flags, help_text in option_specs:
        cli.add_argument(*flags, type=str, required=True, help=help_text)

    options = cli.parse_args()
    mbox_to_eml(options.file, options.output_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
No pip dependencies; the script uses only the Python standard library.
Tested with Python 3.10, 3.11, 3.12
Example usage:
Google Takeout
Works with large mbox files e.g. 10GB