Skip to content

Instantly share code, notes, and snippets.

@andyg2
Last active October 29, 2025 07:45
Show Gist options
  • Save andyg2/3dc237826268f8046fb4fd6bda40a3e8 to your computer and use it in GitHub Desktop.
Save andyg2/3dc237826268f8046fb4fd6bda40a3e8 to your computer and use it in GitHub Desktop.
Converts a .mbox to separate .eml files and attempts to create a useful folder structure based on labels.
import argparse
import mailbox
import os
import re
from email.errors import HeaderParseError
from email.header import decode_header
from email.utils import parsedate_to_datetime
def decode_subject(message):
    """
    Decodes the email subject into a clean string, handling RFC 2047
    encoded-words and headers that mix several charsets.

    Args:
        message: Any object exposing ``get(name, default)`` for its headers.

    Returns:
        The decoded subject; "No Subject" when the header is missing; or the
        raw header text when it contains a malformed encoded-word.
    """
    subject_header = message.get('Subject', 'No Subject')
    if subject_header is None:
        return "No Subject"

    try:
        decoded_parts = decode_header(subject_header)
    except HeaderParseError:
        # One malformed base64 encoded-word must not abort the conversion of
        # the whole mailbox; keep the undecoded text instead.
        return str(subject_header)

    subject_str = []
    for part, charset in decoded_parts:
        if isinstance(part, bytes):
            # If a charset is specified, use it; otherwise assume 'utf-8'.
            # Unknown charset names fall back to 'latin-1', which can decode
            # any byte sequence.
            try:
                subject_str.append(part.decode(charset or 'utf-8', errors='replace'))
            except (UnicodeDecodeError, LookupError):
                subject_str.append(part.decode('latin-1', errors='replace'))
        else:
            subject_str.append(part)
    return "".join(subject_str)
def sanitize_filename(filename):
    """
    Makes a string safe to use as a single file or folder name on Windows
    and Unix-like systems.

    Whitespace runs (including newlines) collapse to one space, reserved
    punctuation and control characters become '_', and surrounding whitespace
    plus trailing dots/spaces are removed.

    Args:
        filename: The raw name, e.g. a decoded subject or a label.

    Returns:
        The sanitized name. May be empty, e.g. for input consisting only of
        dots and whitespace.
    """
    # Replace all whitespace sequences (spaces, tabs, newlines, etc.) with a single space.
    filename = re.sub(r'\s+', ' ', filename)
    # Then replace other invalid characters. Control characters are included
    # because an embedded NUL makes open() raise ValueError and the rest are
    # rejected by Windows.
    filename = re.sub(r'[<>:"/\\|?*\x00-\x1f\x7f]', '_', filename).strip()
    # Windows does not allow names ending in a dot or space, and a name made
    # only of dots ('.' or '..') would resolve to the current or parent
    # directory when joined onto a path.
    return filename.rstrip(' .')
def get_folder_from_message(message):
    """
    Picks the destination folder name for a message from its headers.

    The 'X-Mozilla-Folder' header wins when it has a value. Otherwise the
    first comma-separated entry of 'X-Gmail-Labels' is used, with surrounding
    whitespace removed. When neither header has a value the result is 'Inbox'.
    """
    thunderbird_folder = message.get('X-Mozilla-Folder')
    if thunderbird_folder:
        return thunderbird_folder

    gmail_labels = message.get('X-Gmail-Labels')
    if not gmail_labels:
        return "Inbox"

    # Only the first label decides the folder; any further labels are ignored.
    first_label, _, _ = gmail_labels.partition(',')
    return first_label.strip()
def mbox_to_eml(mbox_file, output_dir_base):
    """
    Splits an mbox file into one .eml file per message.

    Each message is written to ``output_dir_base/<folder>/<name>.eml``. The
    folder comes from get_folder_from_message(); the name is built from the
    Date header (or a padded sequence number when the date is missing or
    cannot be parsed) followed by the sanitized subject. Messages that cannot
    be written are reported on stdout and skipped; the run continues.

    Args:
        mbox_file: Path to an existing mbox file.
        output_dir_base: Directory that receives the folder tree; created if
            it does not exist.

    Raises:
        mailbox.NoSuchMailboxError: If ``mbox_file`` does not exist.
    """
    # create=False: a mistyped input path must fail loudly instead of
    # silently creating an empty mbox file at that location.
    mbox = mailbox.mbox(mbox_file, create=False)
    processed = 0  # stays 0 for an empty mailbox so the summary still works
    try:
        os.makedirs(output_dir_base, exist_ok=True)

        # Case-folded paths claimed during this run. Folding the case keeps
        # names that differ only by case from overwriting each other on
        # case-insensitive file systems.
        filenames_in_use = set()

        print(f"Starting conversion of {mbox_file}...")
        total_messages = len(mbox)
        print(f"Found {total_messages} messages to process.")

        for processed, message in enumerate(mbox, start=1):
            # 1. Determine the folder
            folder_name = get_folder_from_message(message)
            # Sanitize and truncate the folder name. Truncation can expose a
            # trailing space or dot, and a name of only dots must not survive
            # (it would point at the current or parent directory).
            sanitized_folder_name = sanitize_filename(folder_name)[:150].rstrip(' .')
            output_folder_path = os.path.join(output_dir_base, sanitized_folder_name)
            if not os.path.exists(output_folder_path):
                os.makedirs(output_folder_path, exist_ok=True)
                print(f"Created folder: {output_folder_path}")

            # 2. Decode the subject properly
            subject = decode_subject(message)

            # 3. Create a descriptive filename
            date_str = message.get('Date', '')
            date_prefix = f"message_{processed:05d}"  # Padded number
            if date_str:
                try:
                    date_prefix = parsedate_to_datetime(date_str).strftime('%Y-%m-%d_%H-%M-%S')
                except Exception:
                    # Best effort: any unparsable Date header keeps the
                    # sequence-number prefix assigned above.
                    pass
            sanitized_subject = sanitize_filename(subject)[:100]
            name_stem = f"{date_prefix}_{sanitized_subject}"

            # 4. Handle potential filename collisions
            eml_path = os.path.join(output_folder_path, f"{name_stem}.eml")
            counter = 1
            while eml_path.casefold() in filenames_in_use:
                eml_path = os.path.join(output_folder_path, f"{name_stem}_{counter}.eml")
                counter += 1
            filenames_in_use.add(eml_path.casefold())

            # 5. Write the eml content
            try:
                with open(eml_path, 'wb') as eml_file:
                    eml_file.write(message.as_bytes())
            except OSError as e:
                # Catch file system errors like "File name too long"
                print(f"Could not write message {processed} to {eml_path}. Error: {e}")
                # Try writing with a shorter, generic name
                try:
                    eml_path_fallback = os.path.join(
                        output_folder_path, f"{date_prefix}_fallback_{processed:05d}.eml"
                    )
                    with open(eml_path_fallback, 'wb') as eml_file:
                        eml_file.write(message.as_bytes())
                    print(f"Successfully wrote message {processed} to fallback file: {eml_path_fallback}")
                except Exception as fallback_e:
                    print(f"Fallback write also failed for message {processed}. Error: {fallback_e}")
                    continue
            except Exception as e:
                print(f"An unexpected error occurred writing message {processed}: {e}")
                continue

            if processed % 100 == 0:
                print(f"Processed {processed} / {total_messages} messages...")
    finally:
        # Release the underlying file handle even if the run is interrupted.
        mbox.close()

    print(f"\nConversion complete. Total messages processed: {processed}")
if __name__ == "__main__":
    # Command-line entry point: both the input mbox path and the output
    # directory are mandatory and are handed straight to mbox_to_eml().
    cli = argparse.ArgumentParser(
        description="Convert a large mbox file to individual .eml files, preserving folder structure."
    )
    cli.add_argument(
        '--file', '-f',
        required=True,
        type=str,
        help='Path to the input mbox file (e.g., "Takeout/Mail/All mail.mbox")',
    )
    cli.add_argument(
        '--output_dir', '-o',
        required=True,
        type=str,
        help='Path to the base output directory (e.g., "./eml_export")',
    )
    options = cli.parse_args()
    mbox_to_eml(options.file, options.output_dir)
@andyg2
Copy link
Author

andyg2 commented Oct 29, 2025

No PIP dependencies.

Tested with Python 3.10, 3.11, 3.12

Example usage:

Google takeout

python convert_mbox.py -f "./All mail Including Spam and Trash.mbox" -o "./google_emails"

Works with large mbox files, e.g. 10 GB:

Z:\Takeout>python convert_mbox.py -f ".\All mail Including Spam and Trash.mbox" -o ".\google_emails"
Starting conversion of .\All mail Including Spam and Trash.mbox...
Found 65130 messages to process.
Created folder: .\google_emails\Bin
Created folder: .\google_emails\Inbox
Created folder: .\google_emails\Spam
Created folder: .\google_emails\Sent
Created folder: .\google_emails\Important
Created folder: .\google_emails\Category purchases
...
...
Processed 100 / 65130 messages...
Processed 200 / 65130 messages...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment