andyg2 · October 29, 2025 07:45 · andyg2 · Oct 29, 2025
diff --git a/convert_mbox.py b/convert_mbox.py
 import mailbox
 import os
 import argparse
 import re
 from email.utils import parsedate_to_datetime
 from email.header import decode_header

 def decode_subject(message):
    """
    Decode the message's Subject header into a plain string.

    RFC 2047 encoded words and folded (multi-line) headers are split into
    fragments by ``email.header.decode_header``; every fragment is turned
    into text and the fragments are concatenated.

    Args:
        message: Object with a mapping-style ``get`` method returning the
            raw header value (e.g. an ``email.message.Message``).

    Returns:
        The decoded subject, or ``"No Subject"`` when the header is absent.
    """
    subject_header = message.get('Subject')
    if subject_header is None:
        return "No Subject"

    fragments = []
    for part, charset in decode_header(subject_header):
        if not isinstance(part, bytes):
            # A header without any encoded word is handed back as str.
            fragments.append(part)
            continue
        try:
            # Decode strictly. With errors='replace' a wrong or missing
            # charset never raises, so the latin-1 fallback below could not
            # trigger and raw 8-bit text was turned into U+FFFD.
            fragments.append(part.decode(charset or 'utf-8'))
        except (UnicodeDecodeError, LookupError):
            # latin-1 maps every byte to a character, so this cannot fail.
            fragments.append(part.decode('latin-1'))

    return "".join(fragments)

 def sanitize_filename(filename):
    """
    Make a string safe to use as a single file or folder name.

    Runs of whitespace (spaces, tabs, newlines, ...) collapse to one space,
    characters that are not allowed in Windows or Unix-like names are
    replaced by an underscore, and surrounding whitespace is trimmed.

    Args:
        filename: The raw name, e.g. a decoded subject or a label.

    Returns:
        The cleaned name; empty if the input contained only whitespace.
    """
    # Collapse first so newlines/tabs become a space rather than '_'.
    collapsed = re.sub(r'\s+', ' ', filename)

    # Besides the reserved punctuation, the remaining ASCII control
    # characters (0x00-0x1F) are replaced too: Windows rejects them in names
    # and an embedded NUL makes open() raise ValueError on any platform.
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', collapsed).strip()

 def get_folder_from_message(message):
    """
    Pick the destination folder for a message from its headers.

    A non-empty ``X-Mozilla-Folder`` header (Thunderbird) wins. Otherwise the
    first comma-separated entry of ``X-Gmail-Labels`` is used, stripped of
    surrounding whitespace. With neither header set the result is 'Inbox'.
    """
    mozilla_folder = message.get('X-Mozilla-Folder')
    if mozilla_folder:
        return mozilla_folder

    gmail_labels = message.get('X-Gmail-Labels')
    if not gmail_labels:
        return "Inbox"

    # Only the first label decides the folder; the rest are ignored.
    first_label, _sep, _rest = gmail_labels.partition(',')
    return first_label.strip()

 def _write_eml(message, eml_path, fallback_path, index):
    """Write one message to eml_path, retrying at fallback_path if that fails."""
    try:
        # Serialize before opening so a failure leaves no empty file behind.
        payload = message.as_bytes()
    except Exception as e:
        print(f"An unexpected error occurred writing message {index}: {e}")
        return

    try:
        with open(eml_path, 'wb') as eml_file:
            eml_file.write(payload)
        return
    except (OSError, ValueError) as e:
        # OSError covers file system errors like "File name too long";
        # ValueError is raised for a name with an embedded NUL byte.
        print(f"Could not write message {index} to {eml_path}. Error: {e}")

    # Try writing with a shorter, generic name
    try:
        with open(fallback_path, 'wb') as eml_file:
            eml_file.write(payload)
        print(f"Successfully wrote message {index} to fallback file: {fallback_path}")
    except (OSError, ValueError) as fallback_e:
        print(f"Fallback write also failed for message {index}. Error: {fallback_e}")


 def mbox_to_eml(mbox_file, output_dir_base):
    """
    Split an mbox file into one .eml file per message.

    Messages are grouped into sub-folders of ``output_dir_base`` named after
    the value returned by ``get_folder_from_message``. Files are named
    ``<date>_<subject>.eml``; when the Date header is missing or cannot be
    parsed a ``message_NNNNN`` prefix is used instead, and a numeric suffix
    is appended when two messages of this run would share a path.

    Progress and per-message failures are reported with ``print``; a message
    that cannot be written is skipped instead of aborting the run.

    Args:
        mbox_file: Path to the input mbox file.
        output_dir_base: Directory under which the folders are created.

    Raises:
        mailbox.NoSuchMailboxError: If ``mbox_file`` does not exist.
    """
    # create=False: the default (create=True) silently creates an empty mbox
    # for a mistyped path instead of reporting the missing input.
    mbox = mailbox.mbox(mbox_file, create=False)
    paths_in_use = set()
    processed = 0  # stays 0 when the mbox holds no messages

    try:
        os.makedirs(output_dir_base, exist_ok=True)

        print(f"Starting conversion of {mbox_file}...")

        total_messages = len(mbox)
        print(f"Found {total_messages} messages to process.")

        for processed, message in enumerate(mbox, start=1):
            # 1. Determine the folder. str() because a header holding raw
            #    8-bit bytes is returned as an email.header.Header object.
            folder_name = sanitize_filename(str(get_folder_from_message(message)))[:150]
            if not folder_name.strip('. '):
                # '', '.' and '..' would resolve to the base directory or
                # its parent, so such names fall back to the default folder.
                folder_name = "Inbox"

            output_folder_path = os.path.join(output_dir_base, folder_name)
            if not os.path.exists(output_folder_path):
                os.makedirs(output_folder_path)
                print(f"Created folder: {output_folder_path}")

            # 2. Decode the subject properly
            sanitized_subject = sanitize_filename(decode_subject(message))[:100]

            # 3. Create a descriptive filename
            date_prefix = f"message_{processed:05d}"  # Padded number
            date_str = message.get('Date', '')
            if date_str:
                try:
                    parsed_date = parsedate_to_datetime(str(date_str))
                    date_prefix = parsed_date.strftime('%Y-%m-%d_%H-%M-%S')
                except Exception:
                    # Best effort: any malformed Date keeps the numbered prefix.
                    date_prefix = f"message_{processed:05d}"

            # 4. Handle potential filename collisions. Paths are compared
            #    case-insensitively so that names differing only in case do
            #    not overwrite each other on case-insensitive file systems.
            stem = f"{date_prefix}_{sanitized_subject}"
            eml_path = os.path.join(output_folder_path, f"{stem}.eml")
            counter = 1
            while eml_path.casefold() in paths_in_use:
                eml_path = os.path.join(output_folder_path, f"{stem}_{counter}.eml")
                counter += 1
            paths_in_use.add(eml_path.casefold())

            # 5. Write the eml content
            fallback_path = os.path.join(
                output_folder_path, f"{date_prefix}_fallback_{processed:05d}.eml"
            )
            _write_eml(message, eml_path, fallback_path, processed)

            if processed % 100 == 0:
                print(f"Processed {processed} / {total_messages} messages...")
    finally:
        # Release the file handle held by the mailbox.
        mbox.close()

    print(f"\nConversion complete. Total messages processed: {processed}")

 if __name__ == "__main__":
    # Command-line entry point: both paths are mandatory string options.
    cli = argparse.ArgumentParser(
        description="Convert a large mbox file to individual .eml files, preserving folder structure."
    )
    option_specs = (
        (('--file', '-f'),
         'Path to the input mbox file (e.g., "Takeout/Mail/All mail.mbox")'),
        (('--output_dir', '-o'),
         'Path to the base output directory (e.g., "./eml_export")'),
    )
    for flags, help_text in option_specs:
        cli.add_argument(*flags, type=str, required=True, help=help_text)

    options = cli.parse_args()
    mbox_to_eml(options.file, options.output_dir)
	import mailbox
	import os
	import argparse
	import re
	from email.utils import parsedate_to_datetime
	from email.header import decode_header

	def decode_subject(message):
	    """
	    Decode the message's Subject header into a plain string.

	    RFC 2047 encoded words and folded (multi-line) headers are split into
	    fragments by ``email.header.decode_header``; every fragment is turned
	    into text and the fragments are concatenated.

	    Args:
	        message: Object with a mapping-style ``get`` method returning the
	            raw header value (e.g. an ``email.message.Message``).

	    Returns:
	        The decoded subject, or ``"No Subject"`` when the header is absent.
	    """
	    subject_header = message.get('Subject')
	    if subject_header is None:
	        return "No Subject"

	    fragments = []
	    for part, charset in decode_header(subject_header):
	        if not isinstance(part, bytes):
	            # A header without any encoded word is handed back as str.
	            fragments.append(part)
	            continue
	        try:
	            # Decode strictly. With errors='replace' a wrong or missing
	            # charset never raises, so the latin-1 fallback below could not
	            # trigger and raw 8-bit text was turned into U+FFFD.
	            fragments.append(part.decode(charset or 'utf-8'))
	        except (UnicodeDecodeError, LookupError):
	            # latin-1 maps every byte to a character, so this cannot fail.
	            fragments.append(part.decode('latin-1'))

	    return "".join(fragments)

	def sanitize_filename(filename):
	    """
	    Make a string safe to use as a single file or folder name.

	    Runs of whitespace (spaces, tabs, newlines, ...) collapse to one space,
	    characters that are not allowed in Windows or Unix-like names are
	    replaced by an underscore, and surrounding whitespace is trimmed.

	    Args:
	        filename: The raw name, e.g. a decoded subject or a label.

	    Returns:
	        The cleaned name; empty if the input contained only whitespace.
	    """
	    # Collapse first so newlines/tabs become a space rather than '_'.
	    collapsed = re.sub(r'\s+', ' ', filename)

	    # Besides the reserved punctuation, the remaining ASCII control
	    # characters (0x00-0x1F) are replaced too: Windows rejects them in names
	    # and an embedded NUL makes open() raise ValueError on any platform.
	    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', collapsed).strip()

	def get_folder_from_message(message):
	    """
	    Pick the destination folder for a message from its headers.

	    A non-empty ``X-Mozilla-Folder`` header (Thunderbird) wins. Otherwise the
	    first comma-separated entry of ``X-Gmail-Labels`` is used, stripped of
	    surrounding whitespace. With neither header set the result is 'Inbox'.
	    """
	    folder = message.get('X-Mozilla-Folder')
	    if folder:
	        return folder

	    labels = message.get('X-Gmail-Labels')
	    if labels:
	        # Only the first label decides the folder; the rest are ignored.
	        return labels.split(',')[0].strip()

	    return "Inbox"

	def _write_eml(message, eml_path, fallback_path, index):
	    """Write one message to eml_path, retrying at fallback_path if that fails."""
	    try:
	        # Serialize before opening so a failure leaves no empty file behind.
	        payload = message.as_bytes()
	    except Exception as e:
	        print(f"An unexpected error occurred writing message {index}: {e}")
	        return

	    try:
	        with open(eml_path, 'wb') as eml_file:
	            eml_file.write(payload)
	        return
	    except (OSError, ValueError) as e:
	        # OSError covers file system errors like "File name too long";
	        # ValueError is raised for a name with an embedded NUL byte.
	        print(f"Could not write message {index} to {eml_path}. Error: {e}")

	    # Try writing with a shorter, generic name
	    try:
	        with open(fallback_path, 'wb') as eml_file:
	            eml_file.write(payload)
	        print(f"Successfully wrote message {index} to fallback file: {fallback_path}")
	    except (OSError, ValueError) as fallback_e:
	        print(f"Fallback write also failed for message {index}. Error: {fallback_e}")


	def mbox_to_eml(mbox_file, output_dir_base):
	    """
	    Split an mbox file into one .eml file per message.

	    Messages are grouped into sub-folders of ``output_dir_base`` named after
	    the value returned by ``get_folder_from_message``. Files are named
	    ``<date>_<subject>.eml``; when the Date header is missing or cannot be
	    parsed a ``message_NNNNN`` prefix is used instead, and a numeric suffix
	    is appended when two messages of this run would share a path.

	    Progress and per-message failures are reported with ``print``; a message
	    that cannot be written is skipped instead of aborting the run.

	    Args:
	        mbox_file: Path to the input mbox file.
	        output_dir_base: Directory under which the folders are created.

	    Raises:
	        mailbox.NoSuchMailboxError: If ``mbox_file`` does not exist.
	    """
	    # create=False: the default (create=True) silently creates an empty mbox
	    # for a mistyped path instead of reporting the missing input.
	    mbox = mailbox.mbox(mbox_file, create=False)
	    paths_in_use = set()
	    processed = 0  # stays 0 when the mbox holds no messages

	    try:
	        os.makedirs(output_dir_base, exist_ok=True)

	        print(f"Starting conversion of {mbox_file}...")

	        total_messages = len(mbox)
	        print(f"Found {total_messages} messages to process.")

	        for processed, message in enumerate(mbox, start=1):
	            # 1. Determine the folder. str() because a header holding raw
	            #    8-bit bytes is returned as an email.header.Header object.
	            folder_name = sanitize_filename(str(get_folder_from_message(message)))[:150]
	            if not folder_name.strip('. '):
	                # '', '.' and '..' would resolve to the base directory or
	                # its parent, so such names fall back to the default folder.
	                folder_name = "Inbox"

	            output_folder_path = os.path.join(output_dir_base, folder_name)
	            if not os.path.exists(output_folder_path):
	                os.makedirs(output_folder_path)
	                print(f"Created folder: {output_folder_path}")

	            # 2. Decode the subject properly
	            sanitized_subject = sanitize_filename(decode_subject(message))[:100]

	            # 3. Create a descriptive filename
	            date_prefix = f"message_{processed:05d}"  # Padded number
	            date_str = message.get('Date', '')
	            if date_str:
	                try:
	                    parsed_date = parsedate_to_datetime(str(date_str))
	                    date_prefix = parsed_date.strftime('%Y-%m-%d_%H-%M-%S')
	                except Exception:
	                    # Best effort: any malformed Date keeps the numbered prefix.
	                    date_prefix = f"message_{processed:05d}"

	            # 4. Handle potential filename collisions. Paths are compared
	            #    case-insensitively so that names differing only in case do
	            #    not overwrite each other on case-insensitive file systems.
	            stem = f"{date_prefix}_{sanitized_subject}"
	            eml_path = os.path.join(output_folder_path, f"{stem}.eml")
	            counter = 1
	            while eml_path.casefold() in paths_in_use:
	                eml_path = os.path.join(output_folder_path, f"{stem}_{counter}.eml")
	                counter += 1
	            paths_in_use.add(eml_path.casefold())

	            # 5. Write the eml content
	            fallback_path = os.path.join(
	                output_folder_path, f"{date_prefix}_fallback_{processed:05d}.eml"
	            )
	            _write_eml(message, eml_path, fallback_path, processed)

	            if processed % 100 == 0:
	                print(f"Processed {processed} / {total_messages} messages...")
	    finally:
	        # Release the file handle held by the mailbox.
	        mbox.close()

	    print(f"\nConversion complete. Total messages processed: {processed}")

	if __name__ == "__main__":
	    # Command-line entry point: both paths are mandatory string options.
	    parser = argparse.ArgumentParser(
	        description="Convert a large mbox file to individual .eml files, preserving folder structure."
	    )
	    parser.add_argument(
	        '--file', '-f',
	        type=str,
	        required=True,
	        help='Path to the input mbox file (e.g., "Takeout/Mail/All mail.mbox")'
	    )
	    parser.add_argument(
	        '--output_dir', '-o',
	        type=str,
	        required=True,
	        help='Path to the base output directory (e.g., "./eml_export")'
	    )

	    args = parser.parse_args()
	    mbox_to_eml(args.file, args.output_dir)