Skip to content

Instantly share code, notes, and snippets.

@mjseeley
Forked from baali/dlAttachments.py
Last active October 17, 2020 12:32
Show Gist options
  • Save mjseeley/5e182c0c29dde014cfac to your computer and use it in GitHub Desktop.
Save mjseeley/5e182c0c29dde014cfac to your computer and use it in GitHub Desktop.

Revisions

  1. mjseeley revised this gist Mar 27, 2015. 1 changed file with 104 additions and 79 deletions.
    183 changes: 104 additions & 79 deletions dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -1,104 +1,129 @@
    # Something in lines of http://stackoverflow.com/questions/348630/how-can-i-download-all-emails-with-attachments-from-gmail
    # Make sure you have IMAP enabled in your gmail settings.
    # Right now it does download same file multiple times if their contents are different. Uses MD5 hash of each file to skip identical files
    # If you are using 2-step verification you may need an APP Password.
    # https://support.google.com/accounts/answer/185833
    # Download ALL attachments from GMail
    # 1. Run this script from a console, not an IDE; getpass.getpass() will fail otherwise.
    # https://docs.python.org/2/library/getpass.html
    # 2. Make sure you have IMAP enabled in your GMail settings.
    # https://support.google.com/mail/troubleshooter/1668960?hl=en
    # 3. If you are using 2 step verification you may need an APP Password.
    # https://support.google.com/accounts/answer/185833
    # 4. Reference information for GMail IMAP extension can be found here.
    # https://developers.google.com/gmail/imap_extensions


    import email
    import hashlib
    import getpass
    import imaplib
    import os
    from collections import defaultdict, Counter
    import platform

    fileNameCounter = Counter()
    fileNameHashes = defaultdict(set)
    NewMsgIDs = set()
    ProcessedMsgIDs = set()

    def get_hash(file_to_hash):
        """Return the hexadecimal MD5 digest of the file at *file_to_hash*.

        The file is read in 64 KiB chunks so that a large file is never held
        in memory in full.

        If the file cannot be opened or read, the IOError is printed and the
        digest of whatever was hashed up to that point is returned (the digest
        of empty input when the file could not be opened at all).
        """
        blocksize = 65536
        hasher = hashlib.md5()
        try:
            with open(file_to_hash, 'rb') as afile:
                # Two-argument iter() keeps calling read() until it returns
                # the sentinel b'' (end of file).
                for buf in iter(lambda: afile.read(blocksize), b''):
                    hasher.update(buf)
        except IOError as err:
            # Call-form print is valid on both Python 2 and Python 3.
            print(err)
        return hasher.hexdigest()

    def recover(resumeFile):
        """Load previously processed message IDs from *resumeFile*.

        The resume file is a comma-separated list of message IDs. Every ID
        found is added to the module-level ``ProcessedMsgIDs`` set. If the
        file does not exist, an empty one is created.
        """
        if os.path.exists(resumeFile):
            print('Recovery file found resuming...')
            with open(resumeFile) as f:
                processedIds = f.read()
            # IDs are written as "<id>," so splitting on ',' always leaves a
            # trailing empty field; skip empty fields instead of recording ''
            # as a processed ID.
            for ProcessedId in processedIds.split(','):
                ProcessedId = ProcessedId.strip()
                if ProcessedId:
                    ProcessedMsgIDs.add(ProcessedId)
        else:
            print('No Recovery file found.')
            # Create the (empty) resume file so later appends have a target.
            open(resumeFile, 'a').close()

    detach_dir = '.'
    if 'attachments' not in os.listdir(detach_dir):
    os.mkdir('attachments')

    userName = raw_input('Enter your GMail username: ')
    passwd = getpass.getpass('Enter your password: ')
    def GenerateMailMessages(userName, password, resumeFile):
    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')
    typ, accountDetails = imapSession.login(userName, password)

    print(typ)
    print(accountDetails)
    if typ != 'OK':
    print('Not able to sign in!')
    raise

    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')
    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')
    imapSession.select('[Gmail]/All Mail')
    typ, data = imapSession.search(None, '(X-GM-RAW "has:attachment")')
    # typ, data = imapSession.search(None, 'ALL')
    if typ != 'OK':
    print('Error searching Inbox.')
    raise

    typ, accountDetails = imapSession.login(userName, passwd)
    print typ
    print accountDetails
    if typ != 'OK':
    print 'Not able to sign in!'
    raise
    # Iterating over all emails
    for msgId in data[0].split():
    NewMsgIDs.add(msgId)
    typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
    if typ != 'OK':
    print('Error fetching mail.')
    raise
    emailBody = messageParts[0][1]
    if msgId not in ProcessedMsgIDs:
    yield email.message_from_string(emailBody)
    ProcessedMsgIDs.add(msgId)
    with open(resumeFile, "a") as resume:
    resume.write('{id},'.format(id=msgId))

    imapSession.select('[Gmail]/All Mail')
    imapSession.close()
    imapSession.logout()

    typ, data = imapSession.search(None, 'ALL')
    if typ != 'OK':
    print 'Error searching Inbox.'
    raise

    # Iterating over all emails
    for msgId in data[0].split():
    typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
    if typ != 'OK':
    print 'Error fetching mail.'
    raise
    emailBody = messageParts[0][1]
    mail = email.message_from_string(emailBody)
    for part in mail.walk():
    def SaveAttachmentsFromMailMessage(message, directory):
    for part in message.walk():
    if part.get_content_maintype() == 'multipart':
    # print part.as_string()
    # print(part.as_string())
    continue
    if part.get('Content-Disposition') is None:
    # print part.as_string()
    # print(part.as_string())
    continue
    fileName = part.get_filename()
    if fileName is not None:
    fileName = ''.join(fileName.splitlines())
    if fileName:
    # print('Processing: {file}'.format(file=fileName))
    payload = part.get_payload(decode=True)
    if payload:
    x_hash = hashlib.md5(payload).hexdigest()

    if bool(fileName):
    filePath = os.path.join(detach_dir, 'attachments', 'temp.attachment')
    if not os.path.isfile(filePath):
    # print 'Processing: {file}'.format(file=fileName)
    fp = open(filePath, 'wb')
    fp.write(part.get_payload(decode=True))
    fp.close()
    x_hash = get_hash(filePath)

    if x_hash in fileNameList_dict[fileName]:
    print '\tSkipping duplicate file: {file}'.format(file=fileName)
    if os.path.isfile(filePath):
    os.remove(filePath)
    pass
    if x_hash in fileNameHashes[fileName]:
    print('\tSkipping duplicate file: {file}'.format(file=fileName))
    continue
    fileNameCounter[fileName] += 1
    fileStr, fileExtension = os.path.splitext(fileName)
    if fileNameCounter[fileName] > 1:
    new_fileName = '{file}({suffix}){ext}'.format(suffix=fileNameCounter[fileName],
    file=fileStr, ext=fileExtension)
    print('\tRenaming and storing: {file} to {new_file}'.format(file=fileName,
    new_file=new_fileName))
    else:
    fileNameCount_dict[fileName] += 1
    fileStr, fileExtension = os.path.splitext(fileName)
    if fileNameCount_dict[fileName] > 1:
    new_fileName = '{file}({suffix}){ext}'.format(suffix=fileNameCount_dict[fileName], file=fileStr, ext=fileExtension)
    else:
    new_fileName = fileName
    fileNameList_dict[fileName].append(x_hash)
    hash_path = os.path.join(detach_dir, 'attachments', new_fileName)
    if not os.path.isfile(hash_path):
    if new_fileName == fileName:
    print '\tStoring: {file}'.format(file=fileName)
    else:
    print('\tRenaming and storing: {file} to {new_file}'.format(file=fileName, new_file=new_fileName))
    os.rename(filePath, hash_path)
    if os.path.isfile(filePath):
    os.remove(filePath)
    new_fileName = fileName
    print('\tStoring: {file}'.format(file=fileName))
    fileNameHashes[fileName].add(x_hash)
    file_path = os.path.join(directory, new_fileName)
    if os.path.exists(file_path):
    print('\tExists in destination: {file}'.format(file=new_fileName))
    continue
    try:
    with open(file_path, 'wb') as fp:
    fp.write(payload)
    except:
    print('Could not store: {file} it has a shitty file name or path under {op_sys}.'.format(
    file=file_path,
    op_sys=platform.system()))
    else:
    print('Attachment {file} was returned as type: {ftype} skipping...'.format(file=fileName,
    ftype=type(payload)))
    continue

    imapSession.close()
    imapSession.logout()
    if __name__ == '__main__':
        # Resume file that records the IDs of messages already processed.
        # ``file_path`` is kept as an alias because the original defined both names.
        resumeFile = file_path = 'resume.txt'
        userName = raw_input('Enter your GMail username: ')
        password = getpass.getpass('Enter your password: ')
        recover(resumeFile)
        # Check for an actual directory: a regular file named "attachments"
        # must not be mistaken for the output directory.
        if not os.path.isdir('attachments'):
            os.mkdir('attachments')
        for msg in GenerateMailMessages(userName, password, resumeFile):
            SaveAttachmentsFromMailMessage(msg, 'attachments')
        # Reached only when the loop finished without an exception, so the
        # resume file is no longer needed.
        os.remove(resumeFile)
  2. mjseeley revised this gist Dec 22, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    # Something in lines of http://stackoverflow.com/questions/348630/how-can-i-download-all-emails-with-attachments-from-gmail
    # Make sure you have IMAP enabled in your gmail settings.
    # Right now it does download same file multiple times if their contents are different.
    # Right now it does download same file multiple times if their contents are different. Uses MD5 hash of each file to skip identical files
    # If you are using 2-step verification you may need an APP Password.
    # https://support.google.com/accounts/answer/185833

  3. mjseeley revised this gist Dec 22, 2014. 1 changed file with 28 additions and 10 deletions.
    38 changes: 28 additions & 10 deletions dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -34,6 +34,7 @@ def get_hash(file_to_hash):
    passwd = getpass.getpass('Enter your password: ')


    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')
    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')

    typ, accountDetails = imapSession.login(userName, passwd)
    @@ -44,6 +45,7 @@ def get_hash(file_to_hash):
    raise

    imapSession.select('[Gmail]/All Mail')

    typ, data = imapSession.search(None, 'ALL')
    if typ != 'OK':
    print 'Error searching Inbox.'
    @@ -67,20 +69,36 @@ def get_hash(file_to_hash):
    fileName = part.get_filename()

    if bool(fileName):
    filePath = os.path.join(detach_dir, 'attachments', 'temp.attachment')
    filePath = os.path.join(detach_dir, 'attachments', 'temp.attachment')
    if not os.path.isfile(filePath):
    print 'Processing: {file}'.format(file=fileName)
    # print 'Processing: {file}'.format(file=fileName)
    fp = open(filePath, 'wb')
    fp.write(part.get_payload(decode=True))
    fp.close()
    x_hash = get_hash(filePath)
    fileStr, fileExtension = os.path.splitext(fileName)
    new_fileName = '{file}(#{suffix}#){ext}'.format(suffix=x_hash, file=fileStr, ext=fileExtension)
    hash_path = os.path.join(detach_dir, 'attachments', new_fileName)
    if not os.path.isfile(hash_path):
    print('Renaming {file} to {new_file}'.format(file=fileName, new_file=new_fileName))
    os.rename(filePath, hash_path)
    if os.path.isfile(filePath):
    os.remove(filePath)

    if x_hash in fileNameList_dict[fileName]:
    print '\tSkipping duplicate file: {file}'.format(file=fileName)
    if os.path.isfile(filePath):
    os.remove(filePath)
    pass
    else:
    fileNameCount_dict[fileName] += 1
    fileStr, fileExtension = os.path.splitext(fileName)
    if fileNameCount_dict[fileName] > 1:
    new_fileName = '{file}({suffix}){ext}'.format(suffix=fileNameCount_dict[fileName], file=fileStr, ext=fileExtension)
    else:
    new_fileName = fileName
    fileNameList_dict[fileName].append(x_hash)
    hash_path = os.path.join(detach_dir, 'attachments', new_fileName)
    if not os.path.isfile(hash_path):
    if new_fileName == fileName:
    print '\tStoring: {file}'.format(file=fileName)
    else:
    print('\tRenaming and storing: {file} to {new_file}'.format(file=fileName, new_file=new_fileName))
    os.rename(filePath, hash_path)
    if os.path.isfile(filePath):
    os.remove(filePath)

    imapSession.close()
    imapSession.logout()
  4. mjseeley revised this gist Dec 22, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,9 @@
    # Something in lines of http://stackoverflow.com/questions/348630/how-can-i-download-all-emails-with-attachments-from-gmail
    # Make sure you have IMAP enabled in your gmail settings.
    # Right now it does download same file multiple times if their contents are different.
    # If you are using 2-step verification you may need an APP Password.
    # https://support.google.com/accounts/answer/185833

    import email
    import hashlib
    import getpass
  5. mjseeley revised this gist Dec 22, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    # Something in lines of http://stackoverflow.com/questions/348630/how-can-i-download-all-emails-with-attachments-from-gmail
    # Make sure you have IMAP enabled in your gmail settings.
    # Right now it won't download same file name twice even if their contents are different.
    # Right now it does download same file multiple times if their contents are different.
    import email
    import hashlib
    import getpass
  6. mjseeley revised this gist Dec 22, 2014. 1 changed file with 69 additions and 44 deletions.
    113 changes: 69 additions & 44 deletions dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -1,58 +1,83 @@
    # Something in lines of http://stackoverflow.com/questions/348630/how-can-i-download-all-emails-with-attachments-from-gmail
    # Make sure you have IMAP enabled in your gmail settings.
    # Right now it won't download same file name twice even if their contents are different.

    import email
    import getpass, imaplib
    import hashlib
    import getpass
    import imaplib
    import os
    import sys


    def get_hash(file_to_hash):
        """Return the hexadecimal MD5 digest of the file at *file_to_hash*.

        The file is read in 64 KiB chunks. If it cannot be opened or read,
        the IOError is printed and the digest of whatever was hashed up to
        that point is returned (the digest of empty input when the file could
        not be opened at all).
        """
        blocksize = 65536
        hasher = hashlib.md5()
        try:
            with open(file_to_hash, 'rb') as afile:
                # Two-argument iter() stops at the first empty read (EOF).
                for buf in iter(lambda: afile.read(blocksize), b''):
                    hasher.update(buf)
        except IOError as err:
            # Call-form print is valid on both Python 2 and Python 3.
            print(err)
        return hasher.hexdigest()


    detach_dir = '.'
    if 'attachments' not in os.listdir(detach_dir):
    os.mkdir('attachments')

    userName = raw_input('Enter your GMail username:')
    userName = raw_input('Enter your GMail username: ')
    passwd = getpass.getpass('Enter your password: ')

    try:
    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')
    typ, accountDetails = imapSession.login(userName, passwd)
    if typ != 'OK':
    print 'Not able to sign in!'
    raise

    imapSession.select('[Gmail]/All Mail')
    typ, data = imapSession.search(None, 'ALL')

    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')

    typ, accountDetails = imapSession.login(userName, passwd)
    print typ
    print accountDetails
    if typ != 'OK':
    print 'Not able to sign in!'
    raise

    imapSession.select('[Gmail]/All Mail')
    typ, data = imapSession.search(None, 'ALL')
    if typ != 'OK':
    print 'Error searching Inbox.'
    raise

    # Iterating over all emails
    for msgId in data[0].split():
    typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
    if typ != 'OK':
    print 'Error searching Inbox.'
    print 'Error fetching mail.'
    raise

    # Iterating over all emails
    for msgId in data[0].split():
    typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
    if typ != 'OK':
    print 'Error fetching mail.'
    raise

    emailBody = messageParts[0][1]
    mail = email.message_from_string(emailBody)
    for part in mail.walk():
    if part.get_content_maintype() == 'multipart':
    # print part.as_string()
    continue
    if part.get('Content-Disposition') is None:
    # print part.as_string()
    continue
    fileName = part.get_filename()

    if bool(fileName):
    filePath = os.path.join(detach_dir, 'attachments', fileName)
    if not os.path.isfile(filePath) :
    print fileName
    fp = open(filePath, 'wb')
    fp.write(part.get_payload(decode=True))
    fp.close()
    imapSession.close()
    imapSession.logout()
    except :
    print 'Not able to download all attachments.'
    emailBody = messageParts[0][1]
    mail = email.message_from_string(emailBody)
    for part in mail.walk():
    if part.get_content_maintype() == 'multipart':
    # print part.as_string()
    continue
    if part.get('Content-Disposition') is None:
    # print part.as_string()
    continue
    fileName = part.get_filename()

    if bool(fileName):
    filePath = os.path.join(detach_dir, 'attachments', 'temp.attachment')
    if not os.path.isfile(filePath):
    print 'Processing: {file}'.format(file=fileName)
    fp = open(filePath, 'wb')
    fp.write(part.get_payload(decode=True))
    fp.close()
    x_hash = get_hash(filePath)
    fileStr, fileExtension = os.path.splitext(fileName)
    new_fileName = '{file}(#{suffix}#){ext}'.format(suffix=x_hash, file=fileStr, ext=fileExtension)
    hash_path = os.path.join(detach_dir, 'attachments', new_fileName)
    if not os.path.isfile(hash_path):
    print('Renaming {file} to {new_file}'.format(file=fileName, new_file=new_fileName))
    os.rename(filePath, hash_path)
    if os.path.isfile(filePath):
    os.remove(filePath)
    imapSession.close()
    imapSession.logout()
  7. @baali baali revised this gist May 8, 2012. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,6 @@
    # Something in lines of http://stackoverflow.com/questions/348630/how-can-i-download-all-emails-with-attachments-from-gmail
    # Make sure you have IMAP enabled in your gmail settings.
    # Right now it won't download same file name twice even if their contents are different.

    import email
    import getpass, imaplib
  8. @baali baali created this gist May 8, 2012.
    57 changes: 57 additions & 0 deletions dlAttachments.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    # Something in lines of http://stackoverflow.com/questions/348630/how-can-i-download-all-emails-with-attachments-from-gmail
    # Make sure you have IMAP enabled in your gmail settings.

    import email
    import getpass, imaplib
    import os
    import sys

    # Original flat script: logs in to Gmail over IMAP, walks every message in
    # "[Gmail]/All Mail" and writes each named attachment into ./attachments.
    # NOTE(review): indentation was lost in this listing; nesting described in
    # the comments below is inferred from statement order — confirm.

    # Create the output directory next to the working directory if missing.
    detach_dir = '.'
    if 'attachments' not in os.listdir(detach_dir):
    os.mkdir('attachments')

    # Prompt for credentials; getpass keeps the password off the screen.
    userName = raw_input('Enter your GMail username:')
    passwd = getpass.getpass('Enter your password: ')

    try:
    # Open an SSL IMAP connection to Gmail and authenticate.
    imapSession = imaplib.IMAP4_SSL('imap.gmail.com')
    typ, accountDetails = imapSession.login(userName, passwd)
    if typ != 'OK':
    print 'Not able to sign in!'
    # NOTE(review): bare `raise` with no active exception; it appears to be
    # used only to jump to the bare `except` at the end — confirm.
    raise

    # Select the All Mail folder and ask the server for every message id.
    imapSession.select('[Gmail]/All Mail')
    typ, data = imapSession.search(None, 'ALL')
    if typ != 'OK':
    print 'Error searching Inbox.'
    raise

    # Iterating over all emails
    for msgId in data[0].split():
    # Fetch the message identified by msgId using the RFC822 item.
    typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
    if typ != 'OK':
    print 'Error fetching mail.'
    raise

    # presumably messageParts[0][1] is the raw message text — TODO confirm
    emailBody = messageParts[0][1]
    mail = email.message_from_string(emailBody)
    for part in mail.walk():
    # Skip multipart parts.
    if part.get_content_maintype() == 'multipart':
    # print part.as_string()
    continue
    # Skip parts that carry no Content-Disposition header.
    if part.get('Content-Disposition') is None:
    # print part.as_string()
    continue
    fileName = part.get_filename()

    if bool(fileName):
    # NOTE(review): fileName comes from the message and is joined into the
    # path unchanged — consider sanitizing it before use.
    filePath = os.path.join(detach_dir, 'attachments', fileName)
    # A file that already exists under this name is left untouched, so a
    # second attachment with the same name is not written.
    if not os.path.isfile(filePath) :
    print fileName
    fp = open(filePath, 'wb')
    fp.write(part.get_payload(decode=True))
    fp.close()
    imapSession.close()
    imapSession.logout()
    # NOTE(review): bare `except` catches everything raised above and reports
    # only this generic message; the session is not closed on this path.
    except :
    print 'Not able to download all attachments.'