Skip to content

Instantly share code, notes, and snippets.

@edison12a
Created July 3, 2021 01:07
Show Gist options
  • Select an option

  • Save edison12a/eb97cd75fc1b84d2a177048aa3007328 to your computer and use it in GitHub Desktop.

Select an option

Save edison12a/eb97cd75fc1b84d2a177048aa3007328 to your computer and use it in GitHub Desktop.

Revisions

  1. edison12a created this gist Jul 3, 2021.
    67 changes: 67 additions & 0 deletions delete_duplicate_photos.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,67 @@
    '''
    Walks a directory tree, hashes the .jpg and .png photos it finds, and
    compares the hashes for similarity. When two photos are similar enough,
    the duplicate is deleted from disk.
    '''

    from PIL import Image
    import imagehash
    import os
    import time


    # wall-clock reference for the timing printouts
    start = time.time()
    # root of the directory tree that is scanned for photos
    RECURSIVE_ROOT = '/mnt/c/xxx/yyy/zzz'
    # file path -> hash value, one mapping per image format
    png_photos, jpg_photos = {}, {}


    def hash_it(photo_path):
        """Return the average hash of the image stored at *photo_path*.

        Any failure while opening or hashing the file is swallowed and
        reported to the caller as the placeholder value 0.
        """
        try:
            with Image.open(photo_path) as picture:
                fingerprint = imagehash.average_hash(picture)
        except Exception:
            # best effort: an unreadable file simply yields the placeholder
            return 0
        else:
            return fingerprint


    # Recursively walk RECURSIVE_ROOT and record a hash for every photo found.
    # Each file path is stored in the dictionary for its format, with the value
    # being whatever hash_it() returned for that file.
    for current_dir_path, current_subdirs, current_files in os.walk(RECURSIVE_ROOT):
        for aFile in current_files:
            txt_file_path = os.path.join(current_dir_path, aFile)
            # Compare on the lower-cased name and require the dot, so that
            # 'IMG_1.JPG' and 'scan.jpeg' are picked up while a name that merely
            # ends in the letters 'jpg' or 'png' is not.
            lowered_name = aFile.lower()
            if lowered_name.endswith(('.jpg', '.jpeg')):
                jpg_photos[txt_file_path] = hash_it(txt_file_path)
            elif lowered_name.endswith('.png'):
                png_photos[txt_file_path] = hash_it(txt_file_path)
    # record time taken
    print('\n\n\nHashing out', time.time()-start, '\n\n\n')


    # Compare every pair of hashed photos within each format and delete the
    # later one of any pair whose hashes are close enough.
    cutoff = 5  # photos count as duplicates when fewer than this many hash bits differ
    photos_dicts = [png_photos, jpg_photos]
    for photo_dic in photos_dicts:
        # Paths removed from disk during this pass. A removed file must not be
        # used as a reference image later on, otherwise a photo that only
        # resembles an already-deleted file would be deleted as well.
        deleted = set()
        # snapshot the keys because the dictionary shrinks while we loop
        fnames = list(photo_dic)
        for fname in fnames:
            # drop the entry so that each pair is compared only once
            hash0 = photo_dic.pop(fname)
            # Files that could not be hashed carry a placeholder instead of an
            # ImageHash; comparing two placeholders would look like a perfect
            # match and delete an unrelated file, so skip them entirely.
            if fname in deleted or not isinstance(hash0, imagehash.ImageHash):
                continue
            for remaining_fname, hash1 in photo_dic.items():
                if remaining_fname in deleted:
                    continue
                if not isinstance(hash1, imagehash.ImageHash):
                    continue
                # subtracting two hashes gives the number of differing bits
                if hash0 - hash1 >= cutoff:
                    continue
                print('images are similar', fname, remaining_fname)
                try:
                    os.remove(remaining_fname)
                except OSError as err:
                    # keep going, but make the failure visible
                    print('could not delete', remaining_fname, err)
                else:
                    deleted.add(remaining_fname)

    # record total time taken
    print('\n\n\nComparing photos', time.time()-start, '\n\n\n')