Created
July 3, 2021 01:07
-
-
Save edison12a/eb97cd75fc1b84d2a177048aa3007328 to your computer and use it in GitHub Desktop.
Revisions
-
edison12a created this gist
Jul 3, 2021 .There are no files selected for viewing
'''
iterates over files in a directory, finds photos and compares them for similarity
if 2 photos are similar, the duplicate is deleted
'''
from PIL import Image
import imagehash
import os
import time

start = time.time()

RECURSIVE_ROOT = '/mnt/c/xxx/yyy/zzz'

# File extensions (lower-case, with the dot) that select each bucket.
# Matching is done on the lower-cased file name so '.JPG' / '.PNG' count too.
JPG_EXTENSIONS = ('.jpg', '.jpeg')
PNG_EXTENSIONS = ('.png',)

# Map of file path -> perceptual hash, one dictionary per image format.
png_photos = {}
jpg_photos = {}


def hash_it(photo_path):
    '''Return the average hash of the image at photo_path.

    Returns None when the file cannot be opened or hashed, so that callers
    can tell "no hash" apart from a real hash value.
    '''
    try:
        with Image.open(photo_path) as img:
            return imagehash.average_hash(img)
    except Exception:
        # Deliberately best-effort: a corrupt or unsupported file must not
        # abort the whole scan, it is simply left out of the comparison.
        return None


# recursively loop over all files and folders in the given path
for current_dir_path, _current_subdirs, current_files in os.walk(RECURSIVE_ROOT):
    for a_file in current_files:
        lowered_name = a_file.lower()
        if lowered_name.endswith(JPG_EXTENSIONS):
            bucket = jpg_photos
        elif lowered_name.endswith(PNG_EXTENSIONS):
            bucket = png_photos
        else:
            continue

        file_path = os.path.join(current_dir_path, a_file)
        photo_hash = hash_it(file_path)
        # Only store real hashes: storing a placeholder for unreadable files
        # would make any two unreadable files look identical to each other.
        if photo_hash is not None:
            bucket[file_path] = photo_hash

# record time taken
print('\n\n\nHashing out', time.time()-start, '\n\n\n')

# Two photos count as duplicates when their hashes differ by fewer than
# this many bits.
cutoff = 5
# Compare photos only within the same format bucket.
photos_dicts = [png_photos, jpg_photos]
for photo_dic in photos_dicts:
    # Snapshot the file names so the dictionary can shrink while we loop.
    fnames = list(photo_dic)
    for fname in fnames:
        # Take the reference photo out of the dictionary so it is not compared
        # against itself or visited again. It may already be gone if an
        # earlier pass removed it as a duplicate, hence the default.
        hash0 = photo_dic.pop(fname, None)
        # Skip entries whose value is not a real image hash (e.g. a
        # placeholder stored for a file that could not be hashed).
        if not isinstance(hash0, imagehash.ImageHash):
            continue

        # Iterate over a copy of the keys because duplicates are dropped
        # from the dictionary inside the loop.
        for remaining_fname in list(photo_dic):
            hash1 = photo_dic[remaining_fname]
            if not isinstance(hash1, imagehash.ImageHash):
                continue

            # Subtracting two image hashes gives the number of differing bits.
            if hash0 - hash1 < cutoff:
                print('images are similar', fname, remaining_fname)
                # Delete the duplicate file from disk; report, but do not
                # abort on, files that cannot be removed.
                try:
                    os.remove(remaining_fname)
                except OSError as err:
                    print('could not delete', remaining_fname, err)
                # Forget the duplicate so it is never used as a reference
                # photo and never deleted a second time.
                del photo_dic[remaining_fname]

# record total time taken
print('\n\n\nComparing photos', time.time()-start, '\n\n\n')