@datatalking
Forked from vinovator/checkDuplicates.py
Created July 28, 2021 17:48
Python script to find duplicate files from a folder
# DirectoryAnalyzer.py
# Python 3
"""
Given a folder, walk through all files within the folder and subfolders
and get a list of all files that are duplicates.
The md5 checksum for each file determines the duplicates.
"""
import os
import hashlib
from collections import defaultdict  # , Counter

src_folder = "../../"


def generate_md5(fname, chunk_size=4096):
    """
    Take a file name and return the md5 checksum of the file.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first chunk of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash
        while chunk:
            hash_md5.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return hash_md5.hexdigest()


if __name__ == "__main__":
    """
    Starting block of script
    """
    """
    extn = Counter()
    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        for each_file in files:
            extn[each_file.split(".")[-1]] += 1
    print(extn.most_common())
    """
    # The dict will have a list as values
    md5_dict = defaultdict(list)

    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html", "mp3",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]

    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable gets updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files sharing a checksum accumulate in the same list
                md5_dict[generate_md5(file_path)].append(file_path)

    # Identify keys (checksums) having more than one value (file names)
    duplicate_files = [
        val for key, val in md5_dict.items() if len(val) > 1]

    # Print the list of duplicate files
    print("Duplicate files list")
    for idx, file_group in enumerate(duplicate_files):
        print(idx + 1, file_group)
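
One caveat worth noting: md5 collisions are possible (vanishingly unlikely by accident, but cheap to rule out), so each group can be confirmed byte-by-byte before anything is acted on. A minimal sketch using the standard library's filecmp; the confirm_duplicates helper name is invented here, not part of the gist:

'''
import filecmp

def confirm_duplicates(file_group):
    """Return the members of file_group that are byte-identical to the first file."""
    reference = file_group[0]
    confirmed = [reference]
    for other in file_group[1:]:
        # shallow=False forces a full content comparison, not just an os.stat() check
        if filecmp.cmp(reference, other, shallow=False):
            confirmed.append(other)
    return confirmed

# Usage: verified = [confirm_duplicates(group) for group in duplicate_files]
'''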
@datatalking (Author)
Thinking of putting something like this in main so I can control the file types searched (documents, images, csv, py, etc.). That way I can pass in which file types I want to search for, rather than searching for everything.

'''
if __name__ == '__main__':
    Users = ['owner', 'machine_name']  # TODO: add multiple users
    screen_shot_path = "/Users/machine_name/Desktop/"
    pdf_path = "/Users/machine_name/Documents/"  # TODO: add .format(Users)
    empty_path = ""
    photo_files = ['.png', '.jpg', '.gif', '.tiff']
    doc_files = ['.doc', '.pdf']
    data_files = ['.txt', '.csv', '.tsv', '.xls', '.xlsx']
    main()
'''
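
Roughly, assuming the walk-and-hash logic above is wrapped into a find_duplicates(folder, extensions) helper (that name and main are invented here, reusing generate_md5 from the script), the parameterized version could look like:

'''
import os
from collections import defaultdict

def find_duplicates(folder, extensions):
    """Walk folder and group files with the given extensions by md5 checksum."""
    md5_dict = defaultdict(list)
    for path, dirs, files in os.walk(folder):
        for each_file in files:
            # splitext keeps the leading dot, matching lists like ['.png', '.jpg']
            if os.path.splitext(each_file)[1].lower() in extensions:
                file_path = os.path.join(os.path.abspath(path), each_file)
                md5_dict[generate_md5(file_path)].append(file_path)
    # Only checksums shared by more than one file are duplicates
    return [val for val in md5_dict.values() if len(val) > 1]

def main():
    photo_files = ['.png', '.jpg', '.gif', '.tiff']
    screen_shot_path = "/Users/machine_name/Desktop/"
    # Search only the requested file types, e.g. photos on the Desktop
    for idx, group in enumerate(find_duplicates(screen_shot_path, photo_files), 1):
        print(idx, group)

if __name__ == '__main__':
    main()
'''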
