@datatalking
Forked from vinovator/checkDuplicates.py
Created July 28, 2021 17:48
Python script to find duplicate files from a folder
# DirectoryAnalyzer.py
# Python 3
"""
Given a folder, walk through all files within the folder and subfolders
and get a list of all files that are duplicates.
The md5 checksum for each file determines the duplicates.
"""
import os
import hashlib
from collections import defaultdict  # , Counter

src_folder = "../../"


def generate_md5(fname, chunk_size=4096):
    """
    Take a file name and return the md5 checksum of the file.
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the first chunk of the file
        chunk = f.read(chunk_size)
        # Keep reading until the end of the file, updating the hash
        while chunk:
            hash_md5.update(chunk)
            chunk = f.read(chunk_size)
    # Return the hex checksum
    return hash_md5.hexdigest()


if __name__ == "__main__":
    """
    Starting block of script
    """
    """
    extn = Counter()
    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        for each_file in files:
            extn[each_file.split(".")[-1]] += 1
    print(extn.most_common())
    """
    # The dict will have a list as values
    md5_dict = defaultdict(list)

    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html", "mp3",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]

    # Walk through all files and folders within the directory
    for path, dirs, files in os.walk(src_folder):
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable gets updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # Files sharing a checksum accumulate in the same list
                md5_dict[generate_md5(file_path)].append(file_path)

    # Identify keys (checksums) having more than one value (file names)
    duplicate_files = [
        val for key, val in md5_dict.items() if len(val) > 1]

    # Print the list of duplicate files
    print("Duplicate files list")
    for idx, file_group in enumerate(duplicate_files):
        print(idx + 1, file_group)
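
One caveat worth noting: md5 collisions are possible (vanishingly unlikely by accident, but cheap to rule out), so each group can be confirmed byte-by-byte before anything is acted on. A minimal sketch using the standard library's filecmp; the confirm_duplicates helper name is invented here, not part of the gist:

'''
import filecmp

def confirm_duplicates(file_group):
    """Return the members of file_group that are byte-identical to the first file."""
    reference = file_group[0]
    confirmed = [reference]
    for other in file_group[1:]:
        # shallow=False forces a full content comparison, not just an os.stat() check
        if filecmp.cmp(reference, other, shallow=False):
            confirmed.append(other)
    return confirmed

# Usage: verified = [confirm_duplicates(group) for group in duplicate_files]
'''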
@datatalking (Author)
Thinking of putting something like this in main so I can control the file types searched (documents, images, csv, py, etc.). That way I can pass in which file types I want to search for, rather than searching for everything.

'''
if __name__ == '__main__':
    Users = ['owner', 'machine_name']  # TODO: add multiple users
    screen_shot_path = "/Users/machine_name/Desktop/"
    pdf_path = "/Users/machine_name/Documents/"  # TODO: add .format(Users)
    empty_path = ""
    photo_files = ['.png', '.jpg', '.gif', '.tiff']
    doc_files = ['.doc', '.pdf']
    data_files = ['.txt', '.csv', '.tsv', '.xls', '.xlsx']
    main()
'''
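
Roughly, assuming the walk-and-hash logic above is wrapped into a find_duplicates(folder, extensions) helper (that name and main are invented here, reusing generate_md5 from the script), the parameterized version could look like:

'''
import os
from collections import defaultdict

def find_duplicates(folder, extensions):
    """Walk folder and group files with the given extensions by md5 checksum."""
    md5_dict = defaultdict(list)
    for path, dirs, files in os.walk(folder):
        for each_file in files:
            # splitext keeps the leading dot, matching lists like ['.png', '.jpg']
            if os.path.splitext(each_file)[1].lower() in extensions:
                file_path = os.path.join(os.path.abspath(path), each_file)
                md5_dict[generate_md5(file_path)].append(file_path)
    # Only checksums shared by more than one file are duplicates
    return [val for val in md5_dict.values() if len(val) > 1]

def main():
    photo_files = ['.png', '.jpg', '.gif', '.tiff']
    screen_shot_path = "/Users/machine_name/Desktop/"
    # Search only the requested file types, e.g. photos on the Desktop
    for idx, group in enumerate(find_duplicates(screen_shot_path, photo_files), 1):
        print(idx, group)

if __name__ == '__main__':
    main()
'''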
