@datatalking
Forked from vinovator/checkDuplicates.py
Created July 28, 2021 17:48

Revisions

  1. @vinovator renamed this gist Feb 10, 2016. 1 changed file with 21 additions and 22 deletions.
    43 changes: 21 additions & 22 deletions DirectoryAnalyzer.py → checkDuplicates.py
@@ -1,4 +1,4 @@
-# DirectoryAnalyzer.py
+# checkDuplicates.py
 # Python 2.7.6
 
 """
@@ -9,12 +9,13 @@
 
 import os
 import hashlib
-from collections import defaultdict # , Counter
+from collections import defaultdict
+import csv
 
 src_folder = "../../"
 
 
-def generate_md5(fname, chunk_size=4096):
+def generate_md5(fname, chunk_size=1024):
     """
     Function which takes a file name and returns md5 checksum of the file
     """
@@ -36,26 +37,16 @@ def generate_md5(fname, chunk_size=4096):
     Starting block of script
     """
 
-    """
-    extn = Counter()
-    # Walk through all files and folders within directory
-    for path, dirs, files in os.walk(src_folder):
-        for each_file in files:
-            extn[each_file.split(".")[-1]] += 1
-    print extn.most_common()
-    """
-    # The dict will hae a list as values
+    # The dict will have a list as values
     md5_dict = defaultdict(list)
 
-    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html", "mp3",
+    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html",
                           "mp4", "jpg", "png", "xls", "xlsx", "xml",
                           "vsd", "py", "json"]
 
     # Walk through all files and folders within directory
     for path, dirs, files in os.walk(src_folder):
+        print("Analyzing {}".format(path))
         for each_file in files:
             if each_file.split(".")[-1].lower() in file_types_inscope:
                 # The path variable gets updated for each subfolder
@@ -64,10 +55,18 @@ def generate_md5(fname, chunk_size=4096):
                 md5_dict[generate_md5(file_path)].append(file_path)
 
     # Identify keys (checksum) having more than one values (file names)
-    duplicate_files = [
-        val for key, val in md5_dict.items() if len(val) > 1]
+    duplicate_files = (
+        val for key, val in md5_dict.items() if len(val) > 1)
 
-    # Print the list of duplicate files
-    print("Duplicate files list")
-    for idx, file in enumerate(duplicate_files):
-        print idx + 1, file
+    # Write the list of duplicate files to csv file
+    with open("duplicates.csv", "w") as log:
+        # Lineterminator added for windows as it inserts blank rows otherwise
+        csv_writer = csv.writer(log, quoting=csv.QUOTE_MINIMAL, delimiter=",",
+                                lineterminator="\n")
+        header = ["File Names"]
+        csv_writer.writerow(header)
+
+        for file_name in duplicate_files:
+            csv_writer.writerow(file_name)
+
+    print("Done")
  2. @vinovator created this gist Feb 10, 2016.
    73 changes: 73 additions & 0 deletions DirectoryAnalyzer.py
@@ -0,0 +1,73 @@
# DirectoryAnalyzer.py
# Python 2.7.6

"""
Given a folder, walk through all files within the folder and subfolders
and get list of all files that are duplicates
The md5 checcksum for each file will determine the duplicates
"""

import os
import hashlib
from collections import defaultdict # , Counter

src_folder = "../../"


def generate_md5(fname, chunk_size=4096):
    """
    Function which takes a file name and returns md5 checksum of the file
    """
    hash = hashlib.md5()
    with open(fname, "rb") as f:
        # Read the 1st block of the file
        chunk = f.read(chunk_size)
        # Keep reading the file until the end and update hash
        while chunk:
            hash.update(chunk)
            chunk = f.read(chunk_size)

    # Return the hex checksum
    return hash.hexdigest()


if __name__ == "__main__":
    """
    Starting block of script
    """

    """
    extn = Counter()
    # Walk through all files and folders within directory
    for path, dirs, files in os.walk(src_folder):
        for each_file in files:
            extn[each_file.split(".")[-1]] += 1
    print extn.most_common()
    """
    # The dict will hae a list as values
    md5_dict = defaultdict(list)

    file_types_inscope = ["ppt", "pptx", "pdf", "txt", "html", "mp3",
                          "mp4", "jpg", "png", "xls", "xlsx", "xml",
                          "vsd", "py", "json"]

    # Walk through all files and folders within directory
    for path, dirs, files in os.walk(src_folder):
        for each_file in files:
            if each_file.split(".")[-1].lower() in file_types_inscope:
                # The path variable gets updated for each subfolder
                file_path = os.path.join(os.path.abspath(path), each_file)
                # If there are more files with same checksum append to list
                md5_dict[generate_md5(file_path)].append(file_path)

    # Identify keys (checksum) having more than one values (file names)
    duplicate_files = [
        val for key, val in md5_dict.items() if len(val) > 1]

    # Print the list of duplicate files
    print("Duplicate files list")
    for idx, file in enumerate(duplicate_files):
        print idx + 1, file
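
As a standalone sanity check of the approach in the original file (walk a directory tree, hash each file in chunks with MD5, group paths by checksum), the sketch below builds two identical files plus one different file in a temporary directory and confirms the identical pair lands in the same group. It is not part of the gist; the helper mirrors generate_md5, and the demo files are invented for illustration.

import hashlib
import os
import tempfile
from collections import defaultdict


def generate_md5(fname, chunk_size=4096):
    # Hash the file in fixed-size chunks so only chunk_size bytes
    # are held in memory at a time, even for very large files.
    md5 = hashlib.md5()
    with open(fname, "rb") as f:
        chunk = f.read(chunk_size)
        while chunk:
            md5.update(chunk)
            chunk = f.read(chunk_size)
    return md5.hexdigest()


if __name__ == "__main__":
    demo_dir = tempfile.mkdtemp()
    # Two identical files and one different file.
    for name, payload in [("a.txt", b"same bytes"),
                          ("b.txt", b"same bytes"),
                          ("c.txt", b"different bytes")]:
        with open(os.path.join(demo_dir, name), "wb") as f:
            f.write(payload)

    # Group absolute paths by checksum, exactly as the script does.
    md5_dict = defaultdict(list)
    for path, dirs, files in os.walk(demo_dir):
        for each_file in files:
            file_path = os.path.join(os.path.abspath(path), each_file)
            md5_dict[generate_md5(file_path)].append(file_path)

    duplicates = [paths for paths in md5_dict.values() if len(paths) > 1]
    print(duplicates)  # expect a single group containing a.txt and b.txt

Reading in chunks keeps memory use flat regardless of file size, which is the point of the chunked generate_md5 helper.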