Skip to content

Instantly share code, notes, and snippets.

@georgy7
Forked from ma11hew28/find-duplicate-files.rb
Last active March 8, 2017 18:09
Show Gist options
  • Select an option

  • Save georgy7/a8ab4d5a2e90282b189c to your computer and use it in GitHub Desktop.

Select an option

Save georgy7/a8ab4d5a2e90282b189c to your computer and use it in GitHub Desktop.

Revisions

  1. georgy7 revised this gist Apr 2, 2015. 1 changed file with 8 additions and 9 deletions.
    17 changes: 8 additions & 9 deletions find-duplicate-files.rb
    Original file line number Diff line number Diff line change
    @@ -50,15 +50,14 @@
    File.open(output, 'w') do |f|
    f.puts '['
    hash.each do |key, filename_array|
    if filename_array.length > 1
    record = {}
    record['files'] = filename_array
    record['md5'] = key

    f.puts ',' if counter > 0
    f.write JSON.pretty_generate(record)
    counter += 1
    end
    next if filename_array.length <= 1
    record = {}
    record['files'] = filename_array
    record['md5'] = key

    f.puts ',' if counter > 0
    f.write JSON.pretty_generate(record)
    counter += 1
    end
    f.puts "\n]"
    end
  2. georgy7 revised this gist Apr 2, 2015. 1 changed file with 11 additions and 11 deletions.
    22 changes: 11 additions & 11 deletions find-duplicate-files.rb
    Original file line number Diff line number Diff line change
    @@ -9,7 +9,7 @@
    # 2. Run the script without any arguments.
    # 3. Watch the progress.
    # 4. Get your dublicates.json file.
    #
    #
    # https://gist.github.com/georgy7/a8ab4d5a2e90282b189c
    # Forked from https://gist.github.com/mattdipasquale/571405
    # Dot (unix hidden) files and folders are ignored.
    @@ -19,45 +19,45 @@
    hash = {}

    output = 'dublicates.json'
    raise "#{output} already exists" if File.exists?(output)
    fail "#{output} already exists" if File.exist?(output)

    puts 'Exploring subdirectories. It may take a long time.'
    counter = 0

    Dir.glob("**/*").each do |filename|
    Dir.glob('**/*').each do |filename|
    next if File.directory?(filename)
    puts "Start!\n" if counter < 1

    begin
    key = Digest::MD5.file(filename).to_s
    if hash.has_key? key
    if hash.key? key
    hash[key].push filename
    else
    hash[key] = [filename]
    end
    rescue
    rescue
    puts "Error processing #{filename}"
    end
    counter = counter + 1

    counter += 1
    sleep(0.005 * rand)
    puts "#{counter} calculated (#{filename})." if 0 == counter % 1000
    end

    puts "\nWriting #{output}"
    counter = 0

    File.open(output,"w") do |f|
    File.open(output, 'w') do |f|
    f.puts '['
    hash.each do |key, filename_array|
    if filename_array.length > 1
    record = {}
    record['files'] = filename_array
    record['files'] = filename_array
    record['md5'] = key

    f.puts "," if counter > 0
    f.puts ',' if counter > 0
    f.write JSON.pretty_generate(record)
    counter = counter + 1
    counter += 1
    end
    end
    f.puts "\n]"
  3. georgy7 revised this gist Apr 2, 2015. 1 changed file with 21 additions and 7 deletions.
    28 changes: 21 additions & 7 deletions find-duplicate-files.rb
    Original file line number Diff line number Diff line change
    @@ -1,18 +1,24 @@
    #! /usr/bin/ruby

    require 'rubygems'
    require 'digest/md5'
    require 'json'

    # Usage:
    # 1. Locate a folder in console.
    # 1. In a console, change to the folder in which you want to search for duplicates.
    # 2. Run the script without any arguments.
    # 3. Watch the progress.
    # 4. Get your dublicates.txt file.
    # 4. Get your dublicates.json file.
    #
    # https://gist.github.com/georgy7/a8ab4d5a2e90282b189c
    # Forked from https://gist.github.com/mattdipasquale/571405
    # Dot (unix hidden) files and folders are ignored.
    # Warning: This script is *very* IO intensive. It can make your PC unresponsive.
    # It's provided 'as-is', without any express or implied warranty, etc.

    hash = {}

    output = 'dublicates.txt'
    output = 'dublicates.json'
    raise "#{output} already exists" if File.exists?(output)

    puts 'Exploring subdirectories. It may take a long time.'
    @@ -38,15 +44,23 @@
    puts "#{counter} calculated (#{filename})." if 0 == counter % 1000
    end

    puts "\nWriting of output file..."
    puts "\nWriting #{output}"
    counter = 0

    File.open(output,"w") do |f|
    hash.each_value do |filename_array|
    f.puts '['
    hash.each do |key, filename_array|
    if filename_array.length > 1
    f.puts "=== Identical Files ===\n"
    filename_array.each { |filename| f.puts ' '+filename }
    record = {}
    record['files'] = filename_array
    record['md5'] = key

    f.puts "," if counter > 0
    f.write JSON.pretty_generate(record)
    counter = counter + 1
    end
    end
    f.puts "\n]"
    end

    puts "Done.\n"
  4. georgy7 revised this gist Apr 2, 2015. 1 changed file with 7 additions and 9 deletions.
    16 changes: 7 additions & 9 deletions find-duplicate-files.rb
    Original file line number Diff line number Diff line change
    @@ -15,17 +15,16 @@
    output = 'dublicates.txt'
    raise "#{output} already exists" if File.exists?(output)

    puts 'Exploring subdirectories, calculating MD5...'
    puts 'Exploring subdirectories. It may take a long time.'
    counter = 0

    Dir.glob("**/*").each do |filename|
    next if File.directory?(filename)
    # puts 'Checking ' + filename
    puts "Start!\n" if counter < 1

    begin
    key = Digest::MD5.file(filename).to_s
    if hash.has_key? key
    # puts "same file #{filename}"
    hash[key].push filename
    else
    hash[key] = [filename]
    @@ -36,19 +35,18 @@

    counter = counter + 1
    sleep(0.005 * rand)
    if 0 == counter % 1000
    puts "#{counter} calculated (#{filename})."
    sleep(0.5 * rand)
    end
    puts "#{counter} calculated (#{filename})." if 0 == counter % 1000
    end

    puts 'Exploring finished.'
    puts "\nWriting of output file..."

    File.open(output,"w") do |f|
    hash.each_value do |filename_array|
    if filename_array.length > 1
    f.puts "=== Identical Files ===\n"
    filename_array.each { |filename| puts ' '+filename }
    filename_array.each { |filename| f.puts ' '+filename }
    end
    end
    end

    puts "Done.\n"
  5. georgy7 revised this gist Apr 2, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion find-duplicate-files.rb
    Original file line number Diff line number Diff line change
    @@ -35,7 +35,7 @@
    end

    counter = counter + 1
    sleep(0.01 * rand)
    sleep(0.005 * rand)
    if 0 == counter % 1000
    puts "#{counter} calculated (#{filename})."
    sleep(0.5 * rand)
  6. georgy7 revised this gist Apr 2, 2015. 1 changed file with 42 additions and 11 deletions.
    53 changes: 42 additions & 11 deletions find-duplicate-files.rb
    Original file line number Diff line number Diff line change
    @@ -1,23 +1,54 @@
    require 'digest/md5'

    # Usage:
    # 1. Locate a folder in console.
    # 2. Run the script without any arguments.
    # 3. Watch the progress.
    # 4. Get your dublicates.txt file.
    #
    # Forked from https://gist.github.com/mattdipasquale/571405
    # Dot (unix hidden) files and folders are ignored.
    # Warning: This script is *very* IO intensive. It can make your PC unresponsive.

    hash = {}

    Dir.glob("**/*", File::FNM_DOTMATCH).each do |filename|
    output = 'dublicates.txt'
    raise "#{output} already exists" if File.exists?(output)

    puts 'Exploring subdirectories, calculating MD5...'
    counter = 0

    Dir.glob("**/*").each do |filename|
    next if File.directory?(filename)
    # puts 'Checking ' + filename

    key = Digest::MD5.hexdigest(IO.read(filename)).to_sym
    if hash.has_key? key
    # puts "same file #{filename}"
    hash[key].push filename
    else
    hash[key] = [filename]
    begin
    key = Digest::MD5.file(filename).to_s
    if hash.has_key? key
    # puts "same file #{filename}"
    hash[key].push filename
    else
    hash[key] = [filename]
    end
    rescue
    puts "Error processing #{filename}"
    end

    counter = counter + 1
    sleep(0.01 * rand)
    if 0 == counter % 1000
    puts "#{counter} calculated (#{filename})."
    sleep(0.5 * rand)
    end
    end

    hash.each_value do |filename_array|
    if filename_array.length > 1
    puts "=== Identical Files ===\n"
    filename_array.each { |filename| puts ' '+filename }
    puts 'Exploring finished.'

    File.open(output,"w") do |f|
    hash.each_value do |filename_array|
    if filename_array.length > 1
    f.puts "=== Identical Files ===\n"
    filename_array.each { |filename| puts ' '+filename }
    end
    end
    end
  7. @ma11hew28 ma11hew28 created this gist Sep 9, 2010.
    23 changes: 23 additions & 0 deletions find-duplicate-files.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,23 @@
    require 'digest/md5'

    # Finds files with identical content below the current working directory
    # (dot files and dot folders included, because of File::FNM_DOTMATCH) and
    # prints every group of two or more identical files to standard output.
    #
    # Files are compared by the MD5 digest of their content only; names, sizes
    # and timestamps are not consulted.

    # Maps an MD5 hex digest (String) to the list of paths having that digest.
    # The default block creates a separate array for each new digest.
    hash = Hash.new { |groups, digest| groups[digest] = [] }

    Dir.glob('**/*', File::FNM_DOTMATCH).each do |filename|
      # Hash regular files only. This skips directories as before, and also
      # broken symlinks (which cannot be read) and FIFOs/devices (on which a
      # read could block indefinitely).
      next unless File.file?(filename)

      begin
        # Digest::MD5.file feeds the file to the digest without building one
        # big String of the whole content, unlike IO.read. The digest is kept
        # as a String: interning one Symbol per file has no benefit.
        key = Digest::MD5.file(filename).hexdigest
      rescue SystemCallError, IOError => e
        # One unreadable file must not abort the whole scan. The message goes
        # to stderr so that stdout keeps carrying only the duplicate report.
        warn "Error processing #{filename}: #{e.message}"
        next
      end

      hash[key] << filename
    end

    hash.each_value do |filename_array|
      # A digest seen for a single file is not a duplicate group.
      next if filename_array.length < 2

      puts '=== Identical Files ==='
      filename_array.each { |name| puts " #{name}" }
    end