|
|
@@ -0,0 +1,397 @@
import asyncdispatch
import asynctools
import docopt
import json
import nre
import os
import ospaths
import sequtils
import sha256/sha256sum
import strutils
import tables
import threadpool
import times

# ###
# Constants

const FINGERPRINT_MATCH_THRESHOLD = 0.92
const FINGERPRINT_MATCH_OFFSET = 80
const FINGERPRINT_RELEVANT_BITS = uint32(0xFFFFFF00)
const MAX_THREADS = 4
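
# How these constants are used below: FINGERPRINT_MATCH_THRESHOLD is the
# minimum similarity score from match_fingerprints3() for two songs to be
# treated as duplicates, FINGERPRINT_MATCH_OFFSET is the maximum alignment
# offset passed to that comparison, and FINGERPRINT_RELEVANT_BITS masks off
# the low byte of each 32-bit fingerprint term - presumably so that nearly
# identical terms index into the same AIDX bucket.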

# ###
# Handle CTRL-C

proc chandler() {.noconv.} =
    setupForeignThreadGc()
    echo "\nExiting"
    quit(1)
setControlCHook(chandler)

# ###
# Maximum number of threads
setMaxPoolSize(MAX_THREADS)

# ###
# FFprobe CLI

#let FFPROBE = ["-hide_banner", "-of", "json", "-v", "quiet", "-show_format", "-show_entries", "format=filename,duration:format_tags=title,artist,album_artist,composer,album,Acoustid Id,MusicBrainz Release Track Id"]
let FFFORMAT = [".mp3", ".m4a", ".ogg", ".flac"]

# ###
# Command line arguments

const DOC = """
Automatic duplicate file finder

Usage:
    autodup [options] <sourcedir> [<dupdir>]

Options:
    -h --help     Show this help

Search
    -D            Search for duplicate files
    -E            Search for empty directories
    -M            Search for duplicate music files (requires fpcalc)

Filters
    -f            Include files only
    -d            Include directories only
    -p <pattern>  Include files / directories containing pattern
    -P <regex>    Include files / directories containing regex
    -s <fsize>    Include size greater than (in bytes)
    -S <fsize>    Include size less than (in bytes)
    -t <time>     Include last modified after (in days)
    -T <time>     Include last modified before (in days)

Actions
    -m            Move search results
    -x            Delete search results
    -q            Quiet - don't display results
"""

var ARGS {.threadvar.}: Table[string, Value]
ARGS = docopt(DOC)
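
# docopt() returns a Table[string, Value]. Arguments and options that were not
# supplied stringify to "nil", which is why the flag handling below checks
# `$ARGS[...] != "nil"` before using a value.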

proc getintflag(flag: string): int =
    try:
        return parseInt($ARGS[flag])
    except:
        echo "Bad integer input for " & flag & ": " & $ARGS[flag]
        quit(1)

# Flags
var SOURCEDIR = "."
if $ARGS["<sourcedir>"] != "nil":
    SOURCEDIR = $ARGS["<sourcedir>"]

var DUPDIR {.threadvar.}: string
DUPDIR = "duplicates"
if $ARGS["<dupdir>"] != "nil":
    DUPDIR = $ARGS["<dupdir>"]

var PATTERN = ""
if $ARGS["-p"] != "nil":
    PATTERN = $ARGS["-p"]

var REGEX = ""
if $ARGS["-P"] != "nil":
    REGEX = $ARGS["-P"]

var FMINSIZE = 0
if $ARGS["-s"] != "nil":
    FMINSIZE = getintflag("-s")

var FMAXSIZE = 0
if $ARGS["-S"] != "nil":
    FMAXSIZE = getintflag("-S")

var TIMEAFTER = 0
if $ARGS["-t"] != "nil":
    TIMEAFTER = getintflag("-t")

var TIMEBEFORE = 0
if $ARGS["-T"] != "nil":
    TIMEBEFORE = getintflag("-T")

var FILES_ONLY = false
if ARGS["-D"] or ARGS["-M"] or ARGS["-f"]:
    FILES_ONLY = true

var DIRS_ONLY = false
if ARGS["-E"] or ARGS["-d"]:
    DIRS_ONLY = true

var ADD: BiggestInt = 0
var MATCH_COUNT = 0
var FILE_COUNT = 0
var DIR_COUNT = 0

# ###
# Tables

type
    FileSize = object
        first: int
        hashes: TableRef[string, int]

# [fileindex: filename]
var FILES: seq[string] = @[]

# {filesize: FileSize object}
var SIZES: TableRef[BiggestInt, FileSize] = newTable[BiggestInt, FileSize]()

# [fileindex: [aidx1, aidx2...]]
var SONGS {.threadvar.}: TableRef[int, seq[uint32]]

# {aidx: [fileindex1, fileindex2]}
var AIDX {.threadvar.}: TableRef[uint32, seq[int]]

SONGS = newTable[int, seq[uint32]]()
AIDX = newTable[uint32, seq[int]]()
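
# SONGS and AIDX form a small inverted index for music matching: SONGS maps a
# file index to its list of masked fingerprint terms, while AIDX maps each
# masked term back to the file indices that contain it. Only files sharing at
# least one term are ever compared with the full fingerprint comparison.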

# ###
# Actions
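
# Every match flows through action(): by default the file is only reported,
# -m moves it under DUPDIR (re-creating the tail of its original path) and -x
# deletes it. Moves and deletes are spawned on the thread pool so the scan is
# not blocked on filesystem work.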

proc moveaction(file, dupdir: string) =
    var dest = dupdir & DirSep & tailDir(file)

    try:
        createDir(parentDir(dest))
        moveFile(file, dest)
    except:
        echo "Already exists " & dest

proc removeaction(file: string, info: FileInfo) =
    if info.kind == pcFile:
        if not tryRemoveFile(file):
            echo "Failed to remove " & file
    elif info.kind == pcDir:
        try:
            removeDir(file)
        except:
            echo "Failed to remove dir " & file

proc action(file: string, info: FileInfo, orig = "") =
    if ARGS["-m"]:
        spawn moveaction(file, DUPDIR)
        stdout.write("Moving ")
    elif ARGS["-x"]:
        spawn removeaction(file, info)
        stdout.write("Removing ")

    ADD += info.size
    MATCH_COUNT += 1

    if not ARGS["-q"]:
        echo file
        if orig != "":
            echo " == " & orig

# ###
# Helpers

proc gethash(file: string): Future[string] {.async.} =
    let hash = spawn sha256sum(file)
    while not hash.isReady():
        await sleepAsync(5)

    return ^hash
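
# gethash() offloads the SHA-256 computation to the thread pool and polls the
# resulting FlowVar with sleepAsync() so the async dispatcher keeps servicing
# other files while a hash is being computed.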

# ###
# Search
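
# Duplicate detection runs in two stages to avoid hashing every file: files
# are first grouped by size, and hashes are only computed once a second file
# with the same size shows up (the first file of that size is hashed lazily at
# that point).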

proc finddup(idx: int, info: FileInfo) {.async.} =
    if SIZES.has_key(info.size):
        # Size seen before
        let hash = await gethash(FILES[idx])
        if SIZES[info.size].hashes != nil:
            # Hashes initialized
            if SIZES[info.size].hashes.has_key(hash):
                # Current hash seen before
                action(FILES[idx], info, FILES[SIZES[info.size].hashes[hash]])
            else:
                # Unique hash
                SIZES[info.size].hashes[hash] = idx
        else:
            # Hashes not initialized
            let fhash = await gethash(FILES[SIZES[info.size].first])
            SIZES[info.size].hashes = newTable[string, int]()
            SIZES[info.size].hashes[fhash] = SIZES[info.size].first
            if fhash == hash:
                # Current hash same as first hash for size
                action(FILES[idx], info, FILES[SIZES[info.size].first])
            else:
                # Unique hash
                SIZES[info.size].hashes[hash] = idx
    else:
        # First file seen with this size
        SIZES[info.size] = FileSize(first: idx, hashes: nil)

{.compile: "pg_acoustid/acoustid_compare.c".}
#~ proc match_fingerprints(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint): cfloat {.importc, cdecl, gcsafe.}
#~ proc match_fingerprints2(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint, maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}
proc match_fingerprints3(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint, maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}
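
# Fingerprint comparison is done in C: acoustid_compare.c is compiled in via
# the {.compile.} pragma above and match_fingerprints3() is imported here. It
# appears to return a similarity score in the 0..1 range for two raw
# fingerprints, trying alignments up to maxoffset terms apart. The raw
# fingerprints themselves come from Chromaprint's fpcalc tool, which fpcalc()
# below runs with -json -raw and parses for its "fingerprint" array.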

proc fpcalc(file: string): Future[JsonNode] {.async, inline.} =
    let args = @["-json", "-raw", file]
    let data = await execProcess("fpcalc.exe", args=args, options={poUsePath})

    var jdata: JsonNode
    try:
        jdata = parseJson(data.output)
    except:
        echo "Bad fingerprint: $#" % file
        return nil

    return jdata
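
# acoustid_compare() copies both fingerprints into manually managed shared
# buffers (createSharedU) before spawning the C comparison on the thread pool,
# presumably because the GC-managed seqs should not be handed to another
# thread directly. The buffers are freed once the FlowVar is ready.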

proc acoustid_compare(idx, id: int): Future[float] {.async.} =
    var filedata = createSharedU(uint32, SONGS[idx].len())
    var fdata = createSharedU(uint32, SONGS[id].len())

    filedata.copyMem(addr SONGS[idx][0], SONGS[idx].len() * sizeof(uint32))
    fdata.copyMem(addr SONGS[id][0], SONGS[id].len() * sizeof(uint32))

    let match = spawn match_fingerprints3(filedata, cint(SONGS[idx].len()), fdata, cint(SONGS[id].len()), FINGERPRINT_MATCH_OFFSET)
    while not match.isReady():
        await sleepAsync(5)

    filedata.freeShared()
    fdata.freeShared()

    return ^match
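
# findmusicdup() fingerprints a file, masks each 32-bit term with
# FINGERPRINT_RELEVANT_BITS and looks the terms up in AIDX to find candidate
# files that share at least one term. Candidates are compared with
# acoustid_compare() and anything above FINGERPRINT_MATCH_THRESHOLD is treated
# as a duplicate; otherwise the file's terms are registered in AIDX for future
# comparisons.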

proc findmusicdup(idx: int, info: FileInfo) {.async.} =
    let jdata = await fpcalc(FILES[idx])
    if jdata == nil:
        return

    SONGS[idx] = @[]
    for i in jdata{"fingerprint"}.items:
        let aidx: uint32 = uint32(i.getNum()) and FINGERPRINT_RELEVANT_BITS
        SONGS[idx].add(aidx)

    var compare: seq[int] = @[]
    for aidx in SONGS[idx].deduplicate():
        if AIDX.has_key(aidx):
            for id in AIDX[aidx]:
                # Don't compare the same two files multiple times
                if not compare.contains(id):
                    let match = await acoustid_compare(idx, id)
                    if match > FINGERPRINT_MATCH_THRESHOLD:
                        action(FILES[idx], info, FILES[id])
                        SONGS.del(idx)
                        return
                    else:
                        compare.add(id)

    # Not a duplicate - add to the AIDX index for future comparisons
    for aidx in SONGS[idx].deduplicate():
        if AIDX.has_key(aidx):
            if not AIDX[aidx].contains(idx):
                AIDX[aidx].add(idx)
        else:
            AIDX[aidx] = @[idx]

proc findempty(dir: string, info: FileInfo) =
    var empty = true
    for sf in walkPattern(dir & DirSep & "*"):
        empty = false
        break
    if empty:
        action(dir, info)

# ###
# Scan
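
# recurse() walks the tree depth-first, counts files and directories, applies
# the pattern / regex / size / mtime filters, and then hands each entry to
# finddup(), findmusicdup(), findempty() or the plain action() depending on
# the mode. After each directory it waits for outstanding thread pool work
# (sync) and drains the async dispatcher before printing a one-line progress
# summary.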

proc recurse(dir: string) =
    let now = getTime()
    let after = initInterval(days=TIMEAFTER)
    let before = initInterval(days=TIMEBEFORE)

    for file in walkPattern(dir & DirSep & "*"):
        var info: FileInfo
        try:
            info = getFileInfo(file)
        except:
            continue

        if info.kind == pcFile:
            FILE_COUNT += 1

            # Skip files
            if DIRS_ONLY:
                continue
        elif info.kind == pcDir:
            DIR_COUNT += 1

            recurse(file)

            # Skip directories
            if FILES_ONLY:
                continue

        # Skip if name doesn't match pattern
        let (_, name, ext) = splitFile(file)
        if PATTERN != "":
            if not (name & ext).contains(PATTERN):
                continue

        if REGEX != "":
            if not (name & ext).contains(re(REGEX)):
                continue

        # Skip if smaller than -s
        if $ARGS["-s"] != "nil":
            if info.size < FMINSIZE:
                continue

        # Skip if larger than -S
        if $ARGS["-S"] != "nil":
            if info.size > FMAXSIZE:
                continue

        # Skip if older than -t days
        if $ARGS["-t"] != "nil":
            if info.lastWriteTime < now - after:
                continue

        # Skip if newer than -T days
        if $ARGS["-T"] != "nil":
            if info.lastWriteTime > now - before:
                continue

        if info.kind == pcFile:
            # Don't process the same file multiple times
            if not FILES.contains(file):
                FILES.add(file)

                if ARGS["-D"]:
                    asyncCheck finddup(FILES.len()-1, info)
                elif ARGS["-M"]:
                    if file.splitFile().ext.toLowerAscii() in FFFORMAT:
                        asyncCheck findmusicdup(FILES.len()-1, info)
                else:
                    action(file, info)
        elif info.kind == pcDir:
            if ARGS["-E"]:
                findempty(file, info)
            else:
                action(file, info)

    sync()
    try:
        runForever()
    except:
        discard

    if not ARGS["-q"]:
        stdout.write("$# matches: $# MB / $# dirs, $# files\r" % [$MATCH_COUNT, formatFloat(float(ADD)/1024/1024, ffDecimal, 2), $DIR_COUNT, $FILE_COUNT])
        stdout.flushFile()

# ###
# Main

recurse(SOURCEDIR)
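
# Illustrative invocations (based on the usage string above):
#   autodup -D ~/Downloads           - report duplicate files under ~/Downloads
#   autodup -D -m ~/Downloads dupes  - move duplicates into ./dupes
#   autodup -M ~/Music               - find duplicate songs by acoustic fingerprint
#   autodup -E -x ~/Downloads        - delete empty directories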