Skip to content

Instantly share code, notes, and snippets.

@zacharycarter
Forked from genotrance/autodup.nim
Created October 27, 2018 00:50
Show Gist options
  • Select an option

  • Save zacharycarter/d02a66cd0cdafb5d484a007219015a3d to your computer and use it in GitHub Desktop.

Select an option

Save zacharycarter/d02a66cd0cdafb5d484a007219015a3d to your computer and use it in GitHub Desktop.

Revisions

  1. @genotrance genotrance created this gist Apr 19, 2018.
    397 changes: 397 additions & 0 deletions autodup.nim
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,397 @@
    import asyncdispatch
    import asynctools
    import docopt
    import json
    import nre
    import os
    import ospaths
    import sequtils
    import sha256/sha256sum
    import strutils
    import tables
    import threadpool
    import times

    # ###
    # Constants
    const
      # A comparison score above this value marks two songs as duplicates
      FINGERPRINT_MATCH_THRESHOLD = 0.92
      # Passed as the `maxoffset` argument of the C fingerprint matcher
      FINGERPRINT_MATCH_OFFSET = 80
      # Mask applied to every raw fingerprint value before it is indexed
      FINGERPRINT_RELEVANT_BITS = 0xFFFFFF00'u32
      # Upper bound handed to the threadpool
      MAX_THREADS = 4

    # ###
    # Handle CTRL-C

    proc chandler() {.noconv.} =
      ## CTRL-C hook: announces the exit and terminates with status 1.
      # Set up GC state for the calling thread before touching strings
      setupForeignThreadGc()
      echo "\nExiting"
      quit(QuitFailure)

    # Install the hook for the whole process
    setControlCHook chandler

    # ##
    # Cap the threadpool (used by every `spawn` in this file) at MAX_THREADS
    setMaxPoolSize(MAX_THREADS)

    # ###
    # FFprobe CLI

    #let FFPROBE = ["-hide_banner", "-of", "json", "-v", "quiet", "-show_format", "-show_entries", "format=filename,duration:format_tags=title,artist,album_artist,composer,album,Acoustid Id,MusicBrainz Release Track Id"]
    # Lower-case file extensions that the -M music search will fingerprint;
    # files with any other extension are ignored in that mode
    let FFFORMAT = [".mp3", ".m4a", ".ogg", ".flac"]

    # ###
    # Command line arguments

    # Usage text; docopt(DOC) below parses the command line against it
    const DOC = """
    Automatic duplicate file finder
    Usage:
    autodup [options] <sourcedir> [<dupdir>]
    Options:
    -h --help
    Search
    -D Search for duplicate files
    -E Search for empty directories
    -M Search for duplicate music files (requires fpcalc)
    Filters
    -f Include files only
    -d Include directories only
    -p <pattern> Include files / directories containing pattern
    -P <regex> Include files / directories containing regex
    -s <fsize> Include size greater than (in bytes)
    -S <fsize> Include size lesser than (in bytes)
    -t <time> Include last modified after (in days)
    -T <time> Include last modified before (in days)
    Actions
    -m Move search results
    -x Delete search results
    -q Quiet - don't display results
    """

    # Parsed command line, looked up by the option / argument spelling used in
    # DOC (e.g. ARGS["-p"], ARGS["<sourcedir>"]). Declared as a threadvar and
    # assigned in a separate statement.
    var ARGS {.threadvar.}: Table[string, Value]
    ARGS = docopt(DOC)

    proc getintflag(flag: string): int =
      ## Returns the value of command-line option `flag` parsed as an integer.
      ##
      ## When the value is not a valid integer, prints a message naming the
      ## flag and the offending text, then exits with status 1.
      let raw = $ARGS[flag]
      try:
        result = parseInt(raw)
      except ValueError:
        # Only a parse failure is a "bad input"; anything else should surface
        echo "Bad integer input for " & flag & ": " & raw
        quit(1)

    # Flags
    # Each lookup below treats the string "nil" as "argument not supplied"
    # and falls back to a default in that case.

    # Directory to scan
    let SOURCEDIR =
      if $ARGS["<sourcedir>"] != "nil": $ARGS["<sourcedir>"]
      else: "."

    # Destination root for -m (move); threadvar, so assigned after declaration
    var DUPDIR {.threadvar.}: string
    DUPDIR = if $ARGS["<dupdir>"] != "nil": $ARGS["<dupdir>"] else: "duplicates"

    # Substring filter (-p) and regex filter (-P); empty means "no filter"
    let PATTERN =
      if $ARGS["-p"] != "nil": $ARGS["-p"]
      else: ""

    let REGEX =
      if $ARGS["-P"] != "nil": $ARGS["-P"]
      else: ""

    # Size bounds in bytes (-s / -S)
    let FMINSIZE =
      if $ARGS["-s"] != "nil": getintflag("-s")
      else: 0

    let FMAXSIZE =
      if $ARGS["-S"] != "nil": getintflag("-S")
      else: 0

    # Modification-time bounds in days (-t / -T)
    let TIMEAFTER =
      if $ARGS["-t"] != "nil": getintflag("-t")
      else: 0

    let TIMEBEFORE =
      if $ARGS["-T"] != "nil": getintflag("-T")
      else: 0

    # The duplicate searches (-D, -M) imply files-only, just like -f
    let FILES_ONLY: bool = ARGS["-D"] or ARGS["-M"] or ARGS["-f"]

    # The empty-directory search (-E) implies directories-only, just like -d
    let DIRS_ONLY: bool = ARGS["-E"] or ARGS["-d"]

    # Running totals reported in the summary line
    var
      ADD: BiggestInt = 0   # summed size of all matches
      MATCH_COUNT = 0       # number of matches acted upon
      FILE_COUNT = 0        # files seen during the scan
      DIR_COUNT = 0         # directories seen during the scan

    # ###
    # Tables

    type FileSize = object
      ## Bookkeeping for all files that share one particular size.
      first: int                     ## FILES index of the first file seen with this size
      hashes: TableRef[string, int]  ## hash -> FILES index; nil until a second file of this size is hashed

    # [fileindex: filename]
    # FILES[i] is the path of the file with index i
    var FILES = newSeq[string]()

    # file size -> FileSize record for that size
    var SIZES = newTable[BiggestInt, FileSize]()

    # file index -> masked fingerprint values of that song
    var SONGS {.threadvar.}: TableRef[int, seq[uint32]]
    SONGS = newTable[int, seq[uint32]]()

    # masked fingerprint value -> indexes of the files that contain it
    var AIDX {.threadvar.}: TableRef[uint32, seq[int]]
    AIDX = newTable[uint32, seq[int]]()

    # ###
    # Actions

    proc moveaction(file, dupdir: string) =
      ## Moves `file` below `dupdir`, keeping its relative path minus the
      ## first directory component (as returned by `tailDir`). Missing parent
      ## directories of the destination are created first.
      ##
      ## Failures are reported on stdout and otherwise ignored, so one bad
      ## file does not abort the run.
      let dest = dupdir / tailDir(file)

      try:
        createDir(parentDir(dest))
        moveFile(file, dest)
      except OSError, IOError:
        # Report the real cause: permission problems, missing source, etc.
        # are not "already exists" conditions.
        echo "Failed to move " & file & " to " & dest & ": " &
          getCurrentExceptionMsg()

    proc removeaction(file: string, info: FileInfo) =
      ## Deletes `file` according to the kind recorded in `info`: regular
      ## files are removed with `tryRemoveFile`, directories with `removeDir`.
      ## Any other kind is left untouched.
      ##
      ## Failures are reported on stdout and otherwise ignored.
      case info.kind
      of pcFile:
        if not tryRemoveFile(file):
          echo "Failed to remove " & file
      of pcDir:
        try:
          removeDir(file)
        except OSError:
          echo "Failed to remove dir " & file
      else:
        discard

    proc action(file: string, info: FileInfo, orig = "") =
      ## Handles one search result.
      ##
      ## With -m the file is moved below DUPDIR, with -x it is deleted; both
      ## run on the threadpool. The match is added to the running totals
      ## (ADD, MATCH_COUNT) in every case. Unless -q is given, the path is
      ## printed, preceded by "Moving " / "Removing " when applicable and
      ## followed by " == <orig>" when `orig` names the file it duplicates.
      let verbose: bool = not ARGS["-q"]

      if ARGS["-m"]:
        spawn moveaction(file, DUPDIR)
        # The prefix belongs to the result line, so quiet mode must skip it too
        if verbose:
          stdout.write("Moving ")
      elif ARGS["-x"]:
        spawn removeaction(file, info)
        if verbose:
          stdout.write("Removing ")

      ADD += info.size
      MATCH_COUNT += 1

      if verbose:
        echo file
        if orig != "":
          echo " == " & orig

    # ###
    # Helpers

    proc gethash(file: string): Future[string] {.async.} =
      ## Computes the SHA-256 sum of `file` on the threadpool and completes
      ## with the result, polling every 5 ms so the dispatcher keeps running.
      let pending = spawn sha256sum(file)
      while true:
        if pending.isReady():
          break
        await sleepAsync(5)

      result = ^pending

    # ###
    # Search

    proc finddup(idx: int, info: FileInfo) {.async.} =
      ## Checks whether FILES[idx] duplicates an earlier file of the same size.
      ##
      ## The first file of a given size is only recorded, not hashed. Once a
      ## second file of that size shows up, both are hashed and every later
      ## file of that size is compared by hash. A duplicate is passed to
      ## `action` together with the path of the file it matches.
      if not SIZES.hasKey(info.size):
        # First file with this size: remember it and postpone hashing
        SIZES[info.size] = FileSize(first: idx, hashes: nil)
        return

      let hash = await gethash(FILES[idx])

      if SIZES[info.size].hashes.isNil:
        let first = SIZES[info.size].first
        let fhash = await gethash(FILES[first])
        # Several finddup calls for the same size can be suspended in the
        # awaits above at once. Re-check so that only one of them creates the
        # table; replacing it would drop hashes stored in the meantime.
        if SIZES[info.size].hashes.isNil:
          SIZES[info.size].hashes = newTable[string, int]()
          SIZES[info.size].hashes[fhash] = first

      let hashes = SIZES[info.size].hashes
      if hashes.hasKey(hash):
        # Same size and same hash as an earlier file
        action(FILES[idx], info, FILES[hashes[hash]])
      else:
        # Unique so far
        hashes[hash] = idx

    # Compile this C source file into the program
    {.compile: "pg_acoustid/acoustid_compare.c".}
    #~ proc match_fingerprints(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint): cfloat {.importc, cdecl, gcsafe.}
    #~ proc match_fingerprints2(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint, maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}
    # Imported C routine (presumably defined in the C file compiled above - confirm).
    # Takes two uint32 buffers with their element counts plus a maximum offset;
    # findmusicdup compares the returned score against FINGERPRINT_MATCH_THRESHOLD.
    proc match_fingerprints3(a: ptr uint32, asize: cint, b: ptr uint32, bsize: cint, maxoffset: cint): cfloat {.importc, cdecl, gcsafe.}

    proc fpcalc(file: string): Future[JsonNode] {.async, inline.} =
      ## Runs the external `fpcalc` tool with `-json -raw` on `file` and
      ## completes with its parsed JSON output.
      ##
      ## Completes with nil (after printing "Bad fingerprint: <file>") when
      ## the output is not valid JSON.
      let args = @["-json", "-raw", file]
      # ExeExt is "exe" on Windows and empty elsewhere, so the tool is looked
      # up as fpcalc.exe or fpcalc depending on the platform
      let exe = addFileExt("fpcalc", ExeExt)
      let data = await execProcess(exe, args=args, options={poUsePath})

      try:
        result = parseJson(data.output)
      except ValueError:
        # JsonParsingError is a ValueError
        echo "Bad fingerprint: $#" % file
        result = nil

    proc acoustid_compare(idx, id: int): Future[float] {.async.} =
      ## Compares the fingerprints stored in SONGS[idx] and SONGS[id] with the
      ## C matcher on the threadpool and completes with its score.
      ##
      ## Both fingerprints are copied into shared memory for the duration of
      ## the call, so the worker thread does not touch the GC-managed seqs.
      let
        lenA = SONGS[idx].len()
        lenB = SONGS[id].len()
        bufA = createSharedU(uint32, lenA)
        bufB = createSharedU(uint32, lenB)

      copyMem(bufA, addr SONGS[idx][0], lenA * sizeof(uint32))
      copyMem(bufB, addr SONGS[id][0], lenB * sizeof(uint32))

      let score = spawn match_fingerprints3(bufA, cint(lenA), bufB, cint(lenB), FINGERPRINT_MATCH_OFFSET)
      # Poll every 5 ms so other async work keeps running meanwhile
      while true:
        if score.isReady():
          break
        await sleepAsync(5)

      # The worker is done with the buffers once the result is ready
      freeShared(bufA)
      freeShared(bufB)

      result = ^score

    proc findmusicdup(idx: int, info: FileInfo) {.async.} =
      ## Checks whether the song FILES[idx] duplicates an earlier song.
      ##
      ## The file is fingerprinted with `fpcalc`; every value is masked with
      ## FINGERPRINT_RELEVANT_BITS and stored in SONGS[idx]. Earlier songs
      ## that share at least one masked value (looked up through AIDX) are
      ## compared with `acoustid_compare`. The first one scoring above
      ## FINGERPRINT_MATCH_THRESHOLD makes this file a duplicate: it is passed
      ## to `action` and dropped from SONGS. Otherwise its values are added to
      ## AIDX so later songs can be compared against it.
      let jdata = await fpcalc(FILES[idx])
      if jdata.isNil:
        return

      # Valid JSON without a fingerprint array cannot be processed
      let fingerprint = jdata{"fingerprint"}
      if fingerprint.isNil or fingerprint.kind != JArray:
        echo "Bad fingerprint: $#" % FILES[idx]
        return

      SONGS[idx] = @[]
      for i in fingerprint.items:
        let aidx: uint32 = uint32(i.getNum()) and FINGERPRINT_RELEVANT_BITS
        SONGS[idx].add(aidx)

      # Needed for both the lookup and the indexing pass below
      let unique = SONGS[idx].deduplicate()

      var compare: seq[int] = @[]
      for aidx in unique:
        if AIDX.hasKey(aidx):
          # Iterate over a copy: other findmusicdup calls may append to
          # AIDX[aidx] while this one is suspended in the await below
          let candidates = AIDX[aidx]
          for id in candidates:
            # Don't compare same two files multiple times
            if not compare.contains(id):
              let match = await acoustid_compare(idx, id)
              if match > FINGERPRINT_MATCH_THRESHOLD:
                action(FILES[idx], info, FILES[id])
                SONGS.del(idx)
                return
              else:
                compare.add(id)

      # Not a duplicate, add to AIDX index for easy comparison
      for aidx in unique:
        if AIDX.hasKey(aidx):
          if not AIDX[aidx].contains(idx):
            AIDX[aidx].add(idx)
        else:
          AIDX[aidx] = @[idx]

    proc findempty(dir: string, info: FileInfo) =
      ## Passes `dir` to `action` when it contains no entries at all.
      ##
      ## Uses `walkDir` rather than a "*" glob: on POSIX the glob does not
      ## match names starting with a dot, so a directory holding only hidden
      ## files would be reported as empty (and removed under -x).
      for entry in walkDir(dir):
        # At least one entry exists - not empty
        return
      action(dir, info)

    # ###
    # Scan

    # Scans `dir`: every entry matched by "<dir>/*" is counted, directories are
    # descended into, the command-line filters are applied, and surviving
    # entries are handed to finddup / findmusicdup / findempty / action
    # depending on the search flag. Afterwards it waits for threadpool tasks,
    # runs the async dispatcher and, unless -q is given, prints running totals.
    # NOTE(review): block nesting is not recoverable from this copy of the file;
    # the sync()/runForever()/summary tail presumably runs once per directory,
    # after the loop - confirm against the original.
    proc recurse(dir: string) =
    # Cut-off intervals for the -t / -T filters, measured back from `now`
    let now = getTime()
    let after = initInterval(days=TIMEAFTER)
    let before = initInterval(days=TIMEBEFORE)

    for file in walkPattern(dir & DirSep & "*"):
    var info: FileInfo
    try:
    info = getFileInfo(file)
    except:
    # Entries whose info cannot be read are skipped without a message
    continue

    if info.kind == pcFile:
    FILE_COUNT += 1

    # Skip files
    if DIRS_ONLY:
    continue
    elif info.kind == pcDir:
    DIR_COUNT += 1

    # Descend before the directory itself is filtered
    recurse(file)

    # Skip directories
    if FILES_ONLY:
    continue

    # Skip if doesn't match pattern
    # Both -p and -P are tested against the name plus extension, not the full path
    let (_, name, ext) = splitFile(file)
    if PATTERN != "":
    if not (name & ext).contains(PATTERN):
    continue

    # NOTE(review): re(REGEX) is built again for each entry that gets here
    if REGEX != "":
    if not (name & ext).contains(re(REGEX)):
    continue

    # Skip if smaller than
    if $ARGS["-s"] != "nil":
    if info.size < FMINSIZE:
    continue

    # Skip if larger than
    if $ARGS["-S"] != "nil":
    if info.size > FMAXSIZE:
    continue

    # Skip if older than
    if $ARGS["-t"] != "nil":
    if info.lastWriteTime < now - after:
    continue

    # Skip if newer than
    if $ARGS["-T"] != "nil":
    if info.lastWriteTime > now - before:
    continue

    if info.kind == pcFile:
    # Don't process file multiple times
    if not FILES.contains(file):
    FILES.add(file)

    # FILES.len()-1 is the index of the path just appended
    if ARGS["-D"]:
    asyncCheck finddup(FILES.len()-1, info)
    elif ARGS["-M"]:
    if file.splitFile().ext.toLowerAscii() in FFFORMAT:
    asyncCheck findmusicdup(FILES.len()-1, info)
    else:
    # No search flag: every file passing the filters is a result
    action(file, info)
    elif info.kind == pcDir:
    if ARGS["-E"]:
    findempty(file, info)
    else:
    action(file, info)

    # Wait for all spawned threadpool tasks
    sync()
    # NOTE(review): presumably relies on runForever raising once no async work
    # is pending, which the bare except then swallows - confirm
    try:
    runForever()
    except:
    discard

    if not ARGS["-q"]:
    # Ends in "\r", so the cursor returns to the start of the summary line
    stdout.write("$# matches: $# MB / $# dirs, $# files\r" % [$MATCH_COUNT, formatFloat(float(ADD)/1024/1024, ffDecimal, 2), $DIR_COUNT, $FILE_COUNT])
    stdout.flushFile()

    # ###
    # Main

    # Start the scan at the source directory from the command line ("." by default)
    recurse(SOURCEDIR)