Skip to content

Instantly share code, notes, and snippets.

@cesardv
Forked from SimplGy/renameToHash.sh
Created July 20, 2020 05:50
Show Gist options
  • Save cesardv/b9ae2af491e7ac7a57ad81a684b2a269 to your computer and use it in GitHub Desktop.
Save cesardv/b9ae2af491e7ac7a57ad81a684b2a269 to your computer and use it in GitHub Desktop.

Revisions

  1. @SimplGy SimplGy revised this gist Nov 27, 2016. 1 changed file with 10 additions and 10 deletions.
    20 changes: 10 additions & 10 deletions renameToHash.sh
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,7 @@
    #!/bin/bash
    # TODO: skip tiny files (so small they couldn't be photos)
    # TODO: make sure sym links and other file system oddities are handled
    # TODO: look at parallelization for perf boost

    #
    # Constants
    @@ -9,7 +10,7 @@ CHAR_COUNT=12
    BLOCK_COUNT=6
    SKIP_SIZE=3 # Every new block is sampled by skipping this amount of blocks to the next position
    COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
    DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"
    DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv|jpeg)$"

    #
    # Parameters
    @@ -31,10 +32,10 @@ echo ""
    #
    # Get list and count of files. Confirm with user if we should proceed
    #
    files=$(find . -maxdepth 1 -type f | egrep -i $PATTERN)
    files=$(find . -maxdepth 1 -type f | egrep -i "$PATTERN")
    count=$(echo "$files" | wc -l | sed 's/^ *//') # The `sed` at the end removes whitespace from wc output
    echo "Found $count files that match the pattern $PATTERN"
    read -p "Rename all? <Y/n> " prompt
    read -rp "Rename all? <Y/n> " prompt
    if [[ $prompt == "n" || $prompt == "N" || $prompt == "NO" || $prompt == "no" ]]
    then
    exit 0
    @@ -49,8 +50,8 @@ for f in $files
    do

    # Hash the full file
    if [ COMPUTE_FULL_HASH = true ] ; then
    hash=$(md5 -q $f)
    if [ $COMPUTE_FULL_HASH = true ] ; then
    hash=$(md5 -q "$f")

    # Hash an assortment of bytes
    else
    @@ -59,16 +60,15 @@ do

    # Skip along the file, sampling bytes as we go
    bytes=""
    for(( i=1; i<=$BLOCK_COUNT; ++i )) do
    for(( i=1; i<=BLOCK_COUNT; ++i )) do
    let BLOCK=$i*$SKIP_SIZE
    bytes+=$(dd if="$f" bs=512 count=1 skip=$BLOCK 2> /dev/null)
    done
    hash=$(md5 <<< $bytes)
    hash=$(md5 <<< "$bytes")
    fi

    shortHash=$(echo $hash | cut -c1-$CHAR_COUNT)
    shortHash=$(echo "$hash" | cut -c1-$CHAR_COUNT)
    ext=$(echo "$f" | sed 's/^.*\.//')
    originalNameWithoutPath="${f##*/}"
    # If you've already run this script on some of these files, we shouldn't duplicate them.
    if [[ $f == *"$shortHash"* ]]
    then
    @@ -87,6 +87,6 @@ do
    done

    echo "$newName <- $f"
    mv $f $newName
    mv "$f" "$newName"

    done
  2. @SimplGy SimplGy revised this gist Nov 27, 2016. 1 changed file with 15 additions and 3 deletions.
    18 changes: 15 additions & 3 deletions renameToHash.sh
    Original file line number Diff line number Diff line change
    @@ -6,7 +6,8 @@
    # Constants
    #
    CHAR_COUNT=12
    BLOCK_COUNT=3
    BLOCK_COUNT=6
    SKIP_SIZE=3 # Every new block is sampled by skipping this amount of blocks to the next position
    COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
    DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"

    @@ -47,11 +48,22 @@ IFS=$'\n' # make newlines the only iteration separator: http://askubuntu.com/que
    for f in $files
    do

    # Hash the full file
    if [ COMPUTE_FULL_HASH = true ] ; then
    hash=$(md5 -q $f)

    # Hash an assortment of bytes
    else
    firstPartOfFile=$(dd if="$f" bs=512 count=$BLOCK_COUNT 2> /dev/null)
    hash=$(md5 <<< $firstPartOfFile)
    # Naive: Just grab a contiguous chunk of N blocks. But this could be all empty space or all metadata. Too many false positives.
    # bytes=$(dd if="$f" bs=512 count=$BLOCK_COUNT skip=$SKIP_START_BLOCKS 2> /dev/null)

    # Skip along the file, sampling bytes as we go
    bytes=""
    for(( i=1; i<=$BLOCK_COUNT; ++i )) do
    let BLOCK=$i*$SKIP_SIZE
    bytes+=$(dd if="$f" bs=512 count=1 skip=$BLOCK 2> /dev/null)
    done
    hash=$(md5 <<< $bytes)
    fi

    shortHash=$(echo $hash | cut -c1-$CHAR_COUNT)
  3. @SimplGy SimplGy revised this gist Nov 27, 2016. 1 changed file with 11 additions and 1 deletion.
    12 changes: 11 additions & 1 deletion renameToHash.sh
    Original file line number Diff line number Diff line change
    @@ -8,7 +8,17 @@
    CHAR_COUNT=12
    BLOCK_COUNT=3
    COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
    PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"
    DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"

    #
    # Parameters
    #
    if [ -z "$1" ]
    then
    PATTERN="$DEFAULT_PATTERN"
    else
    PATTERN=$1
    fi

    #
    # Introduction
  4. @SimplGy SimplGy revised this gist Nov 27, 2016. No changes.
  5. @SimplGy SimplGy revised this gist Nov 27, 2016. No changes.
  6. @SimplGy SimplGy revised this gist Nov 27, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion renameToHash.sh
    Original file line number Diff line number Diff line change
    @@ -8,7 +8,7 @@
    CHAR_COUNT=12
    BLOCK_COUNT=3
    COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
    PATTERN=".*\.(jpg|png|gif|mov)$"
    PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"

    #
    # Introduction
  7. @SimplGy SimplGy created this gist Nov 27, 2016.
    70 changes: 70 additions & 0 deletions renameToHash.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,70 @@
    #!/bin/bash
    #
    # renameToHash.sh
    #
    # Renames every file in the current directory (non-recursive) whose name
    # matches PATTERN to "<hash>.<ext>", where <hash> is the first CHAR_COUNT
    # characters of an md5 digest of the file's contents. Files with identical
    # digests get " (1)", " (2)", ... appended, which makes likely duplicates
    # easy to spot.
    #
    # Requires the `md5` command-line tool.
    #
    # TODO: skip tiny files (so small they couldn't be photos)
    # TODO: make sure sym links and other file system oddities are handled

    #
    # Constants
    #
    CHAR_COUNT=12            # How many leading characters of the digest end up in the file name
    BLOCK_COUNT=3            # How many 512 byte blocks are hashed when COMPUTE_FULL_HASH is not `true`
    COMPUTE_FULL_HASH=false  # Set `true` to trade processing speed for fewer false positives
    PATTERN=".*\.(jpg|png|gif|mov)$"

    # Without md5 every hash would be empty; bail out instead of mis-reporting files.
    if ! command -v md5 > /dev/null; then
      echo "This script needs the 'md5' command, which was not found" >&2
      exit 1
    fi

    #
    # Introduction
    #
    echo "This script will get the hash of $BLOCK_COUNT 512 byte blocks for each file it processes"
    echo "The first $CHAR_COUNT chars of this hash are used to rename the file"
    echo ""

    #
    # Get list and count of files. Confirm with user if we should proceed
    #
    # The pattern is quoted so the shell cannot glob-expand it before grep sees it.
    files=$(find . -maxdepth 1 -type f | grep -Ei -- "$PATTERN")

    # An empty list still yields one (empty) line from `wc -l`, so handle it explicitly.
    if [[ -z "$files" ]]; then
      echo "Found 0 files that match the pattern $PATTERN"
      exit 0
    fi

    count=$(printf '%s\n' "$files" | wc -l | sed 's/^ *//') # The `sed` at the end removes whitespace from wc output
    echo "Found $count files that match the pattern $PATTERN"
    read -rp "Rename all? <Y/n> " prompt
    case "$prompt" in
      [nN] | [nN][oO]) exit 0 ;;
    esac
    echo ""

    #
    # For every file, compute a hash and rename
    #
    # Reading line by line keeps names containing spaces or glob characters intact.
    while IFS= read -r f; do

      if [[ "$COMPUTE_FULL_HASH" == true ]]; then
        # Hash the full file
        hash=$(md5 -q "$f")
      else
        # Hash only the first BLOCK_COUNT blocks. The bytes are piped straight
        # into md5: a shell variable cannot hold NUL bytes, so storing binary
        # data in one would change what gets hashed.
        hash=$(dd if="$f" bs=512 count="$BLOCK_COUNT" 2> /dev/null | md5)
      fi

      # An empty hash would make the "already renamed" check below match every name.
      if [[ -z "$hash" ]]; then
        echo "Skipping file. Could not compute a hash: $f" >&2
        continue
      fi

      shortHash=$(printf '%s\n' "$hash" | cut -c1-"$CHAR_COUNT")
      ext="${f##*.}"

      # If you've already run this script on some of these files, we shouldn't duplicate them.
      if [[ "$f" == *"$shortHash"* ]]; then
        echo "Skipping file. Name already contains the hash of its contents: $f"
        continue
      fi

      newName="$shortHash.$ext"

      # If something with this name already exists, increment a number until it does not.
      # This is a likely duplicate, and the whole reason for running this script
      i=0
      while [[ -e "$newName" ]]; do
        i=$((i + 1))
        newName="$shortHash ($i).$ext"
      done

      echo "$newName <- $f"
      mv -- "$f" "$newName" || echo "Failed to rename: $f" >&2

    done <<< "$files"