cesardv · July 20, 2020 05:50 · Nov 27, 2016 · Nov 27, 2016 · Nov 27, 2016 · Nov 27, 2016
diff --git a/renameToHash.sh b/renameToHash.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # TODO: skip tiny files (so small they couldn't be photos)
 # TODO: make sure sym links and other file system oddities are handled
+# TODO: look at parallelization for perf boost
 
 #
 # Constants
@@ -9,7 +10,7 @@ CHAR_COUNT=12
 BLOCK_COUNT=6
 SKIP_SIZE=3 # Every new block is sampled by skipping this amount of blocks to the next position
 COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
-DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"
+DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv|jpeg)$"
 
 #
 # Parameters
@@ -31,10 +32,10 @@ echo ""
 #
 # Get list and count of files. Confirm with user if we should proceed
 #
-files=$(find . -maxdepth 1 -type f | egrep -i $PATTERN)
+files=$(find . -maxdepth 1 -type f | egrep -i "$PATTERN")
 count=$(echo "$files" | wc -l | sed 's/^ *//') # The `sed` at the end removes whitespace from wc output
 echo "Found $count files that match the pattern $PATTERN"
-read -p "Rename all? <Y/n> " prompt
+read -rp "Rename all? <Y/n> " prompt
 if [[ $prompt == "n" || $prompt == "N" || $prompt == "NO" || $prompt == "no" ]]
 then
   exit 0
@@ -49,8 +50,8 @@ for f in $files
 do
 
   # Hash the full file
-  if [ COMPUTE_FULL_HASH = true ] ; then
-    hash=$(md5 -q $f)
+  if [ $COMPUTE_FULL_HASH = true ] ; then
+    hash=$(md5 -q "$f")
 
   # Hash an assortment of bytes
   else
@@ -59,16 +60,15 @@ do
 
     # Skip along the file, sampling bytes as we go
     bytes=""
-    for(( i=1; i<=$BLOCK_COUNT; ++i )) do
+    for(( i=1; i<=BLOCK_COUNT; ++i )) do
       let BLOCK=$i*$SKIP_SIZE
       bytes+=$(dd if="$f" bs=512 count=1 skip=$BLOCK 2> /dev/null)
     done
-    hash=$(md5 <<< $bytes)
+    hash=$(md5 <<< "$bytes")
   fi
 
-  shortHash=$(echo $hash | cut -c1-$CHAR_COUNT)
+  shortHash=$(echo "$hash" | cut -c1-$CHAR_COUNT)
   ext=$(echo "$f" | sed 's/^.*\.//')
-  originalNameWithoutPath="${f##*/}"
   # If you've already run this script on some of these files, we shouldn't duplicate them.
   if [[ $f == *"$shortHash"* ]]
   then
@@ -87,6 +87,6 @@ do
   done
 
   echo "$newName   <-   $f"
-  mv $f $newName
+  mv "$f" "$newName"
 
 done
diff --git a/renameToHash.sh b/renameToHash.sh
@@ -6,7 +6,8 @@
 # Constants
 #
 CHAR_COUNT=12
-BLOCK_COUNT=3
+BLOCK_COUNT=6
+SKIP_SIZE=3 # Every new block is sampled by skipping this amount of blocks to the next position
 COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
 DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"
 
@@ -47,11 +48,22 @@ IFS=$'\n' # make newlines the only iteration separator: http://askubuntu.com/que
 for f in $files
 do
 
+  # Hash the full file
   if [ COMPUTE_FULL_HASH = true ] ; then
     hash=$(md5 -q $f)
+
+  # Hash an assortment of bytes
   else
-    firstPartOfFile=$(dd if="$f" bs=512 count=$BLOCK_COUNT 2> /dev/null)
-    hash=$(md5 <<< $firstPartOfFile)
+    # Naive: Just grab a contiguous chunk of N blocks. But this could be all empty space or all metadata. Too many false positives.
+    # bytes=$(dd if="$f" bs=512 count=$BLOCK_COUNT skip=$SKIP_START_BLOCKS 2> /dev/null)
+
+    # Skip along the file, sampling bytes as we go
+    bytes=""
+    for(( i=1; i<=$BLOCK_COUNT; ++i )) do
+      let BLOCK=$i*$SKIP_SIZE
+      bytes+=$(dd if="$f" bs=512 count=1 skip=$BLOCK 2> /dev/null)
+    done
+    hash=$(md5 <<< $bytes)
   fi
 
   shortHash=$(echo $hash | cut -c1-$CHAR_COUNT)

diff --git a/renameToHash.sh b/renameToHash.sh
@@ -8,7 +8,17 @@
 CHAR_COUNT=12
 BLOCK_COUNT=3
 COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
-PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"
+DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"
+
+#
+# Parameters
+#
+if [ -z "$1" ]
+then
+  PATTERN="$DEFAULT_PATTERN"
+else
+  PATTERN=$1
+fi
 
 #
 # Introduction

diff --git a/renameToHash.sh b/renameToHash.sh
@@ -8,7 +8,7 @@
 CHAR_COUNT=12
 BLOCK_COUNT=3
 COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
-PATTERN=".*\.(jpg|png|gif|mov)$"
+PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"
 
 #
 # Introduction

diff --git a/renameToHash.sh b/renameToHash.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# TODO: skip tiny files (so small they couldn't be photos)
+# TODO: make sure sym links and other file system oddities are handled
+
+#
+# Constants
+#
+CHAR_COUNT=12
+BLOCK_COUNT=3
+COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
+PATTERN=".*\.(jpg|png|gif|mov)$"
+
+#
+# Introduction
+#
+echo "This script will get the hash of $BLOCK_COUNT 512 byte blocks for each file it processes"
+echo "The first $CHAR_COUNT chars of this hash are used to rename the file"
+echo ""
+
+#
+# Get list and count of files. Confirm with user if we should proceed
+#
+files=$(find . -maxdepth 1 -type f | egrep -i $PATTERN)
+count=$(echo "$files" | wc -l | sed 's/^ *//') # The `sed` at the end removes whitespace from wc output
+echo "Found $count files that match the pattern $PATTERN"
+read -p "Rename all? <Y/n> " prompt
+if [[ $prompt == "n" || $prompt == "N" || $prompt == "NO" || $prompt == "no" ]]
+then
+  exit 0
+fi
+echo ""
+
+#
+# For every file, compute a hash and rename
+#
+IFS=$'\n' # make newlines the only iteration separator: http://askubuntu.com/questions/344407/how-to-read-complete-line-in-for-loop-with-spaces
+for f in $files
+do
+
+  if [ COMPUTE_FULL_HASH = true ] ; then
+    hash=$(md5 -q $f)
+  else
+    firstPartOfFile=$(dd if="$f" bs=512 count=$BLOCK_COUNT 2> /dev/null)
+    hash=$(md5 <<< $firstPartOfFile)
+  fi
+
+  shortHash=$(echo $hash | cut -c1-$CHAR_COUNT)
+  ext=$(echo "$f" | sed 's/^.*\.//')
+  originalNameWithoutPath="${f##*/}"
+  # If you've already run this script on some of these files, we shouldn't duplicate them.
+  if [[ $f == *"$shortHash"* ]]
+  then
+    echo "Skipping file. Name already contains the hash of its contents: $f"
+    continue
+  fi
+
+  newName="$shortHash.$ext"
+
+  # If a file with this name already exists, increment a number until it does not.
+  # This is a likely duplicate, and the whole reason for running this script
+  i=0
+  while [ -f "$newName" ]; do
+    let i=i+1
+    newName="$shortHash ($i).$ext"
+  done
+
+  echo "$newName   <-   $f"
+  mv $f $newName
+
+done