BenKoerber · January 27, 2022 08:20 · Jan 26, 2022
diff --git a/clean_pdf.sh b/clean_pdf.sh
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------
+# Recursively find pdfs from the directory given as the first argument, 
+# otherwise search the current directory.
+# Use exiftool and qpdf (both must be installed and locatable on $PATH) 
+# to strip all top-level metadata from PDFs.
+#
+# Note - This only removes file-level metadata, not any metadata
+# in embedded images, etc. 
+#
+# Code is provided as-is, I take no responsibility for its use,
+# and I make no guarantee that this code works
+# or makes your PDFs "safe," whatever that means to you.
+#
+# You may need to make this script executable before using it,
+# e.g. chmod +x clean_pdf.sh
+#
+# example:
+# clean current directory:
+# >>> ./clean_pdf.sh
+#
+# clean specific directory:
+# >>> ./clean_pdf.sh some/other/directory
+# --------------------------------------------------------------------
+
+
+# ANSI color codes so that errors stick out (octal \033 is more portable than \e)
+GREEN="\033[32m"
+RED="\033[31m"
+CLEAR="\033[0m"
+
+# directory to clean: first argument ($1), or '.' (this directory) if not given
+DIR="${1:-.}"
+
+printf 'Cleaning PDFs in directory %s\n' "$DIR"
+
+# Use find to locate PDFs recursively and pipe them to `while read`;
+# IFS= and -r keep each line intact (spaces, backslashes). Earlier *_clean.pdf
+# outputs are skipped so they are never re-processed or picked up mid-run.
+# Limitation: file names containing newlines are not supported.
+find "$DIR" -type f -name "*.pdf" ! -name "*_clean.pdf" | while IFS= read -r i
+do
+  # output file is the original filename with suffix _clean.pdf; remove any
+  # stale copy first (exiftool -o will not overwrite an existing file)
+  TMP=${i%.*}_clean.pdf
+  rm -f -- "$TMP"
+
+  # stdin is /dev/null so the tools cannot swallow the file list coming from
+  # find; qpdf only runs when exiftool succeeded, and $? is checked below
+  exiftool -q -q -all:all= "$i" -o "$TMP" </dev/null &&
+    qpdf --linearize --replace-input "$TMP" </dev/null
+  case $? in # qpdf exits 3 when it wrote the file but emitted warnings
+    0 | 3) printf '%bProcessed %b%s %bas %b%s%b\n' "$GREEN" "$RED" "$i" "$CLEAR" "$GREEN" "$TMP" "$CLEAR" ;;
+    *) printf '%bFailed to clean %s%b\n' "$RED" "$i" "$CLEAR" >&2 ;;
+  esac
+done
No results found