
@tanema
Last active March 4, 2022 09:23

Revisions

  1. tanema revised this gist Oct 20, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions multiprocess_migration.sh
    @@ -69,7 +69,7 @@ syncfile () {

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    - if [[ $file_count -eq 0 ]]; then
    + if [[ $file_count -gt 0 ]]; then
    log_ok "$status Already on server"
    else
    filename="_migration-$_current_file-$(uuidgen)"
    @@ -123,7 +123,7 @@ kill_all_workers () {
    #allows ctrl c to work in the while loop
    trap "kill_all_workers" SIGINT SIGHUP SIGTERM

    - for ((i=0; i <= $thread_count; ++i)); do
    + for ((i=0; i < $thread_count; ++i)); do
    echo "starting worker $i"
    #call process on this chunk of files
    process_lines $i $((lines_per_file * i)) &
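    Note on the second hunk: with the old "<=" bound the loop starts thread_count + 1 workers (indices 0 through thread_count), and the extra one only re-reads the tail of the listing that the last real worker already covers. A minimal sketch with a made-up thread_count:

        thread_count=4
        for ((i=0; i <= thread_count; ++i)); do echo "would start worker $i"; done   # 5 workers: 0..4
        for ((i=0; i <  thread_count; ++i)); do echo "would start worker $i"; done   # 4 workers: 0..3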
  2. tanema revised this gist Oct 17, 2014. 1 changed file with 43 additions and 25 deletions.
    68 changes: 43 additions & 25 deletions multiprocess_migration.sh
    @@ -1,31 +1,52 @@
    #! /bin/bash
    #how many times to split up the list
    ###################### USAGE ######################################
    usage() {
    echo "
    Usage: mongotos3 [-t n] mongo_host mongo_collection s3_bucket
    -t : number of parallel processes to use
    mongo_host : the host of the mongodb server
    mongo_collection : the collection to collect the gridfs data from
    s3_bucket : the name of the bucket you want to cp the files to
    "
    }
    ###################### END USAGE ##################################
    # how many times to split up the list
    thread_count=8
    # parallel process pid array
    _worker_pids=()
    # incremented variable to see progress
    _current_file=1

    while getopts 't:' opt; do
    # get options just -t for setting how many threads you want
    while getopts 't:*:' opt; do
    case $opt in
    t) thread_count=$OPTARG;;
    *)
    usage
    exit
    ;;
    esac
    done
    shift $((OPTIND-1))

    # script params
    if [ "$#" -ne 3 ]
    then
    usage
    fi
    # mongo host
    _host="${1:?Usage: mongodb host}"
    #mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    #s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"
    #incremented variable to see progress
    _current_file=1
    _host="${1:?Mongo Host Required}"
    # mongo collection to pull grid_fs data from
    _db="${2:?Mongo Collection required}"
    # s3 bucket for everything to be synced to
    _bucket="${3:?AWS Bucket Required}"
    # all the files
    _files_list=$(mongofiles -h $_host -db $_db list)
    #total files to be synced
    # total files to be synced
    _total_files=$(echo "$_files_list" | wc -l | awk {'print $1'})

    # how many lines to send to each thread
    ((lines_per_file = (_total_files + thread_count - 1) / thread_count))
    ((lines_per_file=(_total_files + thread_count - 1) / thread_count))
    ###################### LOGGING ####################################
    RED=$(tput setaf 1)
    GREEN=$(tput setaf 2)
    NORMAL=$(tput sgr0)
    @@ -38,8 +59,10 @@ log_fail() {
    let COL=$(tput cols)-${#1}+${#RED}+${#NORMAL}
    printf "%s%${COL}s" "$1" "$RED[FAIL]$NORMAL"
    }

    # param 1: filepath from mongo
    ###################### END LOGGING ################################
    ###################### METHOD DEFINITIONS #########################
    # param $1: filepath from mongo
    # param $2: worker identity number
    syncfile () {
    status="(worker $2) $_current_file/$lines_per_file $_bucket/$1"
    ((_current_file++))
    @@ -71,7 +94,8 @@ syncfile () {
    fi
    }

    # $1 is the split file list
    # param: $1 worker identity number
    # param: $2 starting line number in the file to process
    process_lines () {
    while read -r line; do
    #get filename
    @@ -83,8 +107,8 @@ process_lines () {
    done < <(echo "$_files_list" | head -n $(($2 + $lines_per_file)) | tail -n $lines_per_file)
    }

    #used for kill signals
    #calls kill on each pid
    # used for kill signals
    # calls kill on each pid
    kill_all_workers () {
    echo 'killing all workers'
    for ((i=0; i <= ${#_worker_pids[@]}; ++i)); do
    @@ -94,17 +118,11 @@ kill_all_workers () {
    #cleanup any files that were interrupted
    rm _migration-* > /dev/null 2>&1
    }

    ##############################################################################
    # #
    # MAIN METHOD AREA #
    # #
    ##############################################################################
    ###################### END METHOD DEFINITIONS #####################

    #allows ctrl c to work in the while loop
    trap "kill_all_workers" SIGINT SIGHUP SIGTERM

    _worker_pids=()
    for ((i=0; i <= $thread_count; ++i)); do
    echo "starting worker $i"
    #call process on this chunk of files
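    The ceiling division added here, ((lines_per_file = (_total_files + thread_count - 1) / thread_count)), rounds up so the chunks always cover every line of the listing. A small sketch with made-up counts, plus a hypothetical invocation under the new option handling: ./multiprocess_migration.sh -t 4 localhost mydb my-bucket (host, collection and bucket names invented here).

        # 1001 files spread over 8 workers
        _total_files=1001
        thread_count=8
        ((lines_per_file = (_total_files + thread_count - 1) / thread_count))
        echo "$lines_per_file"   # 126; 8 * 126 = 1008 >= 1001, so no line is dropped
        # plain integer division would give 125, and 8 * 125 = 1000 would miss the last file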
  3. tanema revised this gist Oct 16, 2014. 2 changed files with 0 additions and 98 deletions.
    35 changes: 0 additions & 35 deletions bulk_image_migration.sh
    @@ -1,35 +0,0 @@
    #! /bin/bash
    # this script will pull all the files out of the database first then sync them
    # good if you have a lot of space or low data. faster for syncing all the files at once

    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    # script params
    _host="${1:?Usage: mongodb host}"
    _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"
    #create temp folder to work in
    mkdir _uploads
    cd _uploads
    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    #get the relative path to the file
    directory=${file%/*}
    # make the relative path to where the file should be located
    mkdir -p ./$directory
    #get file from gridfs and put it in its path
    mongofiles -h $_host -db $_db get $file
    done < <(mongofiles -h $_host -db $_db list)
    #sync the whole temp folder with the configured bucket
    echo "Syncing with $_bucket bucket"
    aws s3 sync ./ s3://$_bucket/
    #rm the temp uploads folder
    echo "Cleaning Up"
    cd ..
    rm -rf _uploads
    echo "Done."
    63 changes: 0 additions & 63 deletions per_file_image_migration.sh
    @@ -1,63 +0,0 @@
    #! /bin/bash
    # script params
    # mongo host
    _host="${1:?Usage: mongodb host}"
    # mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    # s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"

    # incremented variable to see progress
    _current_file=1
    # total files to be synced
    _total_files=$(wc -l < <(mongofiles -h $_host -db $_db list) | awk {'print $1'})

    # param 1: filepath from mongo
    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    #get file from gridfs and create a temp file of it
    echo "creating $1"
    mongofiles -h $_host -db $_db get --local _temp $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    #send file status
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    #rm temp file gotten from gridfs
    rm _temp
    fi
    fi

    ((_current_file++))
    }

    #allows ctrl c to work in the while loop
    trap "break" SIGINT SIGHUP SIGTERM

    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    # sync the file with the server
    syncfile $file
    done < <(mongofiles -h $_host -db $_db list)

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
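    Both deleted scripts (and the multiprocess one that replaces them) read the mongofiles listing line by line, keep only the first tab-separated field as the filename, and skip the "connected to …" banner that the listing starts with. A self-contained sketch of that filter, with made-up sample input standing in for the mongofiles output:

        while read -r line; do
          file=$(echo "$line" | awk -F'\t' '{ print $1 }')
          [[ $file == 'connected to'* ]] && continue
          echo "would sync: $file"
        done < <(printf 'connected to: localhost\navatars/a.png\t123\navatars/b.png\t456\n')
        # prints: would sync: avatars/a.png, then would sync: avatars/b.png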
  4. tanema revised this gist Oct 16, 2014. 1 changed file with 24 additions and 9 deletions.
    33 changes: 24 additions & 9 deletions multiprocess_migration.sh
    @@ -25,13 +25,29 @@ _total_files=$(echo "$_files_list" | wc -l | awk {'print $1'})

    # how many lines to send to each thread
    ((lines_per_file = (_total_files + thread_count - 1) / thread_count))

    RED=$(tput setaf 1)
    GREEN=$(tput setaf 2)
    NORMAL=$(tput sgr0)
    log_ok() {
    let COL=$(tput cols)-${#1}+${#GREEN}+${#NORMAL}
    printf "%s%${COL}s" "$1" "$GREEN[OK]$NORMAL"
    }

    log_fail() {
    let COL=$(tput cols)-${#1}+${#RED}+${#NORMAL}
    printf "%s%${COL}s" "$1" "$RED[FAIL]$NORMAL"
    }

    # param 1: filepath from mongo
    syncfile () {
    status="(worker $2) $_current_file/$lines_per_file $_bucket/$1"
    ((_current_file++))

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    if [[ $file_count -eq 0 ]]; then
    log_ok "$status Already on server"
    else
    filename="_migration-$_current_file-$(uuidgen)"
    #get file from gridfs and create a temp file of it
    @@ -42,19 +58,17 @@ syncfile () {
    aws s3 cp $filename s3://$_bucket/$1 --dryrun --quiet
    #send file status and if this file migration succeeded
    if [ $? -eq 0 ]; then
    echo "$2 $_current_file / $lines_per_file $_bucket/$1 OK"
    log_ok "$status"
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL"
    log_fail "$status"
    fi

    #rm temp file gotten from gridfs
    rm $filename
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL get"
    log_fail "$status Get from db failed"
    fi
    fi

    ((_current_file++))
    }

    # $1 is the split file list
    @@ -77,6 +91,8 @@ kill_all_workers () {
    kill -6 ${_worker_pids[i]} > /dev/null 2>&1
    done
    echo 'migration aborted'
    #cleanup any files that were interrupted
    rm _migration-* > /dev/null 2>&1
    }

    ##############################################################################
    @@ -105,5 +121,4 @@ done
    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi

    fi
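    The log_ok/log_fail helpers introduced here right-align the status tag: COL is the terminal width minus the message length, with the lengths of the invisible colour escapes added back so the printf field width comes out at the last column. A standalone sketch of the same pattern, using a made-up status message:

        GREEN=$(tput setaf 2)
        NORMAL=$(tput sgr0)
        msg="(worker 3) 42/126 mybucket/avatars/a.png"   # hypothetical status line
        let COL=$(tput cols)-${#msg}+${#GREEN}+${#NORMAL}
        printf "%s%${COL}s\n" "$msg" "$GREEN[OK]$NORMAL"   # [OK] lands flush against the right edge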
  5. tanema revised this gist Oct 15, 2014. 1 changed file with 109 additions and 0 deletions.
    109 changes: 109 additions & 0 deletions multiprocess_migration.sh
    @@ -0,0 +1,109 @@
    #! /bin/bash
    #how many times to split up the list
    thread_count=8

    while getopts 't:' opt; do
    case $opt in
    t) thread_count=$OPTARG;;
    esac
    done
    shift $((OPTIND-1))

    # script params
    # mongo host
    _host="${1:?Usage: mongodb host}"
    #mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    #s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"
    #incremented variable to see progress
    _current_file=1
    # all the files
    _files_list=$(mongofiles -h $_host -db $_db list)
    #total files to be synced
    _total_files=$(echo "$_files_list" | wc -l | awk {'print $1'})

    # how many lines to send to each thread
    ((lines_per_file = (_total_files + thread_count - 1) / thread_count))

    # param 1: filepath from mongo
    syncfile () {
    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    filename="_migration-$_current_file-$(uuidgen)"
    #get file from gridfs and create a temp file of it
    mongofiles -h $_host -db $_db get --local $filename $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
    #send it to s3
    aws s3 cp $filename s3://$_bucket/$1 --dryrun --quiet
    #send file status and if this file migration succeeded
    if [ $? -eq 0 ]; then
    echo "$2 $_current_file / $lines_per_file $_bucket/$1 OK"
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL"
    fi

    #rm temp file gotten from gridfs
    rm $filename
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL get"
    fi
    fi

    ((_current_file++))
    }

    # $1 is the split file list
    process_lines () {
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    # sync the file with the server
    syncfile $file $1
    done < <(echo "$_files_list" | head -n $(($2 + $lines_per_file)) | tail -n $lines_per_file)
    }

    #used for kill signals
    #calls kill on each pid
    kill_all_workers () {
    echo 'killing all workers'
    for ((i=0; i <= ${#_worker_pids[@]}; ++i)); do
    kill -6 ${_worker_pids[i]} > /dev/null 2>&1
    done
    echo 'migration aborted'
    }

    ##############################################################################
    # #
    # MAIN METHOD AREA #
    # #
    ##############################################################################

    #allows ctrl c to work in the while loop
    trap "kill_all_workers" SIGINT SIGHUP SIGTERM

    _worker_pids=()
    for ((i=0; i <= $thread_count; ++i)); do
    echo "starting worker $i"
    #call process on this chunk of files
    process_lines $i $((lines_per_file * i)) &
    #record the pid for cleanup and waiting
    _worker_pids+=($!)
    done

    #wait for each process to finish
    for ((i=0; i <= ${#_worker_pids[@]}; ++i)); do
    wait ${_worker_pids[i]} > /dev/null 2>&1
    done

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
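    process_lines carves its chunk out of the shared listing with head and tail: head keeps everything up to the end of the chunk, and tail then keeps only the last lines_per_file lines of that. A sketch with a made-up ten-line listing:

        _files_list=$(printf 'file%02d\n' {1..10})
        lines_per_file=4
        start=$((lines_per_file * 1))    # worker 1 (zero-indexed)
        echo "$_files_list" | head -n $((start + lines_per_file)) | tail -n $lines_per_file
        # prints file05 .. file08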

  6. tanema revised this gist Oct 14, 2014. 1 changed file with 23 additions and 19 deletions.
    42 changes: 23 additions & 19 deletions per_file_image_migration.sh
    @@ -1,41 +1,44 @@
    #! /bin/bash
    # this script will pull out one file at a time and push it to s3
    # good if you have low space or big amounts of data, slower because it has to connect to amazon for each file

    # script params
    # mongo host
    _host="${1:?Usage: mongodb host}"
    # mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    # s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"

    _current_file=1
    # incremented variable to see progress
    _current_file=1
    # total files to be synced
    _total_files=$(wc -l < <(mongofiles -h $_host -db $_db list) | awk {'print $1'})

    # param 1: filepath from mongo
    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    #get file from gridfs and create a temp file of it
    echo "creating $1"
    mongofiles -h $_host -db $_db get --local _temp $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    #send file status
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    #rm temp file gotten from gridfs
    rm _temp
    fi

    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    rm _temp
    fi

    ((_current_file++))
    @@ -50,10 +53,11 @@ while read -r line; do
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    # sync the file with the server
    syncfile $file
    done < <(mongofiles -h $_host -db $_db list)

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
    fi
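    The reworked flow checks S3 before doing any work: aws s3 ls prints one line per matching key, so a non-zero count means the object is already there (it matches by prefix, so a longer key with the same prefix would also count). A hedged sketch of the check with placeholder bucket and key names:

        _bucket="example-bucket"     # placeholder
        key="avatars/user42.png"     # placeholder
        file_count=$((0 + $(aws s3 ls "s3://$_bucket/$key" | wc -l)))
        if [[ $file_count -gt 0 ]]; then
          echo "already on server, skipping"
        else
          echo "needs upload"
        fi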
  7. tanema revised this gist Oct 14, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion per_file_image_migration.sh
    @@ -8,7 +8,7 @@ _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"

    _current_file=1
    _total_files=$(wc -l < <(mongofiles -h localhost -db shiftee_development list) | awk {'print $1'})
    _total_files=$(wc -l < <(mongofiles -h $_host -db $_db list) | awk {'print $1'})

    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"
  8. tanema revised this gist Oct 14, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion per_file_image_migration.sh
    @@ -1,5 +1,5 @@
    #! /bin/bash
    #this script will pull out one file at a time and push it to s3
    # this script will pull out one file at a time and push it to s3
    # good if you have low space or big amounts of data, slower because it has to connect to amazon for each file

    # script params
  9. tanema revised this gist Oct 14, 2014. 2 changed files with 2 additions and 0 deletions.
    1 change: 1 addition & 0 deletions bulk_image_migration.sh
    @@ -1,5 +1,6 @@
    #! /bin/bash
    # this script will pull all the files out of the database first then sync them
    # good if you have a lot of space or low data. faster for syncing all the files at once

    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    1 change: 1 addition & 0 deletions per_file_image_migration.sh
    @@ -1,5 +1,6 @@
    #! /bin/bash
    #this script will pull out one file at a time and push it to s3
    # good if you have low space or big amounts of data, slower because it has to connect to amazon for each file

    # script params
    _host="${1:?Usage: mongodb host}"
  10. tanema revised this gist Oct 14, 2014. 2 changed files with 60 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions image_migration.sh → bulk_image_migration.sh
    @@ -1,4 +1,6 @@
    #! /bin/bash
    # this script will pull all the files out of the database first then sync them

    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    58 changes: 58 additions & 0 deletions per_file_image_migration.sh
    @@ -0,0 +1,58 @@
    #! /bin/bash
    #this script will pull out one file at a time and push it to s3

    # script params
    _host="${1:?Usage: mongodb host}"
    _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"

    _current_file=1
    _total_files=$(wc -l < <(mongofiles -h localhost -db shiftee_development list) | awk {'print $1'})

    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"

    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    #get file from gridfs and create a temp file of it
    echo "creating $1"
    mongofiles -h $_host -db $_db get --local _temp $1 > /dev/null 2>&1
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    rm _temp
    fi

    ((_current_file++))
    }

    #allows ctrl c to work in the while loop
    trap "break" SIGINT SIGHUP SIGTERM

    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    syncfile $file
    done < <(mongofiles -h $_host -db $_db list)

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
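    The total-files count reads wc -l from a process substitution so no temporary file is needed; the awk strips the whitespace padding that some wc implementations put before the number. A sketch with a stand-in command instead of mongofiles so it runs anywhere:

        _total_files=$(wc -l < <(printf 'a\nb\nc\n') | awk '{ print $1 }')
        echo "$_total_files"   # 3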
  11. tanema revised this gist Oct 6, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions image_migration.sh
    @@ -3,8 +3,8 @@
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    # script params
    _host="${1:?Usage: gridfs host db}"
    _db="${2:?Usage: gridfs host db}"
    _host="${1:?Usage: mongodb host}"
    _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"
    #create temp folder to work in
    mkdir _uploads
  12. tanema revised this gist Oct 6, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions image_migration.sh
    @@ -1,4 +1,7 @@
    #! /bin/bash
    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    # script params
    _host="${1:?Usage: gridfs host db}"
    _db="${2:?Usage: gridfs host db}"
  13. tanema created this gist Oct 6, 2014.
    29 changes: 29 additions & 0 deletions image_migration.sh
    @@ -0,0 +1,29 @@
    #! /bin/bash
    # script params
    _host="${1:?Usage: gridfs host db}"
    _db="${2:?Usage: gridfs host db}"
    _bucket="${3:?Usage: aws dest bucket}"
    #create temp folder to work in
    mkdir _uploads
    cd _uploads
    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    #get the relative path to the file
    directory=${file%/*}
    # make the relative path to where the file should be located
    mkdir -p ./$directory
    #get file from gridfs and put it in its path
    mongofiles -h $_host -db $_db get $file
    done < <(mongofiles -h $_host -db $_db list)
    #sync the whole temp folder with the configured bucket
    echo "Syncing with $_bucket bucket"
    aws s3 sync ./ s3://$_bucket/
    #rm the temp uploads folder
    echo "Cleaning Up"
    cd ..
    rm -rf _uploads
    echo "Done."
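    The script rebuilds each file's GridFS directory path locally before fetching into it: the %/* parameter expansion drops the trailing filename component, and mkdir -p recreates the relative path so mongofiles get can write the file in place. A tiny sketch with a made-up path:

        file="avatars/2014/10/user42.png"   # example path, not from the gist
        directory=${file%/*}                # shortest trailing "/..." removed -> avatars/2014/10
        mkdir -p "./$directory"             # recreate the relative path locally
        echo "$directory"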