
@tanema
Last active March 4, 2022 09:23

Revisions

  1. tanema revised this gist Oct 20, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions multiprocess_migration.sh
    @@ -69,7 +69,7 @@ syncfile () {

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    - if [[ $file_count -eq 0 ]]; then
    + if [[ $file_count -gt 0 ]]; then
    log_ok "$status Already on server"
    else
    filename="_migration-$_current_file-$(uuidgen)"
    @@ -123,7 +123,7 @@ kill_all_workers () {
    #allows ctrl c to work in the while loop
    trap "kill_all_workers" SIGINT SIGHUP SIGTERM

    - for ((i=0; i <= $thread_count; ++i)); do
    + for ((i=0; i < $thread_count; ++i)); do
    echo "starting worker $i"
    #call process on this chunk of files
    process_lines $i $((lines_per_file * i)) &
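    Note on the second hunk: with the old "<=" bound the loop starts thread_count + 1 workers (indices 0 through thread_count), and the extra one only re-reads the tail of the listing that the last real worker already covers. A minimal sketch with a made-up thread_count:

        thread_count=4
        for ((i=0; i <= thread_count; ++i)); do echo "would start worker $i"; done   # 5 workers: 0..4
        for ((i=0; i <  thread_count; ++i)); do echo "would start worker $i"; done   # 4 workers: 0..3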
  2. tanema revised this gist Oct 17, 2014. 1 changed file with 43 additions and 25 deletions.
    68 changes: 43 additions & 25 deletions multiprocess_migration.sh
    @@ -1,31 +1,52 @@
    #! /bin/bash
    #how many times to split up the list
    ###################### USAGE ######################################
    usage() {
    echo "
    Usage: mongotos3 [-t n] mongo_host mongo_collection s3_bucket
    -t : number of parallel processes to use
    mongo_host : the host of the mongodb server
    mongo_collection : the collection to collect the gridfs data from
    s3_bucket : the name of the bucket you want to cp the files to
    "
    }
    ###################### END USAGE ##################################
    # how many times to split up the list
    thread_count=8
    # parallel process pid array
    _worker_pids=()
    # incremented variable to see progress
    _current_file=1

    while getopts 't:' opt; do
    # get options just -t for setting how many threads you want
    while getopts 't:*:' opt; do
    case $opt in
    t) thread_count=$OPTARG;;
    *)
    usage
    exit
    ;;
    esac
    done
    shift $((OPTIND-1))

    # script params
    if [ "$#" -ne 3 ]
    then
    usage
    fi
    # mongo host
    _host="${1:?Usage: mongodb host}"
    #mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    #s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"
    #incremented variable to see progress
    _current_file=1
    _host="${1:?Mongo Host Required}"
    # mongo collection to pull grid_fs data from
    _db="${2:?Mongo Collection required}"
    # s3 bucket for everything to be synced to
    _bucket="${3:?AWS Bucket Required}"
    # all the files
    _files_list=$(mongofiles -h $_host -db $_db list)
    #total files to be synced
    # total files to be synced
    _total_files=$(echo "$_files_list" | wc -l | awk {'print $1'})

    # how many lines to send to each thread
    ((lines_per_file = (_total_files + thread_count - 1) / thread_count))
    ((lines_per_file=(_total_files + thread_count - 1) / thread_count))
    ###################### LOGGING ####################################
    RED=$(tput setaf 1)
    GREEN=$(tput setaf 2)
    NORMAL=$(tput sgr0)
    @@ -38,8 +59,10 @@ log_fail() {
    let COL=$(tput cols)-${#1}+${#RED}+${#NORMAL}
    printf "%s%${COL}s" "$1" "$RED[FAIL]$NORMAL"
    }

    # param 1: filepath from mongo
    ###################### END LOGGING ################################
    ###################### METHOD DEFINITIONS #########################
    # param $1: filepath from mongo
    # param $2: worker identity number
    syncfile () {
    status="(worker $2) $_current_file/$lines_per_file $_bucket/$1"
    ((_current_file++))
    @@ -71,7 +94,8 @@ syncfile () {
    fi
    }

    # $1 is the split file list
    # param: $1 worker identity number
    # param: $2 starting line number in the file to process
    process_lines () {
    while read -r line; do
    #get filename
    @@ -83,8 +107,8 @@ process_lines () {
    done < <(echo "$_files_list" | head -n $(($2 + $lines_per_file)) | tail -n $lines_per_file)
    }

    #used for kill signals
    #calls kill on each pid
    # used for kill signals
    # calls kill on each pid
    kill_all_workers () {
    echo 'killing all workers'
    for ((i=0; i <= ${#_worker_pids[@]}; ++i)); do
    @@ -94,17 +118,11 @@ kill_all_workers () {
    #cleanup any files that were interrupted
    rm _migration-* > /dev/null 2>&1
    }

    ##############################################################################
    # #
    # MAIN METHOD AREA #
    # #
    ##############################################################################
    ###################### END METHOD DEFINITIONS #####################

    #allows ctrl c to work in the while loop
    trap "kill_all_workers" SIGINT SIGHUP SIGTERM

    _worker_pids=()
    for ((i=0; i <= $thread_count; ++i)); do
    echo "starting worker $i"
    #call process on this chunk of files
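    The ceiling division added here, ((lines_per_file = (_total_files + thread_count - 1) / thread_count)), rounds up so the chunks always cover every line of the listing. A small sketch with made-up counts, plus a hypothetical invocation under the new option handling: ./multiprocess_migration.sh -t 4 localhost mydb my-bucket (host, collection and bucket names invented here).

        # 1001 files spread over 8 workers
        _total_files=1001
        thread_count=8
        ((lines_per_file = (_total_files + thread_count - 1) / thread_count))
        echo "$lines_per_file"   # 126; 8 * 126 = 1008 >= 1001, so no line is dropped
        # plain integer division would give 125, and 8 * 125 = 1000 would miss the last file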
  3. tanema revised this gist Oct 16, 2014. 2 changed files with 0 additions and 98 deletions.
    35 changes: 0 additions & 35 deletions bulk_image_migration.sh
    @@ -1,35 +0,0 @@
    #! /bin/bash
    # this script will pull all the files out of the database first then sync them
    # good if you have a lot of space or low data. faster for syncing all the files at once

    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    # script params
    _host="${1:?Usage: mongodb host}"
    _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"
    #create temp folder to work in
    mkdir _uploads
    cd _uploads
    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    #get the relative path to the file
    directory=${file%/*}
    # make the relative path to where the file should be located
    mkdir -p ./$directory
    #get file from gridfs and put it in its path
    mongofiles -h $_host -db $_db get $file
    done < <(mongofiles -h $_host -db $_db list)
    #sync the whole temp folder with the configured bucket
    echo "Syncing with $_bucket bucket"
    aws s3 sync ./ s3://$_bucket/
    #rm the temp uploads folder
    echo "Cleaning Up"
    cd ..
    rm -rf _uploads
    echo "Done."
    63 changes: 0 additions & 63 deletions per_file_image_migration.sh
    @@ -1,63 +0,0 @@
    #! /bin/bash
    # script params
    # mongo host
    _host="${1:?Usage: mongodb host}"
    # mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    # s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"

    # incremented variable to see progress
    _current_file=1
    # total files to be synced
    _total_files=$(wc -l < <(mongofiles -h $_host -db $_db list) | awk {'print $1'})

    # param 1: filepath from mongo
    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    #get file from gridfs and create a temp file of it
    echo "creating $1"
    mongofiles -h $_host -db $_db get --local _temp $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    #send file status
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    #rm temp file gotten from gridfs
    rm _temp
    fi
    fi

    ((_current_file++))
    }

    #allows ctrl c to work in the while loop
    trap "break" SIGINT SIGHUP SIGTERM

    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    # sync the file with the server
    syncfile $file
    done < <(mongofiles -h $_host -db $_db list)

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
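    Both deleted scripts (and the multiprocess one that replaces them) read the mongofiles listing line by line, keep only the first tab-separated field as the filename, and skip the "connected to …" banner that the listing starts with. A self-contained sketch of that filter, with made-up sample input standing in for the mongofiles output:

        while read -r line; do
          file=$(echo "$line" | awk -F'\t' '{ print $1 }')
          [[ $file == 'connected to'* ]] && continue
          echo "would sync: $file"
        done < <(printf 'connected to: localhost\navatars/a.png\t123\navatars/b.png\t456\n')
        # prints: would sync: avatars/a.png, then would sync: avatars/b.png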
  4. tanema revised this gist Oct 16, 2014. 1 changed file with 24 additions and 9 deletions.
    33 changes: 24 additions & 9 deletions multiprocess_migration.sh
    @@ -25,13 +25,29 @@ _total_files=$(echo "$_files_list" | wc -l | awk {'print $1'})

    # how many lines to send to each thread
    ((lines_per_file = (_total_files + thread_count - 1) / thread_count))

    RED=$(tput setaf 1)
    GREEN=$(tput setaf 2)
    NORMAL=$(tput sgr0)
    log_ok() {
    let COL=$(tput cols)-${#1}+${#GREEN}+${#NORMAL}
    printf "%s%${COL}s" "$1" "$GREEN[OK]$NORMAL"
    }

    log_fail() {
    let COL=$(tput cols)-${#1}+${#RED}+${#NORMAL}
    printf "%s%${COL}s" "$1" "$RED[FAIL]$NORMAL"
    }

    # param 1: filepath from mongo
    syncfile () {
    status="(worker $2) $_current_file/$lines_per_file $_bucket/$1"
    ((_current_file++))

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    if [[ $file_count -eq 0 ]]; then
    log_ok "$status Already on server"
    else
    filename="_migration-$_current_file-$(uuidgen)"
    #get file from gridfs and create a temp file of it
    @@ -42,19 +58,17 @@ syncfile () {
    aws s3 cp $filename s3://$_bucket/$1 --dryrun --quiet
    #send file status and if this file migration succeeded
    if [ $? -eq 0 ]; then
    echo "$2 $_current_file / $lines_per_file $_bucket/$1 OK"
    log_ok "$status"
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL"
    log_fail "$status"
    fi

    #rm temp file gotten from gridfs
    rm $filename
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL get"
    log_fail "$status Get from db failed"
    fi
    fi

    ((_current_file++))
    }

    # $1 is the split file list
    @@ -77,6 +91,8 @@ kill_all_workers () {
    kill -6 ${_worker_pids[i]} > /dev/null 2>&1
    done
    echo 'migration aborted'
    #cleanup any files that were interrupted
    rm _migration-* > /dev/null 2>&1
    }

    ##############################################################################
    @@ -105,5 +121,4 @@ done
    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi

    fi
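    The log_ok/log_fail helpers introduced here right-align the status tag: COL is the terminal width minus the message length, with the lengths of the invisible colour escapes added back so the printf field width comes out at the last column. A standalone sketch of the same pattern, using a made-up status message:

        GREEN=$(tput setaf 2)
        NORMAL=$(tput sgr0)
        msg="(worker 3) 42/126 mybucket/avatars/a.png"   # hypothetical status line
        let COL=$(tput cols)-${#msg}+${#GREEN}+${#NORMAL}
        printf "%s%${COL}s\n" "$msg" "$GREEN[OK]$NORMAL"   # [OK] lands flush against the right edge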
  5. tanema revised this gist Oct 15, 2014. 1 changed file with 109 additions and 0 deletions.
    109 changes: 109 additions & 0 deletions multiprocess_migration.sh
    @@ -0,0 +1,109 @@
    #! /bin/bash
    #how many times to split up the list
    thread_count=8

    while getopts 't:' opt; do
    case $opt in
    t) thread_count=$OPTARG;;
    esac
    done
    shift $((OPTIND-1))

    # script params
    # mongo host
    _host="${1:?Usage: mongodb host}"
    #mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    #s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"
    #incremented variable to see progress
    _current_file=1
    # all the files
    _files_list=$(mongofiles -h $_host -db $_db list)
    #total files to be synced
    _total_files=$(echo "$_files_list" | wc -l | awk {'print $1'})

    # how many lines to send to each thread
    ((lines_per_file = (_total_files + thread_count - 1) / thread_count))

    # param 1: filepath from mongo
    syncfile () {
    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    filename="_migration-$_current_file-$(uuidgen)"
    #get file from gridfs and create a temp file of it
    mongofiles -h $_host -db $_db get --local $filename $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
    #send it to s3
    aws s3 cp $filename s3://$_bucket/$1 --dryrun --quiet
    #send file status and if this file migration succeeded
    if [ $? -eq 0 ]; then
    echo "$2 $_current_file / $lines_per_file $_bucket/$1 OK"
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL"
    fi

    #rm temp file gotten from gridfs
    rm $filename
    else
    echo "$_current_file / $lines_per_file $_bucket/$1 FAIL get"
    fi
    fi

    ((_current_file++))
    }

    # $1 is the split file list
    process_lines () {
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    # sync the file with the server
    syncfile $file $1
    done < <(echo "$_files_list" | head -n $(($2 + $lines_per_file)) | tail -n $lines_per_file)
    }

    #used for kill signals
    #calls kill on each pid
    kill_all_workers () {
    echo 'killing all workers'
    for ((i=0; i <= ${#_worker_pids[@]}; ++i)); do
    kill -6 ${_worker_pids[i]} > /dev/null 2>&1
    done
    echo 'migration aborted'
    }

    ##############################################################################
    # #
    # MAIN METHOD AREA #
    # #
    ##############################################################################

    #allows ctrl c to work in the while loop
    trap "kill_all_workers" SIGINT SIGHUP SIGTERM

    _worker_pids=()
    for ((i=0; i <= $thread_count; ++i)); do
    echo "starting worker $i"
    #call process on this chunk of files
    process_lines $i $((lines_per_file * i)) &
    #record the pid for cleanup and waiting
    _worker_pids+=($!)
    done

    #wait for each process to finish
    for ((i=0; i <= ${#_worker_pids[@]}; ++i)); do
    wait ${_worker_pids[i]} > /dev/null 2>&1
    done

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
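    process_lines carves its chunk out of the shared listing with head and tail: head keeps everything up to the end of the chunk, and tail then keeps only the last lines_per_file lines of that. A sketch with a made-up ten-line listing:

        _files_list=$(printf 'file%02d\n' {1..10})
        lines_per_file=4
        start=$((lines_per_file * 1))    # worker 1 (zero-indexed)
        echo "$_files_list" | head -n $((start + lines_per_file)) | tail -n $lines_per_file
        # prints file05 .. file08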

  6. tanema revised this gist Oct 14, 2014. 1 changed file with 23 additions and 19 deletions.
    42 changes: 23 additions & 19 deletions per_file_image_migration.sh
    @@ -1,41 +1,44 @@
    #! /bin/bash
    # this script will pull out one file at a time and push it to s3
    # good if you have low space or big amounts of data, slower because it has to connect to amazon for each file

    # script params
    # mongo host
    _host="${1:?Usage: mongodb host}"
    # mongo collection to pull grid_fs data from
    _db="${2:?Usage: mongodb collection}"
    # s3 bucket for everything to be synced to
    _bucket="${3:?Usage: aws dest bucket}"

    _current_file=1
    # incremented variable to see progress
    _current_file=1
    # total files to be synced
    _total_files=$(wc -l < <(mongofiles -h $_host -db $_db list) | awk {'print $1'})

    # param 1: filepath from mongo
    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"

    #check if file is already on the server
    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    #get file from gridfs and create a temp file of it
    echo "creating $1"
    mongofiles -h $_host -db $_db get --local _temp $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    #send file status
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    #rm temp file gotten from gridfs
    rm _temp
    fi

    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    rm _temp
    fi

    ((_current_file++))
    @@ -50,10 +53,11 @@ while read -r line; do
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    # sync the file with the server
    syncfile $file
    done < <(mongofiles -h $_host -db $_db list)

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
    fi
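    The reworked flow checks S3 before doing any work: aws s3 ls prints one line per matching key, so a non-zero count means the object is already there (it matches by prefix, so a longer key with the same prefix would also count). A hedged sketch of the check with placeholder bucket and key names:

        _bucket="example-bucket"     # placeholder
        key="avatars/user42.png"     # placeholder
        file_count=$((0 + $(aws s3 ls "s3://$_bucket/$key" | wc -l)))
        if [[ $file_count -gt 0 ]]; then
          echo "already on server, skipping"
        else
          echo "needs upload"
        fi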
  7. tanema revised this gist Oct 14, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion per_file_image_migration.sh
    @@ -8,7 +8,7 @@ _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"

    _current_file=1
    _total_files=$(wc -l < <(mongofiles -h localhost -db shiftee_development list) | awk {'print $1'})
    _total_files=$(wc -l < <(mongofiles -h $_host -db $_db list) | awk {'print $1'})

    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"
  8. tanema revised this gist Oct 14, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion per_file_image_migration.sh
    @@ -1,5 +1,5 @@
    #! /bin/bash
    #this script will pull out one file at a time and push it to s3
    # this script will pull out one file at a time and push it to s3
    # good if you have low space or big amounts of data, slower because it has to connect to amazon for each file

    # script params
  9. tanema revised this gist Oct 14, 2014. 2 changed files with 2 additions and 0 deletions.
    1 change: 1 addition & 0 deletions bulk_image_migration.sh
    @@ -1,5 +1,6 @@
    #! /bin/bash
    # this script will pull all the files out of the database first then sync them
    # good if you have a lot of space or low data. faster for syncing all the files at once

    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    1 change: 1 addition & 0 deletions per_file_image_migration.sh
    @@ -1,5 +1,6 @@
    #! /bin/bash
    #this script will pull out one file at a time and push it to s3
    # good if you have low space or big amounts of data, slower because it has to connect to amazon for each file

    # script params
    _host="${1:?Usage: mongodb host}"
  10. tanema revised this gist Oct 14, 2014. 2 changed files with 60 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions image_migration.sh → bulk_image_migration.sh
    @@ -1,4 +1,6 @@
    #! /bin/bash
    # this script will pull all the files out of the database first then sync them

    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    58 changes: 58 additions & 0 deletions per_file_image_migration.sh
    @@ -0,0 +1,58 @@
    #! /bin/bash
    #this script will pull out one file at a time and push it to s3

    # script params
    _host="${1:?Usage: mongodb host}"
    _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"

    _current_file=1
    _total_files=$(wc -l < <(mongofiles -h localhost -db shiftee_development list) | awk {'print $1'})

    syncfile () {
    echo "$_current_file / $_total_files $_bucket/$1"

    file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
    if [[ $file_count -gt 0 ]]; then
    echo "File is already on server"
    else
    #get file from gridfs and create a temp file of it
    echo "creating $1"
    mongofiles -h $_host -db $_db get --local _temp $1 > /dev/null 2>&1
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    #send it to s3
    echo "sending to s3://$_bucket/$1"
    aws s3 cp _temp s3://$_bucket/$1 > /dev/null 2>&1
    if [ $? -eq 0 ]; then
    echo OK
    else
    echo FAIL
    fi

    rm _temp
    fi

    ((_current_file++))
    }

    #allows ctrl c to work in the while loop
    trap "break" SIGINT SIGHUP SIGTERM

    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    syncfile $file
    done < <(mongofiles -h $_host -db $_db list)

    #if no errors say we are complete
    if [ $? -eq 0 ]; then
    echo DONE
    fi
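    The total-files count reads wc -l from a process substitution so no temporary file is needed; the awk strips the whitespace padding that some wc implementations put before the number. A sketch with a stand-in command instead of mongofiles so it runs anywhere:

        _total_files=$(wc -l < <(printf 'a\nb\nc\n') | awk '{ print $1 }')
        echo "$_total_files"   # 3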
  11. tanema revised this gist Oct 6, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions image_migration.sh
    @@ -3,8 +3,8 @@
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    # script params
    _host="${1:?Usage: gridfs host db}"
    _db="${2:?Usage: gridfs host db}"
    _host="${1:?Usage: mongodb host}"
    _db="${2:?Usage: mongodb collection}"
    _bucket="${3:?Usage: aws dest bucket}"
    #create temp folder to work in
    mkdir _uploads
  12. tanema revised this gist Oct 6, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions image_migration.sh
    @@ -1,4 +1,7 @@
    #! /bin/bash
    # initial script taken from:
    # http://blog.vladimirm.com/2011/06/export-files-from-mongodb-gridfs-with-directory-paths/
    # kudos to Vladimir Momirov
    # script params
    _host="${1:?Usage: gridfs host db}"
    _db="${2:?Usage: gridfs host db}"
  13. tanema created this gist Oct 6, 2014.
    29 changes: 29 additions & 0 deletions image_migration.sh
    @@ -0,0 +1,29 @@
    #! /bin/bash
    # script params
    _host="${1:?Usage: gridfs host db}"
    _db="${2:?Usage: gridfs host db}"
    _bucket="${3:?Usage: aws dest bucket}"
    #create temp folder to work in
    mkdir _uploads
    cd _uploads
    #for each file in gridfs
    while read -r line; do
    #get filename
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #if connected message then continue
    [[ $file == 'connected to'* ]] && continue
    #get the relative path to the file
    directory=${file%/*}
    # make the relative path to where the file should be located
    mkdir -p ./$directory
    #get file from gridfs and put it in its path
    mongofiles -h $_host -db $_db get $file
    done < <(mongofiles -h $_host -db $_db list)
    #sync the whole temp folder with the configured bucket
    echo "Syncing with $_bucket bucket"
    aws s3 sync ./ s3://$_bucket/
    #rm the temp uploads folder
    echo "Cleaning Up"
    cd ..
    rm -rf _uploads
    echo "Done."
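    The script rebuilds each file's GridFS directory path locally before fetching into it: the %/* parameter expansion drops the trailing filename component, and mkdir -p recreates the relative path so mongofiles get can write the file in place. A tiny sketch with a made-up path:

        file="avatars/2014/10/user42.png"   # example path, not from the gist
        directory=${file%/*}                # shortest trailing "/..." removed -> avatars/2014/10
        mkdir -p "./$directory"             # recreate the relative path locally
        echo "$directory"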