Skip to content

Instantly share code, notes, and snippets.

@genomicsITER
Forked from mojaveazure/subsample.sh
Created August 9, 2017 17:11
Show Gist options
  • Save genomicsITER/9d756726b6f88c4bcb9cf9c07c7d918b to your computer and use it in GitHub Desktop.
Save genomicsITER/9d756726b6f88c4bcb9cf9c07c7d918b to your computer and use it in GitHub Desktop.

Revisions

  1. @mojaveazure mojaveazure revised this gist Sep 1, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion subsample.sh
    Original file line number Diff line number Diff line change
    @@ -24,7 +24,7 @@ Optional: 'outdir' is the directory to put the subsampled files \n\
    If not specified, we will use \n\
    `pwd`
    \n\
    'seed' is the seed value to be used for all samples run during \n\
    'seed' is the seed value to be used for all samples run during \n\
    this instance. If not specified, a random number will \n\
    be generated and recorded for posterity's sake \n\
    " >&2
  2. @mojaveazure mojaveazure revised this gist Aug 29, 2015. No changes.
  3. @mojaveazure mojaveazure revised this gist Aug 29, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion subsample.sh
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,7 @@ set -e
    set -u
    set -o pipefail

    # A script to subsample
    # A script to subsample Fasta/FastQ files using Seqtk
    if ! `command -v seqtk > /dev/null 2> /dev/null`
    then
    echo "Failed to find seqtk!"
  4. @mojaveazure mojaveazure revised this gist Aug 29, 2015. 1 changed file with 68 additions and 2 deletions.
    70 changes: 68 additions & 2 deletions subsample.sh
    100644 → 100755
    Original file line number Diff line number Diff line change
    @@ -1,18 +1,84 @@
    #!/bin/bash

    set -e
    set -u
    set -o pipefail

    # A script to subsample
    if ! `command -v seqtk > /dev/null 2> /dev/null`
    then
    echo "Failed to find seqtk!"
    exit 1
    fi

    SEED=`echo "$RANDOM"`
    # Define a usage message
    function Usage() {
    echo -e "\
    Usage: subsample.sh sample_info frac [outdir] [seed]\n\
    \n\
    where: 'sample_info' is a Fasta/FastQ file to be subsampled \n\
    or a list of Fasta/FastQ files to be subsampled \n\
    'frac' is either the fraction or number of reads to be kept \n\
    \n\
    Optional: 'outdir' is the directory to put the subsampled files \n\
    If not specified, we will use \n\
    `pwd`
    \n\
    'seed' is the seed value to be used for all samples run during \n\
    this instance. If not specified, a random number will \n\
    be generated and recorded for posterity's sake \n\
    " >&2
    exit 1
    }

    # Do we have the two required arguments?
    if [ "$#" -lt 2 ]
    then
    Usage
    fi

    # Give the arguments names and set default values
    SAMPLE_INFO=$1
    FRAC=$2
    OUTDIR=${3:-`pwd`}
    SEED=${4:-`echo "$RANDOM"`}

    # Make sure that the outdirectory exists
    mkdir -p ${OUTDIR}

    # Define a function to carry out the subsampling
    function subsample() {
    sample="$1"
    seed="$2"
    frac="$3"
    out="$4"
    echo "Subsampling ${sample}"
    name=`basename "${sample}" | cut -d '.' -f 1`
    ext=`basename "${sample}" | cut -d '.' -f 2`
    seqtk sample -s"${seed}" "${sample}" "${frac}" > "${name}"_sub_"${frac}"."${ext}"
    seqtk sample -s"${seed}" "${sample}" "${frac}" > "${out}"/"${name}"_sub_"${frac}"."${ext}"
    }

    # Export this function
    export -f subsample

    # Get the first line of ${SAMPLE_INFO}
    LINE=`head -1 "${SAMPLE_INFO}"`

    # See if this first line is a file
    if [[ -f "${LINE}" ]]
    then # If so
    if `command -v parallel > /dev/null 2> /dev/null` # See if we have GNU Parallel installed
    then # If so
    cat "${SAMPLE_INFO}" | parallel "subsample {} ${SEED} ${FRAC} ${OUTDIR}" # Run the subsample function on all files in parallel
    else # If we don't have parallel
    for fastq in `cat "${SAMPLE_INFO}"` # Run the subsample function in serial
    do
    subsample "$fastq" "${SEED}" "${FRAC}" "${OUTDIR}"
    done
    fi
    else # If the first line doesn't point to a file, assume we were given a Fasta/FastQ file
    subsample "${SAMPLE_INFO}" "${SEED}" "${FRAC}" "${OUTDIR}"
    fi

    # Record what ${SEED} was, let us know where the final files can be found, and write all this to an out message file
    echo -e "`date`:\nWe used a seed value of ${SEED}\nAll subsampled files can be found at ${OUTDIR}\nThis message can be read again at ${OUTDIR}/outMSG.txt" | tee -a ${OUTDIR}/outMSG.txt
  5. @mojaveazure mojaveazure created this gist Aug 28, 2015.
    18 changes: 18 additions & 0 deletions subsample.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,18 @@
    #!/bin/bash

    if ! `command -v seqtk > /dev/null 2> /dev/null`
    then
    echo "Failed to find seqtk!"
    exit 1
    fi

    SEED=`echo "$RANDOM"`

    function subsample() {
    sample="$1"
    seed="$2"
    frac="$3"
    name=`basename "${sample}" | cut -d '.' -f 1`
    ext=`basename "${sample}" | cut -d '.' -f 2`
    seqtk sample -s"${seed}" "${sample}" "${frac}" > "${name}"_sub_"${frac}"."${ext}"
    }