Skip to content

Instantly share code, notes, and snippets.

@sirselim
Last active January 24, 2025 08:14
Show Gist options
  • Select an option

  • Save sirselim/dcaad07523c90b46c1c0685efbc5d04e to your computer and use it in GitHub Desktop.

Select an option

Save sirselim/dcaad07523c90b46c1c0685efbc5d04e to your computer and use it in GitHub Desktop.

Revisions

  1. sirselim revised this gist Aug 23, 2019. No changes.
  2. sirselim revised this gist Aug 23, 2019. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -57,11 +57,11 @@ awkBody1='$8 != "."'
    awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'

    # parallel implementation of the hg19 format and sort
    parallel -j 6 \
    parallel -j 4 \
    "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
    awk '${awkBody1}' | \
    awk '${awkBody2}' | \
    LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
    #TODO: need to implement a defined approach to GNU parallel, can't use $THREADS

  3. sirselim revised this gist Aug 23, 2019. No changes.
  4. sirselim revised this gist Aug 23, 2019. 1 changed file with 6 additions and 5 deletions.
    11 changes: 6 additions & 5 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -53,16 +53,17 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt

    # set awk params as variable for parallel
    awkBody='$8 != "."'
    awkBody1='$8 != "."'
    awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'

    # parallel implementation of the hg19 format and sort
    parallel -j 4 \
    parallel -j 6 \
    "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
    awk '${awkBody}' | \
    cut -f 8,9,3-7,1,2,10- | \
    awk '${awkBody1}' | \
    awk '${awkBody2}' | \
    LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
    #TODO: address correct threading options here, i.e. can't set GNU parallel to $THREADS as well as the parallel arguments to sort and bgzip
    #TODO: need to implement a defined approach to GNU parallel, can't use $THREADS

    # cat all files back together (should retain sort order)
    cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz
  5. sirselim revised this gist Aug 23, 2019. 1 changed file with 2 additions and 9 deletions.
    11 changes: 2 additions & 9 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -55,21 +55,14 @@ echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt
    # set awk params as variable for parallel
    awkBody='$8 != "."'

    # parallel dry run
    # parallel --dry-run -j 6 \
    # "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \
    # awk ${awkBody} | \
    # cut -f 8,9,3-7,1,2,10- | \
    # LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    # bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt

    # parallel implementation of the hg19 format and sort
    parallel -j 6 \
    parallel -j 4 \
    "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
    awk '${awkBody}' | \
    cut -f 8,9,3-7,1,2,10- | \
    LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
    #TODO: address correct threading options here, i.e. can't set GNU parallel to $THREADS as well as the parallel arguments to sort and bgzip

    # cat all files back together (should retain sort order)
    cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz
  6. sirselim revised this gist Aug 23, 2019. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -58,8 +58,8 @@ awkBody='$8 != "."'
    # parallel dry run
    # parallel --dry-run -j 6 \
    # "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \
    # awk ${awkBody1} | \
    # awk ${awkBody2} | \
    # awk ${awkBody} | \
    # cut -f 8,9,3-7,1,2,10- | \
    # LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    # bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt

  7. sirselim revised this gist Aug 23, 2019. 1 changed file with 3 additions and 4 deletions.
    7 changes: 3 additions & 4 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -53,8 +53,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt

    # set awk params as variable for parallel
    awkBody1='$8 != "."'
    awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'
    awkBody='$8 != "."'

    # parallel dry run
    # parallel --dry-run -j 6 \
    @@ -67,8 +66,8 @@ awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'
    # parallel implementation of the hg19 format and sort
    parallel -j 6 \
    "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
    awk '${awkBody1}' | \
    awk '${awkBody2}' | \
    awk '${awkBody}' | \
    cut -f 8,9,3-7,1,2,10- | \
    LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt

  8. sirselim revised this gist Aug 23, 2019. 1 changed file with 27 additions and 18 deletions.
    45 changes: 27 additions & 18 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -48,13 +48,32 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | \
    awk '$8 != "."' | \
    awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
    LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
    # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM
    # TESTING: removed -V (version sort) option and replaced with -n (numeric sort), should improve speed

    # create a file with ordered chrosome names
    echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt

    # set awk params as variable for parallel
    awkBody1='$8 != "."'
    awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'

    # parallel dry run
    # parallel --dry-run -j 6 \
    # "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \
    # awk ${awkBody1} | \
    # awk ${awkBody2} | \
    # LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    # bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt

    # parallel implementation of the hg19 format and sort
    parallel -j 6 \
    "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
    awk '${awkBody1}' | \
    awk '${awkBody2}' | \
    LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt

    # cat all files back together (should retain sort order)
    cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
    @@ -65,14 +84,4 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
    # clean up
    #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
    #/END hg38
    ###



    zgrep -v '#chr' dbNSFP4.0a_variant.chr{19..21}.gz | \
    awk '$8 != "."' | \
    awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
    LC_ALL=C sort --parallel=20 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ 20 > dbNSFPv4.0a.test.hg19.custombuild.gz

    tabix -s 1 -b 2 -e 2 dbNSFPv4.0a.test.hg19.custombuild.gz
    ###
  9. sirselim revised this gist Aug 22, 2019. 1 changed file with 11 additions and 1 deletion.
    12 changes: 11 additions & 1 deletion dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -65,4 +65,14 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
    # clean up
    #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
    #/END hg38
    ###
    ###



    zgrep -v '#chr' dbNSFP4.0a_variant.chr{19..21}.gz | \
    awk '$8 != "."' | \
    awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
    LC_ALL=C sort --parallel=20 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ 20 > dbNSFPv4.0a.test.hg19.custombuild.gz

    tabix -s 1 -b 2 -e 2 dbNSFPv4.0a.test.hg19.custombuild.gz
  10. sirselim revised this gist Aug 21, 2019. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -48,8 +48,13 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -V -k 1,1 -k 2,2 --compress-program=gzip| bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
    zcat dbNSFPv${version}_custombuild.gz | \
    awk '$8 != "."' | \
    awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
    LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
    bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
    # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM
    # TESTING: removed -V (version sort) option and replaced with -n (numeric sort), should improve speed

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
  11. sirselim revised this gist Aug 21, 2019. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -48,7 +48,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -V -k 1,1 -k 2,2 --compress-program=gzip| bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
    # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM

    # Create tabix index
  12. sirselim revised this gist Aug 21, 2019. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -49,6 +49,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
    # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
  13. sirselim revised this gist Aug 21, 2019. 1 changed file with 4 additions and 2 deletions.
    6 changes: 4 additions & 2 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -7,6 +7,8 @@
    # Set to dbNSFP version to download and build
    version="4.0a"
    #TODO: add an option to 'scrape' this from the url to always return latest version
    # define thread number for parallel processing where able
    THREADS=6

    # Download dbNSFP database
    wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip"
    @@ -22,7 +24,7 @@ zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz

    # Create a single file version
    # NOTE: bgzip parameter -@ X represents number of threads
    cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ 6 > dbNSFPv${version}_custom.gz
    cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz
    #TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)

    # add header back into file
    @@ -46,7 +48,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
  14. sirselim revised this gist Aug 21, 2019. No changes.
  15. sirselim revised this gist Aug 21, 2019. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -46,7 +46,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
  16. sirselim revised this gist Aug 21, 2019. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -46,7 +46,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
    # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
  17. sirselim revised this gist Aug 20, 2019. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -32,7 +32,7 @@ cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz

    # test annotation
    java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
    # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
    #TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
    #TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build.

    @@ -52,7 +52,7 @@ zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

    # test hg19 annotation
    java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
    # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf

    # clean up
    #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
  18. sirselim revised this gist Aug 20, 2019. 1 changed file with 60 additions and 1 deletion.
    61 changes: 60 additions & 1 deletion dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -1 +1,60 @@
    version="4.0a"
    #!/bin/bash
    ## small bash script to download and reformat dbNSFP for pipeline
    ## Miles Benton
    ## created: 2018-01-13
    ## modified: 2019-08-21

    # Set to dbNSFP version to download and build
    version="4.0a"
    #TODO: add an option to 'scrape' this from the url to always return latest version

    # Download dbNSFP database
    wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip"

    # Uncompress
    unzip dbNSFP${version}.zip

    # grab header
    zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz

    ### this section will produce data for hg38 capable pipelines
    ## hg38 version

    # Create a single file version
    # NOTE: bgzip parameter -@ X represents number of threads
    cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ 6 > dbNSFPv${version}_custom.gz
    #TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)

    # add header back into file
    cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz

    # test annotation
    java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
    #TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
    #TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build.

    # clean up
    #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
    #/END hg38
    ###

    ### this section will produce data for hg19 capable pipelines
    ## hg19 version
    # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
    # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
    # NOTE: bgzip parameter -@ X represents number of threads
    zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz

    # Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

    # test hg19 annotation
    java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf

    # clean up
    #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
    #/END hg38
    ###
  19. sirselim created this gist Aug 20, 2019.
    1 change: 1 addition & 0 deletions dbNSFP_pipeline_build.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    version="4.0a"