sirselim · January 24, 2025 08:14 · Aug 23, 2019 · Aug 23, 2019 · Aug 23, 2019 · Aug 23, 2019
diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -57,11 +57,11 @@ awkBody1='$8 != "."'
 awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'
 
 # parallel implementation of the hg19 format and sort
-parallel -j 6 \
+parallel -j 4 \
 "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
 awk '${awkBody1}' | \
 awk '${awkBody2}' | \
-LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
+LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
 bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
 #TODO: need to implement a defined approach to GNU parallel, can't use $THREADS 
 

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -53,16 +53,17 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt
 
 # set awk params as variable for parallel
-awkBody='$8 != "."'
+awkBody1='$8 != "."'
+awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'
 
 # parallel implementation of the hg19 format and sort
-parallel -j 4 \
+parallel -j 6 \
 "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
-awk '${awkBody}' | \
-cut -f 8,9,3-7,1,2,10- | \
+awk '${awkBody1}' | \
+awk '${awkBody2}' | \
 LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
 bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
-#TODO: address correct threading options here, i.e. can't set GNU parallel to $THREADS as well as the parallel arguments to sort and bgzip
+#TODO: need to implement a defined approach to GNU parallel, can't use $THREADS 
 
 # cat all files back together (should retain sort order)
 cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -55,21 +55,14 @@ echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt
 # set awk params as variable for parallel
 awkBody='$8 != "."'
 
-# parallel dry run
-# parallel --dry-run -j 6 \
-# "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \
-# awk ${awkBody} | \
-# cut -f 8,9,3-7,1,2,10- | \
-# LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
-# bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
-
 # parallel implementation of the hg19 format and sort
-parallel -j 6 \
+parallel -j 4 \
 "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
 awk '${awkBody}' | \
 cut -f 8,9,3-7,1,2,10- | \
 LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
 bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
+#TODO: address correct threading options here, i.e. can't set GNU parallel to $THREADS as well as the parallel arguments to sort and bgzip
 
 # cat all files back together (should retain sort order)
 cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -58,8 +58,8 @@ awkBody='$8 != "."'
 # parallel dry run
 # parallel --dry-run -j 6 \
 # "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \
-# awk ${awkBody1} | \
-# awk ${awkBody2} | \
+# awk ${awkBody} | \
+# cut -f 8,9,3-7,1,2,10- | \
 # LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
 # bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
 

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -53,8 +53,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt
 
 # set awk params as variable for parallel
-awkBody1='$8 != "."'
-awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'
+awkBody='$8 != "."'
 
 # parallel dry run
 # parallel --dry-run -j 6 \
@@ -67,8 +66,8 @@ awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'
 # parallel implementation of the hg19 format and sort
 parallel -j 6 \
 "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
-awk '${awkBody1}' | \
-awk '${awkBody2}' | \
+awk '${awkBody}' | \
+cut -f 8,9,3-7,1,2,10- | \
 LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
 bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
 

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -48,13 +48,32 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads
-zcat dbNSFPv${version}_custombuild.gz | \
-  awk '$8 != "."' | \
-  awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
-  LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
-  bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
-# NOTE: to try and overcome disk space limits giving sort 20Gb of RAM
-# TESTING: removed -V (version sort) option and replaced with -n (numeric sort), should improve speed
+
+# create a file with ordered chrosome names
+echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt
+
+# set awk params as variable for parallel
+awkBody1='$8 != "."'
+awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'
+
+# parallel dry run
+# parallel --dry-run -j 6 \
+# "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \
+# awk ${awkBody1} | \
+# awk ${awkBody2} | \
+# LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
+# bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
+
+# parallel implementation of the hg19 format and sort
+parallel -j 6 \
+"zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
+awk '${awkBody1}' | \
+awk '${awkBody2}' | \
+LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
+bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
+
+# cat all files back together (should retain sort order)
+cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz
 
 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
@@ -65,14 +84,4 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
 # clean up
 #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
 #/END hg38
-###
-
-
-
-zgrep -v '#chr' dbNSFP4.0a_variant.chr{19..21}.gz | \
-  awk '$8 != "."' | \
-  awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
-  LC_ALL=C sort --parallel=20 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
-  bgzip -@ 20 > dbNSFPv4.0a.test.hg19.custombuild.gz
-
-  tabix -s 1 -b 2 -e 2 dbNSFPv4.0a.test.hg19.custombuild.gz
+###
diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -65,4 +65,14 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
 # clean up
 #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
 #/END hg38
-###
+###
+
+
+
+zgrep -v '#chr' dbNSFP4.0a_variant.chr{19..21}.gz | \
+  awk '$8 != "."' | \
+  awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
+  LC_ALL=C sort --parallel=20 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
+  bgzip -@ 20 > dbNSFPv4.0a.test.hg19.custombuild.gz
+
+  tabix -s 1 -b 2 -e 2 dbNSFPv4.0a.test.hg19.custombuild.gz
diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -48,8 +48,13 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads
-zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -V -k 1,1 -k 2,2 --compress-program=gzip| bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
+zcat dbNSFPv${version}_custombuild.gz | \
+  awk '$8 != "."' | \
+  awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \
+  LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
+  bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
 # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM
+# TESTING: removed -V (version sort) option and replaced with -n (numeric sort), should improve speed
 
 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -48,7 +48,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads
-zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
+zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -V -k 1,1 -k 2,2 --compress-program=gzip| bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
 # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM
 
 # Create tabix index

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -49,6 +49,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads
 zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
+# NOTE: to try and overcome disk space limits giving sort 20Gb of RAM
 
 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -7,6 +7,8 @@
 # Set to dbNSFP version to download and build
 version="4.0a"
 #TODO: add an option to 'scrape' this from the url to always return latest version
+# define thread number for parallel processing where able
+THREADS=6
 
 # Download dbNSFP database
 wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip"
@@ -22,7 +24,7 @@ zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz
 
 # Create a single file version
 # NOTE: bgzip parameter -@ X represents number of threads
-cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ 6 > dbNSFPv${version}_custom.gz
+cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz
 #TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)
 
 # add header back into file
@@ -46,7 +48,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads
-zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
+zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz
 
 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -46,7 +46,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads
-zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
+zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
 
 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -46,7 +46,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads
-zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
+zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
 
 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -32,7 +32,7 @@ cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
 
 # test annotation
-java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
+# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
 #TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
 #TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build. 
 
@@ -52,7 +52,7 @@ zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
 
 # test hg19 annotation
-java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
+# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
 
 # clean up
 #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')

diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -1 +1,60 @@
-version="4.0a"
+#!/bin/bash
+## small bash script to download and reformat dbNSFP for pipeline 
+## Miles Benton
+## created: 2018-01-13
+## modified: 2019-08-21
+
+# Set to dbNSFP version to download and build
+version="4.0a"
+#TODO: add an option to 'scrape' this from the url to always return latest version
+
+# Download dbNSFP database
+wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip"
+
+# Uncompress
+unzip dbNSFP${version}.zip
+
+# grab header
+zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz
+
+### this section will produce data for hg38 capable pipelines
+## hg38 version
+
+# Create a single file version
+# NOTE: bgzip parameter -@ X represents number of threads
+cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ 6 > dbNSFPv${version}_custom.gz
+#TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)
+
+# add header back into file
+cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz
+
+# Create tabix index
+tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz
+
+# test annotation
+java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
+#TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
+#TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build. 
+
+# clean up
+#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
+#/END hg38
+###
+
+### this section will produce data for hg19 capable pipelines
+## hg19 version
+# for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
+# this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
+# NOTE: bgzip parameter -@ X represents number of threads
+zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz
+
+# Create tabix index
+tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz
+
+# test hg19 annotation
+java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
+
+# clean up
+#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
+#/END hg38
+###
diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
@@ -0,0 +1 @@
+version="4.0a"
No results found