Last active
January 24, 2025 08:14
-
-
Save sirselim/dcaad07523c90b46c1c0685efbc5d04e to your computer and use it in GitHub Desktop.
Revisions
-
sirselim revised this gist
Aug 23, 2019 . No changes.There are no files selected for viewing
-
sirselim revised this gist
Aug 23, 2019 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -57,11 +57,11 @@ awkBody1='$8 != "."' awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}' # parallel implementation of the hg19 format and sort parallel -j 4 \ "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \ awk '${awkBody1}' | \ awk '${awkBody2}' | \ LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt #TODO: need to implement a defined approach to GNU parallel, can't use $THREADS -
sirselim revised this gist
Aug 23, 2019 . No changes.There are no files selected for viewing
-
sirselim revised this gist
Aug 23, 2019 . 1 changed file with 6 additions and 5 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -53,16 +53,17 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt # set awk params as variable for parallel awkBody1='$8 != "."' awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}' # parallel implementation of the hg19 format and sort parallel -j 6 \ "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \ awk '${awkBody1}' | \ awk '${awkBody2}' | \ LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt #TODO: need to implement a defined approach to GNU parallel, can't use $THREADS # cat all files back together (should retain sort order) cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz -
sirselim revised this gist
Aug 23, 2019 . 1 changed file with 2 additions and 9 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -55,21 +55,14 @@ echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt # set awk params as variable for parallel awkBody='$8 != "."' # parallel implementation of the hg19 format and sort parallel -j 4 \ "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \ awk '${awkBody}' | \ cut -f 8,9,3-7,1,2,10- | \ LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt #TODO: address correct threading options here, i.e. can't set GNU parallel to $THREADS as well as the parallel arguments to sort and bgzip # cat all files back together (should retain sort order) cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz -
sirselim revised this gist
Aug 23, 2019 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -58,8 +58,8 @@ awkBody='$8 != "."' # parallel dry run # parallel --dry-run -j 6 \ # "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \ # awk ${awkBody} | \ # cut -f 8,9,3-7,1,2,10- | \ # LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ # bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt -
sirselim revised this gist
Aug 23, 2019 . 1 changed file with 3 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -53,8 +53,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt # set awk params as variable for parallel awkBody='$8 != "."' # parallel dry run # parallel --dry-run -j 6 \ @@ -67,8 +66,8 @@ awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1' # parallel implementation of the hg19 format and sort parallel -j 6 \ "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \ awk '${awkBody}' | \ cut -f 8,9,3-7,1,2,10- | \ LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt -
sirselim revised this gist
Aug 23, 2019 . 1 changed file with 27 additions and 18 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -48,13 +48,32 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads # create a file with ordered chrosome names echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt # set awk params as variable for parallel awkBody1='$8 != "."' awkBody2='BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1' # parallel dry run # parallel --dry-run -j 6 \ # "zgrep -v '#chr' dbNSFP4.0a_variant.chr{}.gz | \ # awk ${awkBody1} | \ # awk ${awkBody2} | \ # LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ # bgzip -@ 2 > dbNSFPv4.0a.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt # parallel implementation of the hg19 format and sort parallel -j 6 \ "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \ awk '${awkBody1}' | \ awk '${awkBody2}' | \ LC_ALL=C sort --parallel=2 -n -S 2G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt # cat all files back together (should retain sort order) cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz @@ -65,14 +84,4 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz # clean up #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates') #/END hg38 ### -
sirselim revised this gist
Aug 22, 2019 . 1 changed file with 11 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -65,4 +65,14 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz # clean up #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates') #/END hg38 ### zgrep -v '#chr' dbNSFP4.0a_variant.chr{19..21}.gz | \ awk '$8 != "."' | \ awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \ LC_ALL=C sort --parallel=20 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ bgzip -@ 20 > dbNSFPv4.0a.test.hg19.custombuild.gz tabix -s 1 -b 2 -e 2 dbNSFPv4.0a.test.hg19.custombuild.gz -
sirselim revised this gist
Aug 21, 2019 . 1 changed file with 6 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -48,8 +48,13 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads zcat dbNSFPv${version}_custombuild.gz | \ awk '$8 != "."' | \ awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| \ LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM # TESTING: removed -V (version sort) option and replaced with -n (numeric sort), should improve speed # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz -
sirselim revised this gist
Aug 21, 2019 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -48,7 +48,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -n -S 20G -T . -V -k 1,1 -k 2,2 --compress-program=gzip| bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM # Create tabix index -
sirselim revised this gist
Aug 21, 2019 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -49,6 +49,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz # NOTE: to try and overcome disk space limits giving sort 20Gb of RAM # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz -
sirselim revised this gist
Aug 21, 2019 . 1 changed file with 4 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -7,6 +7,8 @@ # Set to dbNSFP version to download and build version="4.0a" #TODO: add an option to 'scrape' this from the url to always return latest version # define thread number for parallel processing where able THREADS=6 # Download dbNSFP database wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip" @@ -22,7 +24,7 @@ zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz # Create a single file version # NOTE: bgzip parameter -@ X represents number of threads cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz #TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them) # add header back into file @@ -46,7 +48,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=${THREADS} -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ ${THREADS} > dbNSFPv${version}.hg19.custombuild.gz # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz -
sirselim revised this gist
Aug 21, 2019 . No changes.There are no files selected for viewing
-
sirselim revised this gist
Aug 21, 2019 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -46,7 +46,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -S 20G -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz -
sirselim revised this gist
Aug 21, 2019 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -46,7 +46,7 @@ tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort --parallel=6 -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz -
sirselim revised this gist
Aug 20, 2019 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -32,7 +32,7 @@ cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # test annotation # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf #TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates. #TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build. @@ -52,7 +52,7 @@ zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t" tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz # test hg19 annotation # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf # clean up #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates') -
sirselim revised this gist
Aug 20, 2019 . 1 changed file with 60 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1 +1,60 @@ #!/bin/bash ## small bash script to download and reformat dbNSFP for pipeline ## Miles Benton ## created: 2018-01-13 ## modified: 2019-08-21 # Set to dbNSFP version to download and build version="4.0a" #TODO: add an option to 'scrape' this from the url to always return latest version # Download dbNSFP database wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip" # Uncompress unzip dbNSFP${version}.zip # grab header zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz ### this section will produce data for hg38 capable pipelines ## hg38 version # Create a single file version # NOTE: bgzip parameter -@ X represents number of threads cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ 6 > dbNSFPv${version}_custom.gz #TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them) # add header back into file cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz # test annotation java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf #TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates. #TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build. # clean up #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates') #/END hg38 ### ### this section will produce data for hg19 capable pipelines ## hg19 version # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output # NOTE: bgzip parameter -@ X represents number of threads zcat dbNSFPv${version}_custombuild.gz | awk '$8 != "."' | awk 'BEGIN{FS=OFS="\t"} {$1=$8 && $2=$9; NF--}1'| LC_ALL=C sort -T . -V -k 1,1 -k 2,2 | bgzip -@ 6 > dbNSFPv${version}.hg19.custombuild.gz # Create tabix index tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz # test hg19 annotation java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf # clean up #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates') #/END hg38 ### -
sirselim created this gist
Aug 20, 2019 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1 @@ version="4.0a"