-
Star
(172)
You must be signed in to star a gist -
Fork
(62)
You must be signed in to fork a gist
-
-
Save elowy01/93922762e131d7abd3c7e8e166a74a0b to your computer and use it in GitHub Desktop.
| *bcftools filter | |
| *Filter variants per region (in this example, print out only variants mapped to chr1 and chr2) | |
| bcftools filter -r1,2 ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.hg38.vcf.gz | |
| *printing out info for only 2 samples: | |
| bcftools view -s NA20818,NA20819 filename.vcf.gz | |
| *printing only the variants passing the filter (FILTER=PASS): | |
| bcftools view -f PASS filename.vcf.gz | |
| *printing variants without header: | |
| bcftools view -H | |
| *printing variants on a particular region: | |
| bcftools view -r chr20:1-200000 -s NA20818,NA20819 filename.vcf.gz | |
| *print all variants except for the ones falling within region: | |
| bcftools view -t ^chr20:1-30000000 ex_bams.samtools.20161231.vcf.gz >out.vcf | |
| *view the positions passed in a file (accepted files are .vcf and .bed): | |
| bcftools view -R 0002.vcf in.vcf.gz | |
| *view the positions passed in a tsv file: | |
| bcftools view -R 0002.tsv in.vcf.gz | |
| # The format of 0002.tsv: | |
| 20 79000 80000 | |
| 20 90000 100000 | |
| *now, print out the chr\tpos | |
| bcftools query -f '%CHROM\t%POS\n' filename.vcf | |
| / | |
| *now, print out the AF INFO field | |
| bcftools query -f '%INFO/AF\n' | |
| / | |
| #getting a particular annotation from the VCF | |
| bcftools query -f '%QUAL\n' 0002.vcf | |
| / | |
| #printing chr pos and a particular annotation from a VCF: | |
| bcftools query -f '%CHROM\t%POS\t%INFO/DP\n' in.vcf.gz | |
| / | |
| #printing out the sets assigned by GATK CombineVariants | |
| ~/bin/bcftools-1.6/bcftools query -f '%set\n' out_combine.vcf.gz |sort |uniq | |
| / | |
| #printing a list of samples from a VCF: | |
| bcftools query -l test.vcf | |
| *selecting snps from file: | |
| ~/bin/bcftools/bcftools view -v snps lc_bams.bcftools.20170319.NA12878.vcf.gz | |
| / | |
| *selecting the variants from a VCF (excluding 0|0 genotypes) | |
| bcftools view -c1 input.vcf | |
| / | |
| *selecting the non-variants from a VCF (AC=0) | |
| bcftools view -H -C0 concat.allchrs.sites.vcf.gz | |
| / | |
| #filtering: | |
| / | |
| #using one of the INFO annotations (IDV) | |
| bcftools filter -sFilterName -e'IDV<5' input.vcf | |
| / | |
| #OR logical operator: | |
| bcftools filter -sFilterName -e'DP>50000 | IDV<9' input.vcf | |
| / | |
| #filtering on FORMAT annotation: | |
| bcftools filter -sFilterName -e'FORMAT/DP<5' input.vcf | |
| / | |
| #filtering on INFO annotation: | |
| bcftools filter -sFilterName -e'INFO/DP<5' input.vcf | |
| / | |
| #printing out variants that pass the filter: | |
| ~/bin/bcftools/bcftools view -f.,PASS lc_bams.bcftools.20170411.exc.norm.SNP.filtered.vcf.gz | |
| / | |
| #bcftools stats and filtering: | |
| ~/bin/bcftools/bcftools stats -f "PASS,." file.vcf | |
| / | |
| #select only biallelic (excluding multiallelic) snps | |
| bcftools view -m2 -M2 -v snps input.vcf.gz | |
| / | |
| #select only the multiallelic snps | |
| bcftools view -m3 -v snps input.vcf.gz | |
| / | |
| #printing the set info in the INFO field: | |
| bcftools view -i 'set="freebayes_lcex"' combined.all.chr20.vcf.gz | |
| / | |
| #printing all entries having a quality <10 | |
| bcftools view -i 'QUAL<10' in.vcf.gz | |
| / | |
| #removing FORMAT column from the VCF | |
| #it will remove all FORMAT annotations except the GT information | |
| bcftools annotate -x FORMAT ifile.vcf.gz | |
| / | |
| #removing INFO field from VCF | |
| bcftools annotate --remove INFO in.vcf.gz | |
| / | |
| #annotating a vcf file using the annotations from a different VCF (in this case we only annotate the INFO/DP) | |
| bcftools annotate -c 'INFO/DP' -a annt.vcf.gz in.vcf.gz | |
| / | |
| #annotating a vcf file with a tabular file: | |
| see page https://github.com/samtools/bcftools/wiki/HOWTOs#annotate-from-bed | |
| / | |
| #drop individual genotype information | |
| bcftools view -G input.vcf.gz | |
| / | |
| #getting stats on the number of REF/ALT swaps and other things: | |
| bcftools +fixref file.bcf -- -f ref.fa | |
| / | |
| #correcting the REF/ALT swaps: | |
| bcftools norm --check-ref ws -f ref.fa in.vcf.gz -o out.vcf.gz -Oz | |
| // | |
| #changing the sample names in a VCF: | |
| #the samplenames.txt file has the following format: | |
| #oldsamplename newsamplename | |
| bcftools reheader -s samplenames.txt NA12878.giab.SNP.chr20.non_valid.vcf.gz -o NA12878.giab.SNP.chr20.non_valid.reheaded.vcf.gz | |
| // | |
| #changing the header: | |
| bcftools reheader -h newheader.txt ../combined.all.chr20.vcf.gz.snps.biallelic.vcf_chr20.vcf.gz.ensembl.vcf.gz.phased.ligated.NA12878.ucsc.vcf.gz -o combined.all.chr20.vcf.gz.snps.biallelic.vcf_chr20.vcf.gz.ensembl.vcf.gz.phased.ligated.NA12878.ucsc.reheaded.vcf.gz | |
| // | |
| # in order to use the plugins: | |
| export BCFTOOLS_PLUGINS=~/bin/bcftools-1.6/plugins/ | |
| // | |
| # tag2tag: | |
| # convert PL to GL | |
| bcftools +tag2tag in.vcf -- -r --pl-to-gl | |
| // | |
| # normalizing the multiallelic variants: | |
| # with -any I will split the multiallelic variants (SNPs+INDELs) into several records | |
| bcftools norm -m -any in.vcf.gz -o out.norm.vcf.gz -Oz | |
| For example: | |
| chr20 60280 . TTTCCA TTTCCATTCCA,T 744 PASS . | |
| Will be converted to: | |
| chr20 60280 . TTTCCA TTTCCATTCCA 744 PASS . | |
| chr20 60280 . TTTCCA T 744 PASS . | |
| // | |
| #selecting the missing (uncalled) genotypes: | |
| bcftools view -u in.vcf.gz -o missing_genotypes.vcf.gz -Oz | |
| // | |
| #select a particular genotype (0/1 or 1/1) from a vcf. In this case the sample is accessed by its index (8): | |
| bcftools view -H combined.snps_indels_chr1.filt.vcf.gz.onlyvariants.vcf.gz.ensembl.vcf.gz.85706.vcf.gz -i 'GT[8]="het"' | |
| // | |
| #select all lines having exactly AC=2 | |
| bcftools view -i'AC=2' in.vcf.gz | |
| // | |
| # if we have a tag in a VCF defined in the header like: | |
| ##INFO=<ID=GRCH37_38_REF_STRING_MATCH,Number=0,Type=Flag,Description="Indicates reference allele in origin GRCh37 vcf string-matches reference allele in dbsnp GRCh38 vcf"> | |
| / | |
| #we can check for the records having this tag by doing: | |
| bcftools view -H -i'GRCH37_38_REF_STRING_MATCH=1' ALL.chr7_GRCh38.genotypes.20170504.ensembl.vcf.NA12878.biallelic.nonvariants.nonvalid.snps.vcf.gz |less | |
| / | |
| #And the contrary by doing: | |
| bcftools view -H -i'GRCH37_38_REF_STRING_MATCH=0' ALL.chr7_GRCh38.genotypes.20170504.ensembl.vcf.NA12878.biallelic.nonvariants.nonvalid.snps.vcf.gz |less | |
| / | |
| # Filtering a VCF depending on a certain Allele frequency: | |
| bcftools view -i 'INFO/AF > 0.8' z.vcf.gz |
This is great - any chance bcftools isec options could also be included?
Thank you! Great compendium of commands!
Hi,
Hope you can help.
I am trying to filter by CADD>20 using this command using bcftools/1.11:
bcftools view -i "INFO/CADD_phred>20" -Oz -o out.vcf.gz in.vcf.gz
This is the error message:
Error: cannot use arithmetic operators to compare strings and numbers
I have tried to change the header for CADD_phred from string to float and invoked the above command, but it didn't work.
##INFO=<ID=CADD_phred,Number=.,Type=String,Description="CADD_phred annotation provided by ANNOVAR">
##INFO=<ID=CADD_phred,Number=.,Type=Float,Description="CADD_phred annotation provided by ANNOVAR">
I am also trying to filter variants by nonsynonymous_SNV and stopgain but I haven't been successful.
Any idea how to proceed?
Thanks in advance
This is great!
A nice thing to possibly add:
https://gist.github.com/adefelicibus/e668b2f03c157b3b272c871030c5d0b9
Split VCF by sample: