Calling variants on cohorts of samples
| #!/bin/bash | |
| #$ -V | |
| #$ -S /bin/bash | |
| #Embarrassingly Parallel Computations Using the Sun Grid Engine | |
| #Array Job | |
| #qsub -pe make 3 -cwd -j y -N hong -t 1-3 test.sh | |
| #MANIFEST=sample1.txt | |
| #SAMPLE=`awk "NR==${SGE_TASK_ID}" ${MANIFEST}` | |
| #echo $SAMPLE | |
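| #Worked illustration (hypothetical manifest contents, not part of this pipeline): if | |
| #sample1.txt contains the three lines S001, S002 and S003, then array task 2 runs with | |
| #SGE_TASK_ID=2 and the awk lookup above returns S002: | |
| #  SAMPLE=`awk "NR==2" sample1.txt`   # -> S002 | |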
| pipe_name="Calling variants on cohorts of samples using GATK & PLINK " | |
| pipe_version="20151006" | |
| project="hongiiv" | |
| debug="true" | |
| set -x # activate debugging | |
| set -o pipefail # trace ERR through pipes | |
| set -o errtrace # trace ERR through 'time command' and other functions | |
| set -o nounset ## set -u : exit the script if you try to use an uninitialised variable | |
| set -o errexit ## set -e : exit the script if any statement returns a non-true return value | |
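| #A minimal sketch of how these options interact (comments only, nothing here is executed): | |
| #with pipefail + errexit, a failing command anywhere in a pipeline aborts the script, e.g. | |
| #  false | cat          # pipeline exits non-zero, so errexit stops the script | |
| #with nounset, expanding an unset variable such as ${NOT_SET} is likewise fatal. | |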
| #################################################################################### | |
| #error check | |
| #################################################################################### | |
| function error_exit { | |
| echo | |
| echo "$@" | |
| exit 1 | |
| } | |
| #trap the killer signals so that we can exit with a good message. | |
| trap "error_exit 'Received signal SIGHUP'" SIGHUP | |
| trap "error_exit 'Received signal SIGINT'" SIGINT | |
| trap "error_exit 'Received signal SIGTERM'" SIGTERM | |
| trap "error_exit 'Received signal SIGPIPE'" SIGPIPE | |
| trap "error_exit 'Received signal SIGKILL'" SIGKILL | |
| #################################################################################### | |
| #System environment | |
| #################################################################################### | |
| TMPDIR=/GLUSTER/KEC/KEC_VCF/hongiiv_test/temp; export TMPDIR | |
| ulimit -n 2048 | |
| #################################################################################### | |
| #software | |
| #################################################################################### | |
| gatk="/BIO/app/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar" | |
| java="/usr/bin/java" #version 1.7.0_25 | |
| vcftools="/BIO/app/vcftools_0.1.10/bin/vcftools" | |
| plink="/GLUSTER/KEC/KEC_VCF/hongiiv_test/plink" #v1.07 (10/Aug/2009) | |
| samtools="/NAS/tools/samtools-0.1.19/samtools" | |
| bcftools="/NAS/tools/samtools-0.1.19/bcftools/bcftools" | |
| Yfitter="/NAS/tools/Yfitter/Yfitter" #v0.2 | |
| Yfitter_dir="/NAS/tools/Yfitter" | |
| python="/usr/bin/python" #2.7.3 | |
| conifer="/NAS/tools/conifer_v0.2.2/conifer.py" | |
| eigenstrat="/NAS/tools/EIG/bin" #v6.0.1 (12/12/14) | |
| #################################################################################### | |
| #gatk bundle (references) | |
| #################################################################################### | |
| bait="/NAS/tools/korean_DB_bait_intersect.with_chr_final.bed" | |
| ref="/NAS/tools/hg19_macrogen_short.fa" #contig chrM,chr1,chr2 | |
| hapmap="/DB/reference_data/gatk_bundle/2.8/hg19/hapmap_3.3.hg19.vcf" | |
| omni2="/DB/reference_data/gatk_bundle/2.8/b37/1000G_omni2.5.b37.vcf" | |
| omni="/DB/reference_data/gatk_bundle/2.8/hg19/1000G_omni2.5.hg19.vcf" | |
| hc="/DB/reference_data/gatk_bundle/2.8/hg19/1000G_phase1.snps.high_confidence.hg19.vcf" | |
| dbsnp="/DB/reference_data/gatk_bundle/2.8/hg19/dbsnp_138.hg19.vcf" | |
| dbsnp144="/BIO/reference/00-All.vcf" #contig 1,2,3 | |
| mills="/DB/reference_data/gatk_bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.vcf" | |
| #################################################################################### | |
| #functions | |
| #################################################################################### | |
| time_start=$(date +%s) | |
| time_now=${time_start} | |
| time_end=${time_start} | |
| LOG_FILE="logs/${project}-${time_start}.log" | |
| LOG() { | |
| time_now=$(date +%s) | |
| #echo "[${project}][$(date "+%F %T")][time: $((${time_now} - ${time_end})) sec] $*" >> ${LOG_FILE} | |
| echo "[$(date "+%F %T")] $*" >> ${LOG_FILE} | |
| time_end=${time_now} | |
| } | |
| DEBUG() { | |
| if [ "$debug" == "true" ]; then | |
| echo "[DEBUG] [$(date "+%F %T")] $*" >> ${LOG_FILE} | |
| fi | |
| } | |
| if [ -d "logs" ]; then | |
| LOG "logs directory exists" | |
| else | |
| mkdir logs | |
| fi | |
| #Compare the read count implied by a gzipped R1 FASTQ ($1) with the record count of a BAM ($2) | |
| check_line_size_bam() { | |
| fastqR1linesize=`zcat $1 | wc -l | cut -d' ' -f1` | |
| #lines/2 = (lines/4 reads in R1) x 2 mates = expected paired-end alignment records | |
| fastqhalfsize=`expr ${fastqR1linesize} / 2` | |
| bamlinesize=`${samtools} view $2 | wc -l | cut -d' ' -f1` | |
| LOG "$1 half line size: ${fastqhalfsize}" | |
| LOG "$2 line size: ${bamlinesize}" | |
| if [ ${fastqhalfsize} -ne ${bamlinesize} ]; then | |
| LOG "check_line_size_bam failed!" | |
| fi | |
| } | |
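| #Worked example for check_line_size_bam with made-up numbers: an R1 FASTQ of 4,000 lines | |
| #holds 1,000 reads (4 lines per read), so a paired-end BAM should hold ~2,000 primary | |
| #alignments; 4,000 / 2 = 2,000 is the value compared above. Hypothetical call: | |
| #  check_line_size_bam sampleA_R1.fastq.gz sampleA.bam | |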
| #################################################################################### | |
| #sample list | |
| #################################################################################### | |
| if [ -f "sample.list" ]; then | |
| LOG "Delete previous sample.list" | |
| rm -rf sample.list | |
| else | |
| touch sample.list | |
| fi | |
| sample_count="0" | |
| DEBUG "==========================Sample List==========================" | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| sample_count=$((${sample_count}+1)) | |
| DEBUG $sample; | |
| echo $sample >> sample.list | |
| done | |
| DEBUG "===============================================================" | |
| LOG "Total samples: ${sample_count}" | |
| #################################################################################### | |
| #Calculate RPKM (Reads Per Kilobase per Million mapped reads) | |
| #RPKM = numReads / (geneLength/1,000 * totalNumReads/1,000,000) | |
| #numReads: number of reads mapped to a gene sequence | |
| #geneLength: length of the gene sequence | |
| #totalNumReads: total number of mapped reads of a sample | |
| #################################################################################### | |
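| #Worked example with made-up numbers: numReads=500, geneLength=2,000 bp and | |
| #totalNumReads=10,000,000 give RPKM = 500 / (2 * 10) = 25; the same arithmetic in awk: | |
| #  awk 'BEGIN { print 500 / ((2000/1000) * (10000000/1000000)) }'   # prints 25 | |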
| if [ -d "RPKM" ]; then | |
| LOG "RPKM directory exists" | |
| else | |
| mkdir RPKM | |
| fi | |
| # 1. Using job array | |
| manifest="sample.list" | |
| job=$(echo -e " | |
| #!/bin/bash | |
| # | |
| #$ -cwd | |
| #$ -j y | |
| #$ -S /bin/bash | |
| # | |
| sample=\`awk \"NR==\${SGE_TASK_ID}\" ${manifest}\` \n | |
| # expected location of output and MD5 verification files \n | |
| OUTFILE=RPKM/\${sample}_analysis.hdf5 \n | |
| MD5FILE=\${OUTFILE}.md5 \n | |
| # check if MD5 file already exists and matches output file \n | |
| if [[ -f \$OUTFILE && -f \$MD5FILE && \`md5sum \${OUTFILE} | cut -f1 -d\" \"\` = \`cat \${MD5FILE} | cut -f1 -d\" \"\` ]]; then | |
| # task was previously run successfully, skip this time \n | |
| echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tSKIP" >> $LOG_FILE \n | |
| exit 0 \n | |
| else \n | |
| # do the main work \n | |
| date | |
| hostname | |
| # check exit status \n | |
| STATUS=\$? \n | |
| if [[ \$STATUS -eq 0 ]]; then \n | |
| if [[ -f \$OUTFILE ]]; then \n | |
| echo \`md5sum \$OUTFILE > \$MD5FILE\` \n | |
| else | |
| echo "check output file fail" >> $LOG_FILE \n | |
| echo -e [\`date\`] "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tFAIL\\t\${STATUS}" >> $LOG_FILE \n | |
| exit 0 \n | |
| fi \n | |
| echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tOK" >> $LOG_FILE \n | |
| else \n | |
| echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tFAIL\\t\${STATUS}" >> $LOG_FILE \n | |
| fi \n | |
| exit \$STATUS \n | |
| fi" | qsub -pe make 8 -cwd -j y -N STEP1_RPKM -t 1-${sample_count} | |
| ) | |
| DEBUG "${job}" | |
| # 2. Using a for loop | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e " | |
| ${python} ${conifer} rpkm --probes ${bait} --input ${sample}.sorted.dedup.realigned.recalibrated.sorted.bam --output RPKM/${sample}.rpkm.txt; | |
| ${python} ${conifer} analyze --probes ${bait} --rpkm_dir ./RPKM/ --output RPKM/${sample}_analysis.hdf5 --svd 6 --write_svals RPKM/${sample}_singular_values.txt --plot_scree RPKM/${sample}_screeplot.png --write_sd RPKM/${sample}_sd_values.txt; | |
| ${python} ${conifer} call --input RPKM/${sample}_analysis.hdf5 --output RPKM/${sample}_calls.txt; | |
| ${python} ${conifer} plotcalls --input RPKM/${sample}_analysis.hdf5 --calls RPKM/${sample}_calls.txt --outputdir ./RPKM/;" | | |
| qsub -pe make 8 -cwd -j y -N rpkm_${sample} | |
| ) | |
| DEBUG $job | |
| done | |
| #################################################################################### | |
| #Mitochondrial haplogroup assignment | |
| #Online submission of the query to HaploGrep at http://haplogrep.uibk.ac.at | |
| #Heterozygous calls are assigned 'N' using the UNIX command line | |
| #################################################################################### | |
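| #The exact masking command is not shown in this gist; one hedged sketch is to list the | |
| #heterozygous chrM genotypes (GT 0/1, column 10 of the single-sample VCF produced below, | |
| #SAMPLE being a placeholder) and set those positions to N before uploading to HaploGrep: | |
| #  awk '!/^#/ && $10 ~ /^0\/1/ {print $1"\t"$2}' chrM/SAMPLE_chrM.vcf > chrM/SAMPLE_het_positions.txt | |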
| mkdir chrM | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e " | |
| ${samtools} view -h -b -o chrM/${sample}.chrM.bam ${sample}.bam chrM; | |
| ${samtools} mpileup -E -q 30 -Q 30 -C 50 -r chrM:1-16569 -uf ${ref} chrM/${sample}.chrM.bam | ${bcftools} view -cg - > chrM/${sample}_chrM.vcf; | |
| ${java} -jar ${gatk} -T FastaAlternateReferenceMaker -R ${ref} -o chrM/${sample}_chrM.fasta --variant chrM/${sample}_chrM.vcf; | |
| ${python} fas2hsd.py chrM/${sample}_chrM.fasta | |
| " | qsub -pe make 8 -cwd -j y -N mitho_${sample}) | |
| done | |
| #################################################################################### | |
| #Chromosome Y haplogroup assignment | |
| #To assign chromosome Y haplogroups for the male Korean samples, | |
| #samtools and bcftools are used to generate the input file for the haplogroup assignment software YFitter v0.2 | |
| #################################################################################### | |
| mkdir chrY | |
| samples="" | |
| jids="" | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| #collect the chrY-only BAMs produced below for the joint mpileup | |
| samples="chrY/${sample}.chrY.bam ${samples}" | |
| job=$(echo -e "${samtools} view -h -b -o chrY/${sample}.chrY.bam ${sample}.bam chrY" | qsub -pe make 8 -cwd -j y -N chrY_${sample}) | |
| jids="${jids:+${jids},}chrY_${sample}" | |
| done | |
| job=$(echo -e " | |
| ${samtools} mpileup -uf /NAS/tools/chrY.fa -C 50 -Q 20 -q 20 ${samples} > chrY/allYs.bcf; | |
| ${bcftools} view -Q chrY/allYs.bcf ${Yfitter_dir}/karafet_sites_b37.pos > chrY/infosites.qcall; | |
| ${Yfitter} -m ${Yfitter_dir}/karafet_tree_b37.xml chrY/infosites.qcall > chrY/allYs_haplogroup.txt; | |
| "| qsub -pe make 8 -cwd -j y -N yfitter -hold_jid ${jids}) | |
| #counting bam files | |
| count=$(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'|wc -l) | |
| if [ $count -gt 200 ] | |
| then | |
| echo "over 200 samples. try to by manually" | |
| else | |
| echo "false" | |
| fi | |
| #################################################################################### | |
| #Depth of Coverage | |
| #################################################################################### | |
| mkdir doc | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e "${java} -Xmx4g -jar ${gatk} -T DepthOfCoverage -R ${ref} -I ${sample}.recal.bam -o doc/${sample}.cov.out -L ${bait} -omitBaseOutput -omitLocusTable -omitIntervals --minMappingQuality 20 --minBaseQuality 20 --logging_level ERROR --summaryCoverageThreshold 20 --summaryCoverageThreshold 40 --summaryCoverageThreshold 60 --summaryCoverageThreshold 80 --summaryCoverageThreshold 100" | qsub -pe make 4 -cwd -j y -N doc_${sample}) | |
| done | |
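| #DepthOfCoverage typically writes a per-sample summary alongside the -o prefix (e.g. | |
| #doc/SAMPLE.cov.out.sample_summary) whose columns include the mean coverage and the | |
| #%_bases_above_<threshold> values requested above; a hedged sketch for extracting the mean: | |
| #  awk -F'\t' 'NR>1 && $1!="Total" {print $1, $3}' doc/SAMPLE.cov.out.sample_summary | |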
| #################################################################################### | |
| #genotype GVCF | |
| #################################################################################### | |
| mkdir gvcf | |
| jids="" | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e "${java} -Xmx4g -jar ${gatk} -T HaplotypeCaller -R ${ref} -L ${bait} -I ${sample}.bam --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000 -stand_call_conf 30.0 -stand_emit_conf 10.0 -o gvcf/${sample}.hc.g.vcf" | qsub -pe make 8 -cwd -j y -N gvcf_${sample}) | |
| jids="gvcf_${sample}, ${jids}" | |
| done | |
| #################################################################################### | |
| #The gVCFs were combined in 298 groups (~sqrt(n) gVCFs in each group) and | |
| #joint genotyping of SNPs and indels was then performed on all groups. | |
| #################################################################################### | |
| mkdir final | |
| vcfs="" | |
| for vcf in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| vcfs="--variant gvcf/${vcf}.hc.g.vcf ${vcfs}" | |
| done | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T CombineGVCFs -R ${ref} -o final/group.g.vcf ${vcfs}" | qsub -pe make 8 -cwd -j y -N combinegvcf -hold_jid ${jids} | |
| ") | |
| #################################################################################### | |
| #Genotyping each chromosome | |
| #################################################################################### | |
| for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY | |
| do | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -nt 11 -T GenotypeGVCFs -R ${ref} --variant final/group.g.vcf -L ${chr} -o final/grouped_${chr}.vcf --annotation StrandOddsRatio -A QualByDepth -A FisherStrand -A MappingQualityRankSumTest -A MappingQualityZero -A ReadPosRankSumTest -A InbreedingCoeff -A DepthPerAlleleBySample -A HaplotypeScore" | qsub -pe make 8 -cwd -j y -N genotypegvcf -hold_jid combinegvcf) | |
| done | |
| #################################################################################### | |
| #Merge genotype calls | |
| #################################################################################### | |
| vcfs="" | |
| for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY | |
| do | |
| vcfs="-V final/grouped_${chr}.vcf ${vcfs}" | |
| done | |
| job=$(echo -e " | |
| ${java} -Xmx15g -cp ${gatk} org.broadinstitute.gatk.tools.CatVariants -R ${ref} ${vcfs} -out final/merged_chr.vcf -assumeSorted" | qsub -pe make 8 -cwd -j y -N catvariant -hold_jid genotypegvcf | |
| ) | |
| #################################################################################### | |
| #Select variants in the bait region | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T SelectVariants -R ${ref} -nt 11 -V final/merged_chr.vcf -L ${bait} -o final/merged_chr_bait.vcf"| qsub -pe make 8 -cwd -j y -N bait -hold_jid catvariant | |
| ) | |
| #################################################################################### | |
| #VQSR for SNP | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T VariantRecalibrator -R ${ref} -nt 11 -input final/merged_chr_bait.vcf -resource:hapmap,known=false,training=true,truth=true,prior=15.0 ${hapmap} -resource:omni,known=false,training=true,truth=true,prior=12.0 ${omni} -resource:1000G,known=false,training=true,truth=false,prior=10.0 ${hc} -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 ${dbsnp} -an DP -an SOR -an QD -an FS -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff -mode SNP -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 -recalFile final/merged_chr_bait.snp.recal -tranchesFile final/merged_chr_bait.snp.tranches -rscriptFile final/merged_chr_bait.snp.plots.R" | qsub -pe make 8 -cwd -j y -N vqsr_snp -hold_jid bait | |
| ) | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T ApplyRecalibration -R ${ref} -nt 11 -input final/merged_chr_bait.vcf -mode SNP --ts_filter_level 99.0 -recalFile final/merged_chr_bait.snp.recal -tranchesFile final/merged_chr_bait.snp.tranches -o final/merged_chr_bait.snp.raw_indel.vcf" | qsub -pe make 8 -cwd -j y -N applyrecalsnp -hold_jid vqsr_snp | |
| ) | |
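| #To sanity-check the SNP recalibration, the FILTER column of the output VCF can be tallied; | |
| #PASS sites met the 99.0 tranche cutoff and the rest carry tranche labels (illustrative): | |
| #  grep -v '^#' final/merged_chr_bait.snp.raw_indel.vcf | cut -f7 | sort | uniq -c | |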
| #################################################################################### | |
| #VQSR for INDEL | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T VariantRecalibrator -R ${ref} -nt 11 -input final/merged_chr_bait.snp.raw_indel.vcf -resource:mills,known=true,training=true,truth=true,prior=12.0 ${mills} -an DP -an FS -an SOR -an QD -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff -mode INDEL -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 --maxGaussians 4 -recalFile final/merged_chr_bait.indel.recal -tranchesFile final/merged_chr_bait.indel.tranches -rscriptFile final/merged_chr_bait.indel.plots.R" | qsub -pe make 8 -cwd -j y -N vqsr_indel -hold_jid applyrecalsnp | |
| ) | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T ApplyRecalibration -R ${ref} -nt 11 -input final/merged_chr_bait.snp.raw_indel.vcf -mode INDEL --ts_filter_level 99.0 -recalFile final/merged_chr_bait.indel.recal -tranchesFile final/merged_chr_bait.indel.tranches -o final/merged_chr_bait.recal.vcf" | qsub -pe make 8 -cwd -j y -N apply_indel -hold_jid vqsr_indel | |
| ) | |
| #################################################################################### | |
| #Select variants, excluding non-variant loci and filtered loci (remaining alleles are trimmed by default) | |
| #-env, --excludeNonVariants, -ef, --excludeFiltered | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T SelectVariants -R $ref -nt 11 -V final/merged_chr_bait.recal.vcf -o final/merged_chr_bait.recal.filtered_only.vcf -env -ef" | qsub -pe make 8 -cwd -j y -N filter -hold_jid apply_indel | |
| ) | |
| #################################################################################### | |
| #Extract autosomal SNPs | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T SelectVariants -R $ref -V final/merged_chr_bait.recal.filtered_only.vcf -selectType SNP -L chr1 -L chr2 -L chr3 -L chr4 -L chr5 -L chr6 -L chr7 -L chr8 -L chr9 -L chr10 -L chr11 -L chr12 -L chr13 -L chr14 -L chr15 -L chr16 -L chr17 -L chr18 -L chr19 -L chr20 -L chr21 -L chr22 -o final/final_auto_recal_bait.vcf" | qsub -pe make 8 -cwd -j y -N autosomal -hold_jid filter | |
| ) | |
| #################################################################################### | |
| #Convert VCF to PLINK format | |
| #################################################################################### | |
| mkdir plink | |
| job=$(echo -e " | |
| ${vcftools} --vcf final/final_auto_recal_bait.vcf --plink --out plink/final.snp.auto" | qsub -pe make 8 -cwd -j y -N plink -hold_jid autosomal | |
| ) | |
| #################################################################################### | |
| #MDS | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${plink} --file plink/final.snp.auto --maf 0.05 --geno 0.01 --hwe 0.001 --indep-pairwise 50 5 0.2 --out plink/all_prun --noweb; | |
| ${plink} --file plink/final.snp.auto --extract plink/all_prun.prune.in --make-bed --out plink/all_part_prun --noweb; | |
| ${plink} --bfile plink/all_part_prun --genome --out plink/all_part_prun_genome --noweb; | |
| ${plink} --bfile plink/all_part_prun --read-genome plink/all_part_prun_genome.genome --mds-plot 4 --out plink/all_part_prun_mds --noweb" | qsub -pe make 8 -cwd -j y -N mds -hold_jid plink | |
| ) | |
| #################################################################################### | |
| #IBS & MDS plot | |
| #################################################################################### | |
| job=$(echo " | |
| cat plink/all_part_prun_genome.genome | awk '{if(\$0 !~/DST/) printf \"%.3f\n\", \$12}' | sort | uniq -c > plink/all_part_prun_dst.txt" | qsub -pe make 4 -cwd -j y -N awk_job -hold_jid mds | |
| ) | |
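| #For reference, PLINK .genome columns are FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE | |
| #DST PPC RATIO, so the $12 used above and below is DST (IBS distance); the header can be | |
| #checked with: | |
| #  head -1 plink/all_part_prun_genome.genome | |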
| job=$(echo -e " | |
| cd plink \n | |
| R --slave --vanilla --quiet --no-save <<EOF \n | |
| op<-par(family='Gentium') \n | |
| png('ibs_dst.png') \n | |
| DST<-read.table('all_part_prun_dst.txt') \n | |
| plot(DST[,2], DST[,1], xlim=c(0.6,1),main='IBS DST') \n | |
| dev.off() \n | |
| EOF\n | |
| #find cryptic relatedness \n | |
| awk '{if(\$12>0.95) print}' all_part_prun_genome.genome \n | |
| #MDS R plot \n | |
| R --slave --vanilla --quiet --no-save <<EOF \n | |
| # ---edit so path to R-graphics/.RData is correct--- \n | |
| mds<-read.table('all_part_prun_mds.mds',header=T) \n | |
| png('mds_plot.png') \n | |
| plot(mds\$C1,mds\$C2, main='MDS plot - Korean',col='gray',pch=4) \n | |
| dev.off() \n | |
| EOF\n | |
| #MDS & PCA plot with 1000genomes \n | |
| #1000genomes \n | |
| CEU='1000G_20101123_v3_GIANT_chr1_23_minimacnames_CEU_MAF0.01' \n | |
| CHB='1000G_20101123_v3_GIANT_chr1_23_minimacnames_CHB_JPT_MAF0.01' \n | |
| YRI='1000G_20101123_v3_GIANT_chr1_23_minimacnames_YRI_MAF0.01' \n | |
| awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$CEU.bim | grep -v ambig > ceu.snplist.txt \n | |
| awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$CHB.bim | grep -v ambig > chb.snplist.txt \n | |
| awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$YRI.bim | grep -v ambig > yri.snplist.txt \n | |
| ${plink} --bfile \$CEU --extract ceu.snplist.txt --make-bed --out CEU --noweb \n | |
| ${plink} --bfile \$CHB --extract chb.snplist.txt --make-bed --out CHB --noweb \n | |
| ${plink} --bfile \$YRI --extract yri.snplist.txt --make-bed --out YRI --noweb \n | |
| ${plink} --file final.snp.auto --make-bed --out KOR --noweb \n | |
| ${plink} --bfile CEU --bmerge CHB.bed CHB.bim CHB.fam --make-bed --out CEU_CHB --noweb \n | |
| ${plink} --bfile CEU_CHB --bmerge YRI.bed YRI.bim YRI.fam --make-bed --out CEU_CHB_YRI --noweb \n | |
| ${plink} --bfile KOR --bmerge CEU_CHB_YRI.bed CEU_CHB_YRI.bim CEU_CHB_YRI.fam --out asian --noweb \n | |
| ${plink} --bfile KOR --exclude asian.missnp --make-bed --out KOR_flip --noweb \n | |
| ${plink} --bfile KOR_flip --bmerge CEU_CHB_YRI.bed CEU_CHB_YRI.bim CEU_CHB_YRI.fam --out asian_flip --noweb \n | |
| ${plink} --bfile asian_flip --maf 0.05 --geno 0.01 --hwe 0.001 --indep-pairwise 50 5 0.2 --out asian_prun --noweb \n | |
| ${plink} --bfile asian_flip --extract asian_prun.prune.in --make-bed --out asian_part_prun --noweb \n | |
| ${plink} --bfile asian_part_prun --genome --out asian_part_prun_genome --noweb \n | |
| ${plink} --bfile asian_part_prun --read-genome asian_part_prun_genome.genome --mds-plot 4 --out asian_part_prun_mds --noweb \n | |
| R --slave --vanilla --quiet --no-save <<EOF \n | |
| mds<-read.table('asian_part_prun_mds.mds',header=T) \n | |
| png('asian_mds_plot.png') \n | |
| plot(mds\$C1,mds\$C2,main='MDS plot',col='gray',pch=1) \n | |
| legend('topright',legend=c('KOR','CEU','CHB','JPT','YRI'),pch=21,col=c('blue','red','yellow','maroon','green')) \n | |
| for (i in 1:1339){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='blue') \n | |
| } \n | |
| for (i in 1340:1422){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='red') \n | |
| } \n | |
| for (i in 1423:1520){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='yellow') \n | |
| } \n | |
| for (i in 1521:1608){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='maroon') \n | |
| } \n | |
| for (i in 1609:1695){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='green') \n | |
| } \n | |
| dev.off() \n | |
| EOF" | qsub -pe make 4 -cwd -j y -N mds_plot --hold_jid awk_job | |
| ) | |
| #################################################################################### | |
| #PCA | |
| #EIGENSOFT PCA | |
| #################################################################################### | |
| par='genotypename: asian_part_prun.bed | |
| snpname: asian_part_prun.bim | |
| indivname: asian_part_prun.fam | |
| outputformat: EIGENSTRAT | |
| genotypeoutname: asian_part_prun.geno | |
| snpoutname: asian_part_prun.snp | |
| indivoutname: asian_part_prun.ind | |
| familynames: NO' | |
| echo "$par" > plink/gwas.par | |
| job=$(echo -e " | |
| sudo apt-get update -y \n | |
| sudo apt-get install libgsl0ldbl -y \n | |
| PATH=$eigenstrat:$PATH; export PATH \n | |
| cd plink \n | |
| ${eigenstrat}/convertf -p gwas.par \n | |
| ${eigenstrat}/smartpca.perl -i asian_part_prun.geno -a asian_part_prun.snp -b asian_part_prun.ind -o asian_part_prun.pca -p asian_part_prun.plot -e asian_part_prun.eval -l asian_part_prun.logs \n | |
| R --slave --quiet --no-save <<EOF \n | |
| op<-par(family='Gentium') \n | |
| pca<-read.table('asian_part_prun.pca.evec',header=F) \n | |
| png('asian_pca_plot.png') \n | |
| plot(pca\$V2,pca\$V3,main='PCA plot',col='gray',pch=1) \n | |
| legend('topright',legend=c('KOR','CEU','CHB','JPT','YRI'),pch=21,col=c('blue','red','yellow','maroon','green')) \n | |
| for (i in 1:1320){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='blue') \n | |
| } \n | |
| for (i in 1321:1402){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='red') \n | |
| } \n | |
| for (i in 1403:1500){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='yellow') \n | |
| } \n | |
| for (i in 1501:1588){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='maroon') \n | |
| } \n | |
| for (i in 1589:1675){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='green') \n | |
| } \n | |
| dev.off() \n | |
| EOF" | qsub -pe make 4 -cwd -j y -N pca_plot | |
| ) | |
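| #For reference, the smartpca .evec output begins with an #eigvals: line (skipped by | |
| #read.table as a comment) followed by one row per sample: sample ID, one column per | |
| #principal component, then a status label; hence V2 and V3 above are PC1 and PC2, e.g. | |
| #  head -2 plink/asian_part_prun.pca.evec | |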
| LOG "End" |