Calling variants on cohorts of samples
#!/bin/bash
#$ -V
#$ -S /bin/bash
#Embarrassingly Parallel Computations Using the Sun Grid Engine
#Array Job
#qsub -pe make 3 -cwd -j y -N hong -t 1-3 test.sh
#MANIFEST=sample1.txt
#SAMPLE=`awk "NR==${SGE_TASK_ID}" ${MANIFEST}`
#echo $SAMPLE
pipe_name="Calling variants on cohorts of samples using GATK & PLINK "
pipe_version="20151006"
project="hongiiv"
debug="true"
set -x # activate debugging
set -o pipefail # trace ERR through pipes
set -o errtrace # trace ERR through 'time command' and other functions
set -o nounset ## set -u : exit the script if you try to use an uninitialised variable
set -o errexit ## set -e : exit the script if any statement returns a non-true return value
####################################################################################
#error check
####################################################################################
function error_exit {
echo
echo "$@"
exit 1
}
#trap the killer signals so that we can exit with a good message.
trap "error_exit 'Received signal SIGHUP'" SIGHUP
trap "error_exit 'Received signal SIGINT'" SIGINT
trap "error_exit 'Received signal SIGTERM'" SIGTERM
trap "error_exit 'Received signal SIGPIPE'" SIGPIPE
trap "error_exit 'Received signal SIGKILL'" SIGKILL
####################################################################################
#System environment
####################################################################################
TMPDIR=/GLUSTER/KEC/KEC_VCF/hongiiv_test/temp; export TMPDIR
ulimit -n 2048
####################################################################################
#software
####################################################################################
gatk="/BIO/app/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar"
java="/usr/bin/java" #version 1.7.0_25
vcftools="/BIO/app/vcftools_0.1.10/bin/vcftools"
plink="/GLUSTER/KEC/KEC_VCF/hongiiv_test/plink" #v1.07 (10/Aug/2009)
samtools="/NAS/tools/samtools-0.1.19/samtools"
bcftools="/NAS/tools/samtools-0.1.19/bcftools/bcftools"
Yfitter="/NAS/tools/Yfitter/Yfitter" #v0.2
Yfitter_dir="/NAS/tools/Yfitter"
python="/usr/bin/python" #2.7.3
conifer="/NAS/tools/conifer_v0.2.2/conifer.py"
eigenstrat="/NAS/tools/EIG/bin" #v6.0.1 (12/12/14)
####################################################################################
#gatk bundle (references)
####################################################################################
bait="/NAS/tools/korean_DB_bait_intersect.with_chr_final.bed"
ref="/NAS/tools/hg19_macrogen_short.fa" #contig chrM,chr1,chr2
hapmap="/DB/reference_data/gatk_bundle/2.8/hg19/hapmap_3.3.hg19.vcf"
omni2="/DB/reference_data/gatk_bundle/2.8/b37/1000G_omni2.5.b37.vcf"
omni="/DB/reference_data/gatk_bundle/2.8/hg19/1000G_omni2.5.hg19.vcf"
hc="/DB/reference_data/gatk_bundle/2.8/hg19/1000G_phase1.snps.high_confidence.hg19.vcf"
dbsnp="/DB/reference_data/gatk_bundle/2.8/hg19/dbsnp_138.hg19.vcf"
dbsnp144="/BIO/reference/00-All.vcf" #contig 1,2,3
mills="/DB/reference_data/gatk_bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.vcf"
####################################################################################
#functions
####################################################################################
time_start=$(date +%s)
time_now=${time_start}
time_end=${time_start}
LOG_FILE="logs/${project}-${time_start}.log"
LOG() {
time_now=$(date +%s)
#echo "[${project}][$(date "+%F %T")][time: $((${time_now} - ${time_end})) sec] $*" >> ${LOG_FILE}
echo "[$(date "+%F %T")] $*" >> ${LOG_FILE}
time_end=${time_now}
}
DEBUG() {
if [ "$debug" == "true" ]; then
echo "[DEBUG] [$(date "+%F %T")] $*" >> ${LOG_FILE}
fi
}
if [ -d "logs" ]; then
LOG "logs directory exists"
else
mkdir logs
fi
check_error() {
#abort if the previous command returned a non-zero exit status
if [ "$1" -ne 0 ]; then
error_exit "command failed with exit status $1"
fi
}
check_line_size_bam() {
fastqR1linesize=`zcat $1 | wc -l | cut -d' ' -f1`
check_error $?
fastqhalfsize=`expr ${fastqR1linesize} / 2`
check_error $?
bamlinesize=`${samtools} view $2 | wc -l | cut -d' ' -f1`
check_error $?
LOG "$1 half line size: ${fastqhalfsize}"
LOG "$2 line size: ${bamlinesize}"
if [ ${fastqhalfsize} -ne ${bamlinesize} ]; then
LOG "check_line_size_bam failed!"
fi
}
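#Usage sketch: check_line_size_bam is defined above but never invoked by this
#pipeline; it could be called per sample as in the hypothetical example below
#(file names are placeholders, not files produced by this script).
#check_line_size_bam sample1_R1.fastq.gz sample1.bam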
####################################################################################
#sample list
####################################################################################
if [ -f "sample.list" ]; then
LOG "Delete previous sample.list"
rm -rf sample.list
else
touch sample.list
fi
sample_count="0"
DEBUG "==========================Sample List=========================="
for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}');
do
sample_count=$((${sample_count}+1))
DEBUG $sample;
echo $sample >> sample.list
done
DEBUG "==============================================================="
LOG "Total samples: ${sample_count}"
####################################################################################
#Calculate RPKM (Reads Per Kilobase per Million mapped reads)
#RPKM = numReads / (geneLength/1,000 * totalNumReads/1,000,000)
#numReads: number of reads mapped to a gene sequence
#geneLength: length of the gene sequence
#totalNumReads: total number of mapped reads of a sample
####################################################################################
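#A minimal sketch of the RPKM formula above, using hypothetical counts.
#CoNIFER computes RPKM internally; this helper is illustrative only and is
#never called by the pipeline.
rpkm_example() {
#numReads=500, geneLength=2000 bp, totalNumReads=20,000,000 (hypothetical values)
awk -v numReads=500 -v geneLength=2000 -v totalNumReads=20000000 \
'BEGIN { printf "RPKM = %.2f\n", numReads / ((geneLength/1000) * (totalNumReads/1000000)) }'
#prints: RPKM = 12.50
}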
if [ -d "RPKM" ]; then
LOG "RPKM directory exists"
else
mkdir RPKM
fi
# 1. Using job array
manifest="sample.list"
job=$(echo -e "
#!/bin/bash
#
#$ -cwd
#$ -j y
#$ -S /bin/bash
#
sample=\`awk \"NR==\${SGE_TASK_ID}\" ${manifest}\` \n
# expected location of output and MD5 verification files \n
OUTFILE=RPKM/\${sample}_analysis.hdf5 \n
MD5FILE=\${OUTFILE}.md5 \n
# check if MD5 file already exists and matches output file \n
if [[ -f \$OUTFILE && -f \$MD5FILE && \`md5sum \${OUTFILE} | cut -f1 -d\" \"\` = \`cat \${MD5FILE} | cut -f1 -d\" \"\` ]]; then
# task was previously run successfully, skip this time \n
echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tSKIP" >> $LOG_FILE \n
exit 0 \n
else \n
# do the main work \n
date
hostname
# check exit status \n
STATUS=\$? \n
if [[ \$STATUS -eq 0 ]]; then \n
if [[ -f \$OUTFILE ]]; then \n
echo \`md5sum \$OUTFILE > \$MD5FILE\` \n
else
echo "check output file fail" >> $LOG_FILE \n
echo -e [\`date\`] "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tFAIL\\t\${STATUS}" >> $LOG_FILE \n
exit 0 \n
fi \n
echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tOK" >> $LOG_FILE \n
else \n
echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tFAIL\\t\${STATUS}" >> $LOG_FILE \n
fi \n
exit \$STATUS \n
fi" | qsub -pe make 8 -cwd -j y -N STEP1_RPKM -t 1-${sample_count}
)
DEBUG "${job}"
# 2. Using a for statement
for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}');
do
job=$(echo -e "
${python} ${conifer} rpkm --probes ${bait} --input ${sample}.sorted.dedup.realigned.recalibrated.sorted.bam --output RPKM/${sample}.rpkm.txt;
${python} ${conifer} analyze --probes ${bait} --rpkm_dir ./RPKM/ --output RPKM/${sample}_analysis.hdf5 --svd 6 --write_svals RPKM/${sample}_singular_values.txt --plot_scree RPKM/${sample}_screeplot.png --write_sd RPKM/${sample}_sd_values.txt;
${python} ${conifer} call --input RPKM/${sample}_analysis.hdf5 --output RPKM/${sample}_calls.txt;
${python} ${conifer} plotcalls --input RPKM/${sample}_analysis.hdf5 --calls RPKM/${sample}_calls.txt --outputdir ./RPKM/;" |
qsub -pe make 8 -cwd -j y -N rpkm_${sample}
)
DEBUG $job
done
####################################################################################
#Mitochondrial haplogroup assignment
#Online submission of query to HaploGrep at http://haplogrep.uibk.ac.at
#assignment of 'N' to heterozygous calls using the UNIX command line (a sketch follows the loop below)
####################################################################################
mkdir chrM
for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}');
do
job=$(echo -e "
${samtools} view -h -b -o chrM/${sample}.chrM.bam ${sample}.bam chrM;
${samtools} mpileup -E -q 30 -Q 30 -C 50 -r chrM:1-16569 -uf ${ref} chrM/${sample}.chrM.bam | ${bcftools} view -cg - > chrM/${sample}_chrM.vcf;
${java} -jar ${gatk} -T FastaAlternateReferenceMaker -R ${ref} -o chrM/${sample}_chrM.fasta -variant chrM/${sample}_chrM.vcf;
${python} fas2hsd.py chrM/${sample}_chrM.fasta
" | qsub -pe make 8 -cwd -j y -N mitho_${sample})
done
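#The "assign 'N' to heterozygous calls" step mentioned above is not shown in
#the loop; a hedged illustration (commented out), assuming the consensus FASTA
#encodes heterozygous sites as IUPAC ambiguity codes, is to hard-mask them
#with sed before running fas2hsd.py (file name below is hypothetical):
#sed '/^>/!s/[RYSWKMryswkm]/N/g' chrM/sample1_chrM.fasta > chrM/sample1_chrM.N.fasta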
####################################################################################
#Chromosome Y haplogroup assignments
#To assign haplogroups for chromosome Y of male Korean samples,
#samtools and bcftools are used to generate the input file for the haplogroup assignment software YFitter v0.2
####################################################################################
mkdir chrY
samples=""
jids=""
for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}');
do
samples="${sample}.bam ${samples} "
job=$(echo -e "${samtools} view -h -b -o chrY/${sample}.chrY.bam ${sample}.bam chrY" | qsub -pe make 8 -cwd -j y -N chrY_${sample})
jids="chrY_${sample}, ${jids}"
done
job=$(echo -e "
${samtools} mpileup -uf /NAS/tools/chrY.fa -C 50 -Q 20 -q 20 ${samples} > chrY/allYs.bcf;
${bcftools} view -Q chrY/allYs.bcf ${Yfitter_dir}/karafet_sites_b37.pos > chrY/infosites.qcall;
${Yfitter} -m ${Yfitter_dir}/karafet_tree_b37.xml chrY/infosites.qcall > chrY/allYs_haplogroup.txt;
"| qsub -pe make 8 -cwd -j y -N yfitter -hold_jid ${jids})
#counting bam files
count=$(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'|wc -l)
if [ $count -gt 200 ]
then
echo "over 200 samples. try to by manually"
else
echo "false"
fi
####################################################################################
#Depth of Coverage
####################################################################################
mkdir doc
for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}');
do
job=$(echo -e "${java} -Xmx4g -jar ${gatk} -T DepthOfCoverage -R ${ref} -I ${sample}.recal.bam -o doc/${sample}.cov.out -L ${bait} -omitBaseOutput -omitLocusTable -omitIntervals --minMappingQuality 20 --minBaseQuality 20 --logging_level ERROR --summaryCoverageThreshold 20 --summaryCoverageThreshold 40 --summaryCoverageThreshold 60 --summaryCoverageThreshold 80 --summaryCoverageThreshold 100" | qsub -pe make 4 -cwd -j y -N doc_${sample})
done
####################################################################################
#genotype GVCF
####################################################################################
mkdir gvcf
jids=""
for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}');
do
job=$(echo -e "${java} -Xmx4g -jar ${gatk} -T HaplotypeCaller -R ${ref} -L ${bait} -I ${sample}.bam --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000 -stand_call_conf 30.0 -stand_emit_conf 10.0 -o gvcf/${sample}.hc.g.vcf" | qsub -pe make 8 -cwd -j y -N gvcf_${sample})
jids="gvcf_${sample}, ${jids}"
done
####################################################################################
#The gVCFs were combined into 298 groups (~sqrt(n) gVCFs in each group) and
#then joint genotyping of SNPs and indels was performed on all groups.
####################################################################################
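#The commands below combine every gVCF into a single group; a hedged sketch of
#the ~sqrt(n) grouping described above (commented out, assuming GNU split and
#the sample.list manifest built earlier) could look like this:
#group_size=$(awk -v n=${sample_count} 'BEGIN { printf "%d", int(sqrt(n)) + 1 }')
#split -l ${group_size} sample.list group_
#for grp in group_*; do
#grp_vcfs=$(awk '{printf " --variant gvcf/%s.hc.g.vcf", $1}' ${grp})
#${java} -Xmx15g -jar ${gatk} -T CombineGVCFs -R ${ref} -o final/${grp}.g.vcf ${grp_vcfs}
#done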
vcfs=""
for vcf in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}');
do
vcfs="--variant gvcf/${vcf}.hc.g.vcf ${vcfs} "
done
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T CombineGVCFs -R ${ref} -o final/group.g.vcf ${vcfs}" | qsub -pe make 8 -cwd -j y -N combinegvcf -hold_jid ${jids}
")
####################################################################################
#Genotyping each chromosome
####################################################################################
for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY
do
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -nt 11 -T GenotypeGVCFs -R ${ref} --variant final/group.g.vcf -L ${chr} -o final/grouped_${chr}.vcf --annotation StrandOddsRatio -A QualByDepth -A FisherStrand -A MappingQualityRankSumTest -A MappingQualityZero -A ReadPosRankSumTest -A InbreedingCoeff -A DepthPerAlleleBySample -A HaplotypeScore" | qsub -pe make 8 -cwd -j y -N genotypegvcf -hold_jid combinegvcf)
done
####################################################################################
#Merge genotype calls
####################################################################################
vcfs=""
for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY
do
vcfs="-V final/grouped_${chr}.vcf ${vcfs}"
done
job=$(echo -e "
${java} -Xmx15g -cp ${gatk} org.broadinstitute.gatk.tools.CatVariants -R ${ref} ${vcfs} -out final/merged_chr.vcf -assumeSorted" | qsub -pe make 8 -cwd -j y -N catvariant -hold_jid genotypegvcf
)
####################################################################################
#Select variants in the bait region
####################################################################################
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T SelectVariants -R ${ref} -nt 11 -V final/merged_chr.vcf -L ${bait} -o final/merged_chr_bait.vcf"| qsub -pe make 8 -cwd -j y -N bait -hold_jid catvariant
)
####################################################################################
#VQSR for SNP
####################################################################################
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T VariantRecalibrator -R ${ref} -nt 11 -input final/merged_chr_bait.vcf -resource:hapmap,known=false,training=true,truth=true,prior=15.0 ${hapmap} -resource:omni,known=false,training=true,truth=true,prior=12.0 ${omni} -resource:1000G,known=false,training=true,truth=false,prior=10.0 ${hc} -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 ${dbsnp} -an DP -an SOR -an QD -an FS -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff -mode SNP -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 -recalFile final/merged_chr_bait.snp.recal -tranchesFile final/merged_chr_bait.snp.tranches -rscriptFile final/merged_chr_bait.snp.plots.R" | qsub -pe make 8 -cwd -j y -N vqsr_snp -hold_jid bait
)
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T ApplyRecalibration -R ${ref} -nt 11 -input final/merged_chr_bait.vcf -mode SNP --ts_filter_level 99.0 -recalFile final/merged_chr_bait.snp.recal -tranchesFile final/merged_chr_bait.snp.tranches -o final/merged_chr_bait.snp.raw_indel.vcf" | qsub -pe make 8 -cwd -j y -N applyrecalsnp -hold_jid vqsr_snp
)
####################################################################################
#VQSR for INDEL
####################################################################################
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T VariantRecalibrator -R ${ref} -nt 11 -input final/merged_chr_bait.snp.raw_indel.vcf -resource:mills,known=true,training=true,truth=true,prior=12.0 ${mills} -an DP -an FS -an SOR -an QD -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff -mode INDEL -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 --maxGaussians 4 -recalFile final/merged_chr_bait.indel.recal -tranchesFile final/merged_chr_bait.indel.tranches -rscriptFile final/merged_chr_bait.indel.plots.R" | qsub -pe make 8 -cwd -j y -N vqsr_indel -hold_jid applyrecalsnp
)
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T ApplyRecalibration -R ${ref} -nt 11 -input final/merged_chr_bait.snp.raw_indel.vcf -mode INDEL --ts_filter_level 99.0 -recalFile final/merged_chr_bait.indel.recal -tranchesFile final/merged_chr_bait.indel.tranches -o final/merged_chr_bait.recal.vcf" | qsub -pe make 8 -cwd -j y -N apply_indel -hold_jid vqsr_indel
)
####################################################################################
#Select variants, excluding non-variant loci and filtered loci (trim remaining alleles by default)
#-env, --excludeNonVariants, -ef, --excludeFiltered
####################################################################################
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T SelectVariants -R $ref -nt 11 -V final/merged_chr_bait.recal.vcf -o final/merged_chr_bait.recal.filtered_only.vcf -env -ef" | qsub -pe make 8 -cwd -j y -N filter -hold_jid apply_indel
)
####################################################################################
#Extract autosomal SNPs
####################################################################################
job=$(echo -e "
${java} -Xmx15g -jar ${gatk} -T SelectVariants -R $ref -V final/merged_chr_bait.recal.filtered_only.vcf -selectType SNP -L chr1 -L chr2 -L chr3 -L chr4 -L chr5 -L chr6 -L chr7 -L chr8 -L chr9 -L chr10 -L chr11 -L chr12 -L chr13 -L chr14 -L chr15 -L chr16 -L chr17 -L chr18 -L chr19 -L chr20 -L chr21 -L chr22 -o final/final_auto_recal_bait.vcf" | qsub -pe make 8 -cwd -j y -N autosomal -hold_jid filter
)
####################################################################################
#Convert VCF to PLINK format
####################################################################################
mkdir plink
job=$(echo -e "
${vcftools} --vcf final/final_auto_recal_bait.vcf --plink --out plink/final.snp.auto" | qsub -pe make 8 -cwd -j y -N plink -hold_jid autosomal
)
####################################################################################
#MDS
####################################################################################
job=$(echo -e "
${plink} --file plink/final.snp.auto --maf 0.05 --geno 0.01 --hwe 0.001 --indep-pairwise 50 5 0.2 --out plink/all_prun --noweb;
${plink} --file plink/final.snp.auto --extract plink/all_prun.prune.in --make-bed --out plink/all_part_prun --noweb;
${plink} --bfile plink/all_part_prun --genome --out plink/all_part_prun_genome --noweb;
${plink} --bfile plink/all_part_prun --read-genome plink/all_part_prun_genome.genome --mds-plot 4 --out plink/all_part_prun_mds --noweb" | qsub -pe make 8 -cwd -j y -N mds -hold_jid plink
)
####################################################################################
#IBS & MDS plot
####################################################################################
job=$(echo "
cat plink/all_part_prun_genome.genome | awk '{if(\$0 !~/DST/) printf \"%.3f\n\", \$12}' | sort | uniq -c > plink/all_part_prun_dst.txt" | qsub -pe make 4 -cwd -j y -N awk_job -hold_jid mds
)
job=$(echo -e "
cd plink \n
R --slave --vanilla --quiet --no-save <<EOF \n
op<-par(family='Gentium') \n
png('ibs_dst.png') \n
DST<-read.table('all_part_prun_dst.txt') \n
plot(DST[,2], DST[,1], xlim=c(0.6,1),main='IBS DST') \n
dev.off() \n
EOF\n
#find cryptic relatedness \n
awk '{if(\$12>0.95) print}' all_part_prun_genome.genome \n
#MDS R plot \n
R --slave --vanilla --quiet --no-save <<EOF \n
# ---edit so path to R-graphics/.RData is correct--- \n
mds<-read.table('all_part_prun_mds.mds',header=T) \n
png('mds_plot.png') \n
plot(mds\$C1,mds\$C2, main='MDS plot - Korean',col='gray',pch=4) \n
dev.off() \n
EOF\n
#MDS & PCA plot with 1000genomes \n
#1000genomes \n
CEU='1000G_20101123_v3_GIANT_chr1_23_minimacnames_CEU_MAF0.01' \n
CHB='1000G_20101123_v3_GIANT_chr1_23_minimacnames_CHB_JPT_MAF0.01' \n
YRI='1000G_20101123_v3_GIANT_chr1_23_minimacnames_YRI_MAF0.01' \n
awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$CEU.bim | grep -v ambig > ceu.snplist.txt \n
awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$CHB.bim | grep -v ambig > chb.snplist.txt \n
awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$YRI.bim | grep -v ambig > yri.snplist.txt \n
${plink} --bfile \$CEU --extract ceu.snplist.txt --make-bed --out CEU --noweb \n
${plink} --bfile \$CHB --extract chb.snplist.txt --make-bed --out CHB --noweb \n
${plink} --bfile \$YRI --extract yri.snplist.txt --make-bed --out YRI --noweb \n
${plink} --file final.snp.auto --make-bed --out KOR --noweb \n
${plink} --bfile CEU --bmerge CHB.bed CHB.bim CHB.fam --make-bed --out CEU_CHB --noweb \n
${plink} --bfile CEU_CHB --bmerge YRI.bed YRI.bim YRI.fam --make-bed --out CEU_CHB_YRI --noweb \n
${plink} --bfile KOR --bmerge CEU_CHB_YRI.bed CEU_CHB_YRI.bim CEU_CHB_YRI.fam --out asian --noweb \n
${plink} --bfile KOR --exclude asian.missnp --make-bed --out KOR_flip --noweb \n
${plink} --bfile KOR_flip --bmerge CEU_CHB_YRI.bed CEU_CHB_YRI.bim CEU_CHB_YRI.fam --out asian_flip --noweb \n
${plink} --bfile asian_flip --maf 0.05 --geno 0.01 --hwe 0.001 --indep-pairwise 50 5 0.2 --out asian_prun --noweb \n
${plink} --bfile asian_flip --extract asian_prun.prune.in --make-bed --out asian_part_prun --noweb \n
${plink} --bfile asian_part_prun --genome --out asian_part_prun_genome --noweb \n
${plink} --bfile asian_part_prun --read-genome asian_part_prun_genome.genome --mds-plot 4 --out asian_part_prun_mds --noweb \n
R --slave --vanilla --quiet --no-save <<EOF \n
mds<-read.table('asian_part_prun_mds.mds',header=T) \n
png('asian_mds_plot.png') \n
plot(mds\$C1,mds\$C2,main='MDS plot',col='gray',pch=1) \n
legend('topright',legend=c('KOR','CEU','CHB','JPT','YRI'),pch=21,col=c('blue','red','yellow','maroon','green')) \n
for (i in 1:1339){ \n
points(mds\$C1[i],mds\$C2[i],pch=21,col='blue') \n
} \n
for (i in 1340:1422){ \n
points(mds\$C1[i],mds\$C2[i],pch=21,col='red') \n
} \n
for (i in 1423:1520){ \n
points(mds\$C1[i],mds\$C2[i],pch=21,col='yellow') \n
} \n
for (i in 1521:1608){ \n
points(mds\$C1[i],mds\$C2[i],pch=21,col='maroon') \n
} \n
for (i in 1609:1695){ \n
points(mds\$C1[i],mds\$C2[i],pch=21,col='green') \n
} \n
dev.off() \n
EOF" | qsub -pe make 4 -cwd -j y -N mds_plot --hold_jid awk_job
)
####################################################################################
#PCA
#EIGENSOFT PCA
####################################################################################
par='genotypename: asian_part_prun.bed
snpname: asian_part_prun.bim
indivname: asian_part_prun.fam
outputformat: EIGENSTRAT
genotypeoutname: asian_part_prun.geno
snpoutname: asian_part_prun.snp
indivoutname: asian_part_prun.ind
familynames: NO'
echo "$par" > plink/gwas.par
job=$(echo -e "
sudo apt-get update -y \n
sudo apt-get install libgsl0ldbl -y \n
PATH=$eigenstrat:$PATH; export PATH \n
cd plink \n
${eigenstrat}/convertf -p gwas.par \n
${eigenstrat}/smartpca.perl -i asian_part_prun.geno -a asian_part_prun.snp -b asian_part_prun.ind -o asian_part_prun.pca -p asian_part_prun.plot -e asian_part_prun.eval -l asian_part_prun.logs \n
R --slave --quiet --no-save <<EOF \n
op<-par(family='Gentium') \n
pca<-read.table('asian_part_prun.pca.evec',header=F) \n
png('asian_pca_plot.png') \n
plot(pca\$V2,pca\$V3,main='PCA plot',col='gray',pch=1) \n
legend('topright',legend=c('KOR','CEU','CHB','JPT','YRI'),pch=21,col=c('blue','red','yellow','maroon','green')) \n
for (i in 1:1320){ \n
points(pca\$V2[i],pca\$V3[i],pch=21,col='blue') \n
} \n
for (i in 1321:1402){ \n
points(pca\$V2[i],pca\$V3[i],pch=21,col='red') \n
} \n
for (i in 1403:1500){ \n
points(pca\$V2[i],pca\$V3[i],pch=21,col='yellow') \n
} \n
for (i in 1501:1588){ \n
points(pca\$V2[i],pca\$V3[i],pch=21,col='maroon') \n
} \n
for (i in 1589:1675){ \n
points(pca\$V2[i],pca\$V3[i],pch=21,col='green') \n
} \n
dev.off() \n
EOF" | qsub -pe make 4 -cwd -j y -N pca_plot
)
LOG "End"