Calling variants on cohorts of samples
| #!/bin/bash | |
| #$ -V | |
| #$ -S /bin/bash | |
| #Embarrassingly Parallel Computations Using the Sun Grid Engine | |
| #Array Job | |
| #qsub -pe make 3 -cwd -j y -N hong -t 1-3 test.sh | |
| #MANIFEST=sample1.txt | |
| #SAMPLE=`awk "NR==${SGE_TASK_ID}" ${MANIFEST}` | |
| #echo $SAMPLE | |
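| #Worked illustration (hypothetical manifest contents, not part of this pipeline): if | |
| #sample1.txt contains the three lines S001, S002 and S003, then array task 2 runs with | |
| #SGE_TASK_ID=2 and the awk lookup above returns S002: | |
| #  SAMPLE=`awk "NR==2" sample1.txt`   # -> S002 | |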
| pipe_name="Calling variants on cohorts of samples using GATK & PLINK " | |
| pipe_version="20151006" | |
| project="hongiiv" | |
| debug="true" | |
| set -x # activate debugging | |
| set -o pipefail # trace ERR through pipes | |
| set -o errtrace # trace ERR through 'time command' and other functions | |
| set -o nounset ## set -u : exit the script if you try to use an uninitialised variable | |
| set -o errexit ## set -e : exit the script if any statement returns a non-true return value | |
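| #A minimal sketch of how these options interact (comments only, nothing here is executed): | |
| #with pipefail + errexit, a failing command anywhere in a pipeline aborts the script, e.g. | |
| #  false | cat          # pipeline exits non-zero, so errexit stops the script | |
| #with nounset, expanding an unset variable such as ${NOT_SET} is likewise fatal. | |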
| #################################################################################### | |
| #error check | |
| #################################################################################### | |
| function error_exit { | |
| echo | |
| echo "$@" | |
| exit 1 | |
| } | |
| #trap the killer signals so that we can exit with a good message. | |
| trap "error_exit 'Received signal SIGHUP'" SIGHUP | |
| trap "error_exit 'Received signal SIGINT'" SIGINT | |
| trap "error_exit 'Received signal SIGTERM'" SIGTERM | |
| trap "error_exit 'Received signal SIGPIPE'" SIGPIPE | |
| trap "error_exit 'Received signal SIGKILL'" SIGKILL | |
| #################################################################################### | |
| #System environment | |
| #################################################################################### | |
| TMPDIR=/GLUSTER/KEC/KEC_VCF/hongiiv_test/temp; export TMPDIR | |
| ulimit -n 2048 | |
| #################################################################################### | |
| #software | |
| #################################################################################### | |
| gatk="/BIO/app/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar" | |
| java="/usr/bin/java" #version 1.7.0_25 | |
| vcftools="/BIO/app/vcftools_0.1.10/bin/vcftools" | |
| plink="/GLUSTER/KEC/KEC_VCF/hongiiv_test/plink" #v1.07 (10/Aug/2009) | |
| samtools="/NAS/tools/samtools-0.1.19/samtools" | |
| bcftools="/NAS/tools/samtools-0.1.19/bcftools/bcftools" | |
| Yfitter="/NAS/tools/Yfitter/Yfitter" #v0.2 | |
| Yfitter_dir="/NAS/tools/Yfitter" | |
| python="/usr/bin/python" #2.7.3 | |
| conifer="/NAS/tools/conifer_v0.2.2/conifer.py" | |
| eigenstrat="/NAS/tools/EIG/bin" #v6.0.1 (12/12/14) | |
| #################################################################################### | |
| #gatk bundle (references) | |
| #################################################################################### | |
| bait="/NAS/tools/korean_DB_bait_intersect.with_chr_final.bed" | |
| ref="/NAS/tools/hg19_macrogen_short.fa" #contig chrM,chr1,chr2 | |
| hapmap="/DB/reference_data/gatk_bundle/2.8/hg19/hapmap_3.3.hg19.vcf" | |
| omni2="/DB/reference_data/gatk_bundle/2.8/b37/1000G_omni2.5.b37.vcf" | |
| omni="/DB/reference_data/gatk_bundle/2.8/hg19/1000G_omni2.5.hg19.vcf" | |
| hc="/DB/reference_data/gatk_bundle/2.8/hg19/1000G_phase1.snps.high_confidence.hg19.vcf" | |
| dbsnp="/DB/reference_data/gatk_bundle/2.8/hg19/dbsnp_138.hg19.vcf" | |
| dbsnp144="/BIO/reference/00-All.vcf" #contig 1,2,3 | |
| mills="/DB/reference_data/gatk_bundle/2.8/hg19/Mills_and_1000G_gold_standard.indels.hg19.vcf" | |
| #################################################################################### | |
| #functions | |
| #################################################################################### | |
| time_start=$(date +%s) | |
| time_now=${time_start} | |
| time_end=${time_start} | |
| LOG_FILE="logs/${project}-${time_start}.log" | |
| LOG() { | |
| time_now=$(date +%s) | |
| #echo "[${project}][$(date "+%F %T")][time: $((${time_now} - ${time_end})) sec] $*" >> ${LOG_FILE} | |
| echo "[$(date "+%F %T")] $*" >> ${LOG_FILE} | |
| time_end=${time_now} | |
| } | |
| DEBUG() { | |
| if [ "$debug" == "true" ]; then | |
| echo "[DEBUG] [$(date "+%F %T")] $*" >> ${LOG_FILE} | |
| fi | |
| } | |
| if [ -d "logs" ]; then | |
| LOG "logs directory exists" | |
| else | |
| mkdir logs | |
| fi | |
| #Compare the read count implied by a gzipped R1 FASTQ ($1) with the record count of a BAM ($2) | |
| check_line_size_bam() { | |
| fastqR1linesize=`zcat $1 | wc -l | cut -d' ' -f1` | |
| #lines/2 = (lines/4 reads in R1) x 2 mates = expected paired-end alignment records | |
| fastqhalfsize=`expr ${fastqR1linesize} / 2` | |
| bamlinesize=`${samtools} view $2 | wc -l | cut -d' ' -f1` | |
| LOG "$1 half line size: ${fastqhalfsize}" | |
| LOG "$2 line size: ${bamlinesize}" | |
| if [ ${fastqhalfsize} -ne ${bamlinesize} ]; then | |
| LOG "check_line_size_bam failed!" | |
| fi | |
| } | |
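| #Worked example for check_line_size_bam with made-up numbers: an R1 FASTQ of 4,000 lines | |
| #holds 1,000 reads (4 lines per read), so a paired-end BAM should hold ~2,000 primary | |
| #alignments; 4,000 / 2 = 2,000 is the value compared above. Hypothetical call: | |
| #  check_line_size_bam sampleA_R1.fastq.gz sampleA.bam | |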
| #################################################################################### | |
| #sample list | |
| #################################################################################### | |
| if [ -f "sample.list" ]; then | |
| LOG "Delete previous sample.list" | |
| rm -rf sample.list | |
| else | |
| touch sample.list | |
| fi | |
| sample_count="0" | |
| DEBUG "==========================Sample List==========================" | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| sample_count=$((${sample_count}+1)) | |
| DEBUG $sample; | |
| echo $sample >> sample.list | |
| done | |
| DEBUG "===============================================================" | |
| LOG "Total samples: ${sample_count}" | |
| #################################################################################### | |
| #Calculate RPKM (Reads Per Kilobase per Million mapped reads) | |
| #RPKM = numReads / (geneLength/1,000 * totalNumReads/1,000,000) | |
| #numReads: number of reads mapped to a gene sequence | |
| #geneLength: length of the gene sequence | |
| #totalNumReads: total number of mapped reads of a sample | |
| #################################################################################### | |
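| #Worked example with made-up numbers: numReads=500, geneLength=2,000 bp and | |
| #totalNumReads=10,000,000 give RPKM = 500 / (2 * 10) = 25; the same arithmetic in awk: | |
| #  awk 'BEGIN { print 500 / ((2000/1000) * (10000000/1000000)) }'   # prints 25 | |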
| if [ -d "RPKM" ]; then | |
| LOG "RPKM directory exists" | |
| else | |
| mkdir RPKM | |
| fi | |
| # 1. Using job array | |
| manifest="sample.list" | |
| job=$(echo -e " | |
| #!/bin/bash | |
| # | |
| #$ -cwd | |
| #$ -j y | |
| #$ -S /bin/bash | |
| # | |
| sample=\`awk \"NR==\${SGE_TASK_ID}\" ${manifest}\` \n | |
| # expected location of output and MD5 verification files \n | |
| OUTFILE=RPKM/\${sample}_analysis.hdf5 \n | |
| MD5FILE=\${OUTFILE}.md5 \n | |
| # check if MD5 file already exists and matches output file \n | |
| if [[ -f \$OUTFILE && -f \$MD5FILE && \`md5sum \${OUTFILE} | cut -f1 -d\" \"\` = \`cat \${MD5FILE} | cut -f1 -d\" \"\` ]]; then | |
| # task was previously run successfully, skip this time \n | |
| echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tSKIP" >> $LOG_FILE \n | |
| exit 0 \n | |
| else \n | |
| # do the main work \n | |
| date | |
| hostname | |
| # check exit status \n | |
| STATUS=\$? \n | |
| if [[ \$STATUS -eq 0 ]]; then \n | |
| if [[ -f \$OUTFILE ]]; then \n | |
| echo \`md5sum \$OUTFILE > \$MD5FILE\` \n | |
| else | |
| echo "check output file fail" >> $LOG_FILE \n | |
| echo -e [\`date\`] "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tFAIL\\t\${STATUS}" >> $LOG_FILE \n | |
| exit 0 \n | |
| fi \n | |
| echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tOK" >> $LOG_FILE \n | |
| else \n | |
| echo -e \`date\` "\\t\${JOB_NAME}\\t\${JOB_ID}\\t\${SGE_TASK_ID}\\t\${sample}\\tFAIL\\t\${STATUS}" >> $LOG_FILE \n | |
| fi \n | |
| exit \$STATUS \n | |
| fi" | qsub -pe make 8 -cwd -j y -N STEP1_RPKM -t 1-${sample_count} | |
| ) | |
| DEBUG "${job}" | |
| # 2. Using a for loop | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e " | |
| ${python} ${conifer} rpkm --probes ${bait} --input ${sample}.sorted.dedup.realigned.recalibrated.sorted.bam --output RPKM/${sample}.rpkm.txt; | |
| ${python} ${conifer} analyze --probes ${bait} --rpkm_dir ./RPKM/ --output RPKM/${sample}_analysis.hdf5 --svd 6 --write_svals RPKM/${sample}_singular_values.txt --plot_scree RPKM/${sample}_screeplot.png --write_sd RPKM/${sample}_sd_values.txt; | |
| ${python} ${conifer} call --input RPKM/${sample}_analysis.hdf5 --output RPKM/${sample}_calls.txt; | |
| ${python} ${conifer} plotcalls --input RPKM/${sample}_analysis.hdf5 --calls RPKM/${sample}_calls.txt --outputdir ./RPKM/;" | | |
| qsub -pe make 8 -cwd -j y -N rpkm_${sample} | |
| ) | |
| DEBUG $job | |
| done | |
| #################################################################################### | |
| #Mitochondrial haplogroup assignment | |
| #Online submission of the query to HaploGrep at http://haplogrep.uibk.ac.at | |
| #Heterozygous calls are assigned 'N' using the UNIX command line | |
| #################################################################################### | |
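| #The exact masking command is not shown in this gist; one hedged sketch is to list the | |
| #heterozygous chrM genotypes (GT 0/1, column 10 of the single-sample VCF produced below, | |
| #SAMPLE being a placeholder) and set those positions to N before uploading to HaploGrep: | |
| #  awk '!/^#/ && $10 ~ /^0\/1/ {print $1"\t"$2}' chrM/SAMPLE_chrM.vcf > chrM/SAMPLE_het_positions.txt | |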
| mkdir chrM | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e " | |
| ${samtools} view -h -b -o chrM/${sample}.chrM.bam ${sample}.bam chrM; | |
| ${samtools} mpileup -E -q 30 -Q 30 -C 50 -r chrM:1-16569 -uf ${ref} chrM/${sample}.chrM.bam | ${bcftools} view -cg - > chrM/${sample}_chrM.vcf; | |
| ${java} -jar ${gatk} -T FastaAlternateReferenceMaker -R ${ref} -o chrM/${sample}_chrM.fasta --variant chrM/${sample}_chrM.vcf; | |
| ${python} fas2hsd.py chrM/${sample}_chrM.fasta | |
| " | qsub -pe make 8 -cwd -j y -N mitho_${sample}) | |
| done | |
| #################################################################################### | |
| #Chromosome Y haplogroup assignment | |
| #To assign chromosome Y haplogroups for the male Korean samples, | |
| #samtools and bcftools are used to generate the input file for the haplogroup assignment software YFitter v0.2 | |
| #################################################################################### | |
| mkdir chrY | |
| samples="" | |
| jids="" | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| #collect the chrY-only BAMs produced below for the joint mpileup | |
| samples="chrY/${sample}.chrY.bam ${samples}" | |
| job=$(echo -e "${samtools} view -h -b -o chrY/${sample}.chrY.bam ${sample}.bam chrY" | qsub -pe make 8 -cwd -j y -N chrY_${sample}) | |
| jids="${jids:+${jids},}chrY_${sample}" | |
| done | |
| job=$(echo -e " | |
| ${samtools} mpileup -uf /NAS/tools/chrY.fa -C 50 -Q 20 -q 20 ${samples} > chrY/allYs.bcf; | |
| ${bcftools} view -Q chrY/allYs.bcf ${Yfitter_dir}/karafet_sites_b37.pos > chrY/infosites.qcall; | |
| ${Yfitter} -m ${Yfitter_dir}/karafet_tree_b37.xml chrY/infosites.qcall > chrY/allYs_haplogroup.txt; | |
| "| qsub -pe make 8 -cwd -j y -N yfitter -hold_jid ${jids}) | |
| #counting bam files | |
| count=$(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'|wc -l) | |
| if [ $count -gt 200 ] | |
| then | |
| echo "over 200 samples. try to by manually" | |
| else | |
| echo "false" | |
| fi | |
| #################################################################################### | |
| #Depth of Coverage | |
| #################################################################################### | |
| mkdir doc | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e "${java} -Xmx4g -jar ${gatk} -T DepthOfCoverage -R ${ref} -I ${sample}.recal.bam -o doc/${sample}.cov.out -L ${bait} -omitBaseOutput -omitLocusTable -omitIntervals --minMappingQuality 20 --minBaseQuality 20 --logging_level ERROR --summaryCoverageThreshold 20 --summaryCoverageThreshold 40 --summaryCoverageThreshold 60 --summaryCoverageThreshold 80 --summaryCoverageThreshold 100" | qsub -pe make 4 -cwd -j y -N doc_${sample}) | |
| done | |
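| #DepthOfCoverage typically writes a per-sample summary alongside the -o prefix (e.g. | |
| #doc/SAMPLE.cov.out.sample_summary) whose columns include the mean coverage and the | |
| #%_bases_above_<threshold> values requested above; a hedged sketch for extracting the mean: | |
| #  awk -F'\t' 'NR>1 && $1!="Total" {print $1, $3}' doc/SAMPLE.cov.out.sample_summary | |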
| #################################################################################### | |
| #genotype GVCF | |
| #################################################################################### | |
| mkdir gvcf | |
| jids="" | |
| for sample in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| job=$(echo -e "${java} -Xmx4g -jar ${gatk} -T HaplotypeCaller -R ${ref} -L ${bait} -I ${sample}.bam --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000 -stand_call_conf 30.0 -stand_emit_conf 10.0 -o gvcf/${sample}.hc.g.vcf" | qsub -pe make 8 -cwd -j y -N gvcf_${sample}) | |
| jids="gvcf_${sample}, ${jids}" | |
| done | |
| #################################################################################### | |
| #The gVCFs were combined in 298 groups (~sqrt(n) gVCFs in each group) and | |
| #joint genotyping of SNPs and indels was then performed on all groups. | |
| #################################################################################### | |
| mkdir final | |
| vcfs="" | |
| for vcf in $(ls -alh|grep bam$|awk '{print $9}'|awk -F '.bam' '{print $1}'); | |
| do | |
| vcfs="--variant gvcf/${vcf}.hc.g.vcf ${vcfs}" | |
| done | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T CombineGVCFs -R ${ref} -o final/group.g.vcf ${vcfs}" | qsub -pe make 8 -cwd -j y -N combinegvcf -hold_jid ${jids} | |
| ") | |
| #################################################################################### | |
| #Genotyping each chromosome | |
| #################################################################################### | |
| for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY | |
| do | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -nt 11 -T GenotypeGVCFs -R ${ref} --variant final/group.g.vcf -L ${chr} -o final/grouped_${chr}.vcf --annotation StrandOddsRatio -A QualByDepth -A FisherStrand -A MappingQualityRankSumTest -A MappingQualityZero -A ReadPosRankSumTest -A InbreedingCoeff -A DepthPerAlleleBySample -A HaplotypeScore" | qsub -pe make 8 -cwd -j y -N genotypegvcf -hold_jid combinegvcf) | |
| done | |
| #################################################################################### | |
| #Merge genotype calls | |
| #################################################################################### | |
| vcfs="" | |
| for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrM chrX chrY | |
| do | |
| vcfs="-V final/grouped_${chr}.vcf ${vcfs}" | |
| done | |
| job=$(echo -e " | |
| ${java} -Xmx15g -cp ${gatk} org.broadinstitute.gatk.tools.CatVariants -R ${ref} ${vcfs} -out final/merged_chr.vcf -assumeSorted" | qsub -pe make 8 -cwd -j y -N catvariant -hold_jid genotypegvcf | |
| ) | |
| #################################################################################### | |
| #Select variants in the bait region | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T SelectVariants -R ${ref} -nt 11 -V final/merged_chr.vcf -L ${bait} -o final/merged_chr_bait.vcf"| qsub -pe make 8 -cwd -j y -N bait -hold_jid catvariant | |
| ) | |
| #################################################################################### | |
| #VQSR for SNP | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T VariantRecalibrator -R ${ref} -nt 11 -input final/merged_chr_bait.vcf -resource:hapmap,known=false,training=true,truth=true,prior=15.0 ${hapmap} -resource:omni,known=false,training=true,truth=true,prior=12.0 ${omni} -resource:1000G,known=false,training=true,truth=false,prior=10.0 ${hc} -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 ${dbsnp} -an DP -an SOR -an QD -an FS -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff -mode SNP -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 -recalFile final/merged_chr_bait.snp.recal -tranchesFile final/merged_chr_bait.snp.tranches -rscriptFile final/merged_chr_bait.snp.plots.R" | qsub -pe make 8 -cwd -j y -N vqsr_snp -hold_jid bait | |
| ) | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T ApplyRecalibration -R ${ref} -nt 11 -input final/merged_chr_bait.vcf -mode SNP --ts_filter_level 99.0 -recalFile final/merged_chr_bait.snp.recal -tranchesFile final/merged_chr_bait.snp.tranches -o final/merged_chr_bait.snp.raw_indel.vcf" | qsub -pe make 8 -cwd -j y -N applyrecalsnp -hold_jid vqsr_snp | |
| ) | |
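| #To sanity-check the SNP recalibration, the FILTER column of the output VCF can be tallied; | |
| #PASS sites met the 99.0 tranche cutoff and the rest carry tranche labels (illustrative): | |
| #  grep -v '^#' final/merged_chr_bait.snp.raw_indel.vcf | cut -f7 | sort | uniq -c | |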
| #################################################################################### | |
| #VQSR for INDEL | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T VariantRecalibrator -R ${ref} -nt 11 -input final/merged_chr_bait.snp.raw_indel.vcf -resource:mills,known=true,training=true,truth=true,prior=12.0 ${mills} -an DP -an FS -an SOR -an QD -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff -mode INDEL -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 --maxGaussians 4 -recalFile final/merged_chr_bait.indel.recal -tranchesFile final/merged_chr_bait.indel.tranches -rscriptFile final/merged_chr_bait.indel.plots.R" | qsub -pe make 8 -cwd -j y -N vqsr_indel -hold_jid applyrecalsnp | |
| ) | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T ApplyRecalibration -R ${ref} -nt 11 -input final/merged_chr_bait.snp.raw_indel.vcf -mode INDEL --ts_filter_level 99.0 -recalFile final/merged_chr_bait.indel.recal -tranchesFile final/merged_chr_bait.indel.tranches -o final/merged_chr_bait.recal.vcf" | qsub -pe make 8 -cwd -j y -N apply_indel -hold_jid vqsr_indel | |
| ) | |
| #################################################################################### | |
| #Select variants, excluding non-variant loci and filtered loci (remaining alleles are trimmed by default) | |
| #-env, --excludeNonVariants, -ef, --excludeFiltered | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T SelectVariants -R $ref -nt 11 -V final/merged_chr_bait.recal.vcf -o final/merged_chr_bait.recal.filtered_only.vcf -env -ef" | qsub -pe make 8 -cwd -j y -N filter -hold_jid apply_indel | |
| ) | |
| #################################################################################### | |
| #Extract autosomal SNPs | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${java} -Xmx15g -jar ${gatk} -T SelectVariants -R $ref -V final/merged_chr_bait.recal.filtered_only.vcf -selectType SNP -L chr1 -L chr2 -L chr3 -L chr4 -L chr5 -L chr6 -L chr7 -L chr8 -L chr9 -L chr10 -L chr11 -L chr12 -L chr13 -L chr14 -L chr15 -L chr16 -L chr17 -L chr18 -L chr19 -L chr20 -L chr21 -L chr22 -o final/final_auto_recal_bait.vcf" | qsub -pe make 8 -cwd -j y -N autosomal -hold_jid filter | |
| ) | |
| #################################################################################### | |
| #Convert VCF to PLINK format | |
| #################################################################################### | |
| mkdir plink | |
| job=$(echo -e " | |
| ${vcftools} --vcf final/final_auto_recal_bait.vcf --plink --out plink/final.snp.auto" | qsub -pe make 8 -cwd -j y -N plink -hold_jid autosomal | |
| ) | |
| #################################################################################### | |
| #MDS | |
| #################################################################################### | |
| job=$(echo -e " | |
| ${plink} --file plink/final.snp.auto --maf 0.05 --geno 0.01 --hwe 0.001 --indep-pairwise 50 5 0.2 --out plink/all_prun --noweb; | |
| ${plink} --file plink/final.snp.auto --extract plink/all_prun.prune.in --make-bed --out plink/all_part_prun --noweb; | |
| ${plink} --bfile plink/all_part_prun --genome --out plink/all_part_prun_genome --noweb; | |
| ${plink} --bfile plink/all_part_prun --read-genome plink/all_part_prun_genome.genome --mds-plot 4 --out plink/all_part_prun_mds --noweb" | qsub -pe make 8 -cwd -j y -N mds -hold_jid plink | |
| ) | |
| #################################################################################### | |
| #IBS & MDS plot | |
| #################################################################################### | |
| job=$(echo " | |
| cat plink/all_part_prun_genome.genome | awk '{if(\$0 !~/DST/) printf \"%.3f\n\", \$12}' | sort | uniq -c > plink/all_part_prun_dst.txt" | qsub -pe make 4 -cwd -j y -N awk_job -hold_jid mds | |
| ) | |
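| #For reference, PLINK .genome columns are FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE | |
| #DST PPC RATIO, so the $12 used above and below is DST (IBS distance); the header can be | |
| #checked with: | |
| #  head -1 plink/all_part_prun_genome.genome | |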
| job=$(echo -e " | |
| cd plink \n | |
| R --slave --vanilla --quiet --no-save <<EOF \n | |
| op<-par(family='Gentium') \n | |
| png('ibs_dst.png') \n | |
| DST<-read.table('all_part_prun_dst.txt') \n | |
| plot(DST[,2], DST[,1], xlim=c(0.6,1),main='IBS DST') \n | |
| dev.off() \n | |
| EOF\n | |
| #find cryptic relatedness \n | |
| awk '{if(\$12>0.95) print}' all_part_prun_genome.genome \n | |
| #MDS R plot \n | |
| R --slave --vanilla --quiet --no-save <<EOF \n | |
| # ---edit so path to R-graphics/.RData is correct--- \n | |
| mds<-read.table('all_part_prun_mds.mds',header=T) \n | |
| png('mds_plot.png') \n | |
| plot(mds\$C1,mds\$C2, main='MDS plot - Korean',col='gray',pch=4) \n | |
| dev.off() \n | |
| EOF\n | |
| #MDS & PCA plot with 1000genomes \n | |
| #1000genomes \n | |
| CEU='1000G_20101123_v3_GIANT_chr1_23_minimacnames_CEU_MAF0.01' \n | |
| CHB='1000G_20101123_v3_GIANT_chr1_23_minimacnames_CHB_JPT_MAF0.01' \n | |
| YRI='1000G_20101123_v3_GIANT_chr1_23_minimacnames_YRI_MAF0.01' \n | |
| awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$CEU.bim | grep -v ambig > ceu.snplist.txt \n | |
| awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$CHB.bim | grep -v ambig > chb.snplist.txt \n | |
| awk '{if((length(\$5) >= 2)||(length(\$6) >= 2)) print \$2, \"ambig\";else print \$2;}' \$YRI.bim | grep -v ambig > yri.snplist.txt \n | |
| ${plink} --bfile \$CEU --extract ceu.snplist.txt --make-bed --out CEU --noweb \n | |
| ${plink} --bfile \$CHB --extract chb.snplist.txt --make-bed --out CHB --noweb \n | |
| ${plink} --bfile \$YRI --extract yri.snplist.txt --make-bed --out YRI --noweb \n | |
| ${plink} --file final.snp.auto --make-bed --out KOR --noweb \n | |
| ${plink} --bfile CEU --bmerge CHB.bed CHB.bim CHB.fam --make-bed --out CEU_CHB --noweb \n | |
| ${plink} --bfile CEU_CHB --bmerge YRI.bed YRI.bim YRI.fam --make-bed --out CEU_CHB_YRI --noweb \n | |
| ${plink} --bfile KOR --bmerge CEU_CHB_YRI.bed CEU_CHB_YRI.bim CEU_CHB_YRI.fam --out asian --noweb \n | |
| ${plink} --bfile KOR --exclude asian.missnp --make-bed --out KOR_flip --noweb \n | |
| ${plink} --bfile KOR_flip --bmerge CEU_CHB_YRI.bed CEU_CHB_YRI.bim CEU_CHB_YRI.fam --out asian_flip --noweb \n | |
| ${plink} --bfile asian_flip --maf 0.05 --geno 0.01 --hwe 0.001 --indep-pairwise 50 5 0.2 --out asian_prun --noweb \n | |
| ${plink} --bfile asian_flip --extract asian_prun.prune.in --make-bed --out asian_part_prun --noweb \n | |
| ${plink} --bfile asian_part_prun --genome --out asian_part_prun_genome --noweb \n | |
| ${plink} --bfile asian_part_prun --read-genome asian_part_prun_genome.genome --mds-plot 4 --out asian_part_prun_mds --noweb \n | |
| R --slave --vanilla --quiet --no-save <<EOF \n | |
| mds<-read.table('asian_part_prun_mds.mds',header=T) \n | |
| png('asian_mds_plot.png') \n | |
| plot(mds\$C1,mds\$C2,main='MDS plot',col='gray',pch=1) \n | |
| legend('topright',legend=c('KOR','CEU','CHB','JPT','YRI'),pch=21,col=c('blue','red','yellow','maroon','green')) \n | |
| for (i in 1:1339){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='blue') \n | |
| } \n | |
| for (i in 1340:1422){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='red') \n | |
| } \n | |
| for (i in 1423:1520){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='yellow') \n | |
| } \n | |
| for (i in 1521:1608){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='maroon') \n | |
| } \n | |
| for (i in 1609:1695){ \n | |
| points(mds\$C1[i],mds\$C2[i],pch=21,col='green') \n | |
| } \n | |
| dev.off() \n | |
| EOF" | qsub -pe make 4 -cwd -j y -N mds_plot --hold_jid awk_job | |
| ) | |
| #################################################################################### | |
| #PCA | |
| #EIGENSOFT PCA | |
| #################################################################################### | |
| par='genotypename: asian_part_prun.bed | |
| snpname: asian_part_prun.bim | |
| indivname: asian_part_prun.fam | |
| outputformat: EIGENSTRAT | |
| genotypeoutname: asian_part_prun.geno | |
| snpoutname: asian_part_prun.snp | |
| indivoutname: asian_part_prun.ind | |
| familynames: NO' | |
| echo "$par" > plink/gwas.par | |
| job=$(echo -e " | |
| sudo apt-get update -y \n | |
| sudo apt-get install libgsl0ldbl -y \n | |
| PATH=$eigenstrat:$PATH; export PATH \n | |
| cd plink \n | |
| ${eigenstrat}/convertf -p gwas.par \n | |
| ${eigenstrat}/smartpca.perl -i asian_part_prun.geno -a asian_part_prun.snp -b asian_part_prun.ind -o asian_part_prun.pca -p asian_part_prun.plot -e asian_part_prun.eval -l asian_part_prun.logs \n | |
| R --slave --quiet --no-save <<EOF \n | |
| op<-par(family='Gentium') \n | |
| pca<-read.table('asian_part_prun.pca.evec',header=F) \n | |
| png('asian_pca_plot.png') \n | |
| plot(pca\$V2,pca\$V3,main='PCA plot',col='gray',pch=1) \n | |
| legend('topright',legend=c('KOR','CEU','CHB','JPT','YRI'),pch=21,col=c('blue','red','yellow','maroon','green')) \n | |
| for (i in 1:1320){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='blue') \n | |
| } \n | |
| for (i in 1321:1402){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='red') \n | |
| } \n | |
| for (i in 1403:1500){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='yellow') \n | |
| } \n | |
| for (i in 1501:1588){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='maroon') \n | |
| } \n | |
| for (i in 1589:1675){ \n | |
| points(pca\$V2[i],pca\$V3[i],pch=21,col='green') \n | |
| } \n | |
| dev.off() \n | |
| EOF" | qsub -pe make 4 -cwd -j y -N pca_plot | |
| ) | |
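| #For reference, the smartpca .evec output begins with an #eigvals: line (skipped by | |
| #read.table as a comment) followed by one row per sample: sample ID, one column per | |
| #principal component, then a status label; hence V2 and V3 above are PC1 and PC2, e.g. | |
| #  head -2 plink/asian_part_prun.pca.evec | |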
| LOG "End" |