class TranscriptomicSNP(object): """ Generate a VCF file with transcriptomic coordinates Gets transcriptomic snp loci in format . # available: # # """ def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version = True): self.anno_path = anno_path self.vcf_path = vcf_path self.out_path = out_path self.do_add_transcript_version = do_add_transcript_version @property def anno_path(self): """Path to annotation GTF file""" return self._anno_path @anno_path.setter def anno_path(self, value): value = value.rstrip("/") value = os.path.expanduser(value) assert os.path.isfile(value), "File {} doesn't exist".format(value) self._anno_path = value @property def vcf_path(self): """Path to annotation VCF file""" return self._vcf_path @vcf_path.setter def vcf_path(self, value): value = value.rstrip("/") value = os.path.expanduser(value) assert os.path.isfile(value), "File {} doesn't exist".format(value) self._vcf_path = value @property def out_path(self): return self._out_path @out_path.setter def out_path(self, value): value = rm_ext(value, "gz") value = value.rstrip("/") value = os.path.expanduser(value) self._out_path = value def get_transcript_snvs(self): """Generate a VCF file with transcriptomic coordinates Refernces: https://docs.python.org/3/library/subprocess.html https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html """ # Extract transcripts from GTF file, sort _, transcripts_gtf = tempfile.mkstemp() command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'.format(self.anno_path,transcripts_gtf) _ = subprocess.run(command, check = True, shell = True) # Get only SNVs from VCF, to save some memory/time _, snv_vcf = tempfile.mkstemp() command = 'grep -P "(^#|TSA=SNV)" {} > {}'.format(self.vcf_path, snv_vcf) subprocess.run(command, check = True, shell = True) # Get such transcripts, that have an overlap with a known SNV # Exploits fact that both coordinates are chromosome-based # writes information from both files next to each other _, transcripts_vcf = tempfile.mkstemp() command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}" command = command.format(transcripts_gtf, snv_vcf, transcripts_vcf) subprocess.run(command, check = True, shell = True) # Pull out name of transcripts as obtained in previous step _, transcripts = tempfile.mkstemp() if self.do_add_transcript_version: command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\"; " command += "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}" else: command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\".*/\\1/' > {}" command = command.format(transcripts_vcf, transcripts) subprocess.run(command, check = True, shell = True) # Combine transcript name with the SNV information # Converts the coordinates to transcriptomic ones # `$11-$4+1` computes coordinate of SNV along transcript,where # $4 start of transcript and $11 postion of SNV both in chromosme-based coords. # transcripts file carries information on transcript # columns $12..$17 carry information on the SNV itself command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}" command = command.format(transcripts_vcf, transcripts, self.out_path) subprocess.run(command, shell = True, check = True) # Zip and index command = "bgzip --force {0} && tabix -p vcf {0}.gz".format(self.out_path) subprocess.run(command, shell = True, check = True) # Cleanup for f in [transcripts_gtf, snv_vcf, transcripts, transcripts_vcf]: os.remove(f)