Last active
October 10, 2018 11:18
-
-
Save martinholub/2dbb4cfcb28d12aed0e0302afc58cb57 to your computer and use it in GitHub Desktop.
Revisions
-
martinholub revised this gist
Oct 10, 2018 . 1 changed file with 5 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -88,6 +88,11 @@ def get_transcript_snvs(self): # Combine transcript name with the SNV information # Converts the coordinates to transcriptomic ones # `$11-$4+1` computes coordinate of SNV along transcript,where # $4 start of transcript and $11 postion of SNV both in chromosme-based coords. # transcripts file carries information on transcript # columns $12..$17 carry information on the SNV itself command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}" command = command.format(transcripts_vcf, transcripts, self.out_path) subprocess.run(command, shell = True, check = True) -
martinholub created this gist
Oct 10, 2018 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,101 @@ class TranscriptomicSNP(object): """ Generate a VCF file with transcriptomic coordinates Gets transcriptomic snp loci in format <transc> <SNPpos> <additional_info...>. # available: # <chrom> <transc> <start> <end> # <chrom> <SNVpos> """ def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version = True): self.anno_path = anno_path self.vcf_path = vcf_path self.out_path = out_path self.do_add_transcript_version = do_add_transcript_version @property def anno_path(self): """Path to annotation GTF file""" return self._anno_path @anno_path.setter def anno_path(self, value): value = value.rstrip("/") value = os.path.expanduser(value) assert os.path.isfile(value), "File {} doesn't exist".format(value) self._anno_path = value @property def vcf_path(self): """Path to annotation VCF file""" return self._vcf_path @vcf_path.setter def vcf_path(self, value): value = value.rstrip("/") value = os.path.expanduser(value) assert os.path.isfile(value), "File {} doesn't exist".format(value) self._vcf_path = value @property def out_path(self): return self._out_path @out_path.setter def out_path(self, value): value = rm_ext(value, "gz") value = value.rstrip("/") value = os.path.expanduser(value) self._out_path = value def get_transcript_snvs(self): """Generate a VCF file with transcriptomic coordinates Refernces: https://docs.python.org/3/library/subprocess.html https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html """ # Extract transcripts from GTF file, sort _, transcripts_gtf = tempfile.mkstemp() command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'.format(self.anno_path,transcripts_gtf) _ = subprocess.run(command, check = True, shell = True) # Get only SNVs from VCF, to save some memory/time _, snv_vcf = tempfile.mkstemp() command = 'grep -P "(^#|TSA=SNV)" {} > {}'.format(self.vcf_path, snv_vcf) subprocess.run(command, check = True, shell = True) # Get such transcripts, that have an overlap with a known SNV # Exploits fact that both coordinates are chromosome-based # writes information from both files next to each other _, transcripts_vcf = tempfile.mkstemp() command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}" command = command.format(transcripts_gtf, snv_vcf, transcripts_vcf) subprocess.run(command, check = True, shell = True) # Pull out name of transcripts as obtained in previous step _, transcripts = tempfile.mkstemp() if self.do_add_transcript_version: command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\"; " command += "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}" else: command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\".*/\\1/' > {}" command = command.format(transcripts_vcf, transcripts) subprocess.run(command, check = True, shell = True) # Combine transcript name with the SNV information # Converts the coordinates to transcriptomic ones command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}" command = command.format(transcripts_vcf, transcripts, self.out_path) subprocess.run(command, shell = True, check = True) # Zip and index command = "bgzip --force {0} && tabix -p vcf {0}.gz".format(self.out_path) subprocess.run(command, shell = True, check = True) # Cleanup for f in [transcripts_gtf, snv_vcf, transcripts, transcripts_vcf]: os.remove(f)