martinholub · October 10, 2018 11:18 · Oct 10, 2018 · Oct 10, 2018
diff --git a/transcript_snp.py b/transcript_snp.py
@@ -88,6 +88,11 @@ def get_transcript_snvs(self):
 
         # Combine transcript name with the SNV information
         # Converts the coordinates to transcriptomic ones
+
+        # `$11-$4+1` computes coordinate of SNV along transcript,where
+        # $4 start of transcript and $11 postion of SNV both in chromosme-based coords.
+        # transcripts file carries information on transcript
+        # columns $12..$17 carry information on the SNV itself
         command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}"
         command = command.format(transcripts_vcf, transcripts, self.out_path)
         subprocess.run(command, shell = True, check = True)

diff --git a/transcript_snp.py b/transcript_snp.py
@@ -0,0 +1,101 @@
+class TranscriptomicSNP(object):
+    """ Generate a VCF file with transcriptomic coordinates
+
+    Gets transcriptomic snp loci in format <transc> <SNPpos> <additional_info...>.
+
+    # available:
+    # <chrom> <transc> <start> <end>
+    # <chrom> <SNVpos>
+
+    """
+    def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version = True):
+        self.anno_path = anno_path
+        self.vcf_path = vcf_path
+        self.out_path = out_path
+        self.do_add_transcript_version = do_add_transcript_version
+
+    @property
+    def anno_path(self):
+        """Path to annotation GTF file"""
+        return self._anno_path
+
+    @anno_path.setter
+    def anno_path(self, value):
+        value = value.rstrip("/")
+        value = os.path.expanduser(value)
+        assert os.path.isfile(value), "File {} doesn't exist".format(value)
+        self._anno_path = value
+
+    @property
+    def vcf_path(self):
+        """Path to annotation VCF file"""
+        return self._vcf_path
+
+    @vcf_path.setter
+    def vcf_path(self, value):
+        value = value.rstrip("/")
+        value = os.path.expanduser(value)
+        assert os.path.isfile(value), "File {} doesn't exist".format(value)
+        self._vcf_path = value
+
+    @property
+    def out_path(self):
+        return self._out_path
+
+    @out_path.setter
+    def out_path(self, value):
+        value = rm_ext(value, "gz")
+        value = value.rstrip("/")
+        value = os.path.expanduser(value)
+        self._out_path = value
+
+    def get_transcript_snvs(self):
+        """Generate a VCF file with transcriptomic coordinates
+
+        Refernces:
+            https://docs.python.org/3/library/subprocess.html
+            https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed
+            http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
+        """
+
+        # Extract transcripts from GTF file, sort
+        _, transcripts_gtf = tempfile.mkstemp()
+        command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'.format(self.anno_path,transcripts_gtf)
+        _ = subprocess.run(command, check = True, shell = True)
+
+        # Get only SNVs from VCF, to save some memory/time
+        _, snv_vcf = tempfile.mkstemp()
+        command = 'grep -P "(^#|TSA=SNV)" {} > {}'.format(self.vcf_path, snv_vcf)
+        subprocess.run(command, check = True, shell = True)
+
+        # Get such transcripts, that have an overlap with a known SNV
+        # Exploits fact that both coordinates are chromosome-based
+        # writes information from both files next to each other
+        _, transcripts_vcf = tempfile.mkstemp()
+        command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}"
+        command = command.format(transcripts_gtf, snv_vcf, transcripts_vcf)
+        subprocess.run(command, check = True, shell = True)
+
+        # Pull out name of transcripts as obtained in previous step
+        _, transcripts = tempfile.mkstemp()
+        if self.do_add_transcript_version:
+            command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\"; "
+            command += "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}"
+        else:
+            command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\".*/\\1/' > {}"
+        command = command.format(transcripts_vcf, transcripts)
+        subprocess.run(command, check = True, shell = True)
+
+        # Combine transcript name with the SNV information
+        # Converts the coordinates to transcriptomic ones
+        command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}"
+        command = command.format(transcripts_vcf, transcripts, self.out_path)
+        subprocess.run(command, shell = True, check = True)
+
+        # Zip and index
+        command = "bgzip --force {0} && tabix -p vcf {0}.gz".format(self.out_path)
+        subprocess.run(command, shell = True, check = True)
+
+        # Cleanup
+        for f in [transcripts_gtf, snv_vcf, transcripts, transcripts_vcf]:
+            os.remove(f)
No results found