Skip to content

Instantly share code, notes, and snippets.

@martinholub
Last active October 10, 2018 11:18
Show Gist options
  • Select an option

  • Save martinholub/2dbb4cfcb28d12aed0e0302afc58cb57 to your computer and use it in GitHub Desktop.

Select an option

Save martinholub/2dbb4cfcb28d12aed0e0302afc58cb57 to your computer and use it in GitHub Desktop.

Revisions

  1. martinholub revised this gist Oct 10, 2018. 1 changed file with 5 additions and 0 deletions.
    5 changes: 5 additions & 0 deletions transcript_snp.py
    Original file line number Diff line number Diff line change
    @@ -88,6 +88,11 @@ def get_transcript_snvs(self):

    # Combine transcript name with the SNV information
    # Converts the coordinates to transcriptomic ones

    # `$11-$4+1` computes coordinate of SNV along transcript,where
    # $4 start of transcript and $11 postion of SNV both in chromosme-based coords.
    # transcripts file carries information on transcript
    # columns $12..$17 carry information on the SNV itself
    command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}"
    command = command.format(transcripts_vcf, transcripts, self.out_path)
    subprocess.run(command, shell = True, check = True)
  2. martinholub created this gist Oct 10, 2018.
    101 changes: 101 additions & 0 deletions transcript_snp.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,101 @@
    class TranscriptomicSNP(object):
    """ Generate a VCF file with transcriptomic coordinates
    Gets transcriptomic snp loci in format <transc> <SNPpos> <additional_info...>.
    # available:
    # <chrom> <transc> <start> <end>
    # <chrom> <SNVpos>
    """
    def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version = True):
    self.anno_path = anno_path
    self.vcf_path = vcf_path
    self.out_path = out_path
    self.do_add_transcript_version = do_add_transcript_version

    @property
    def anno_path(self):
    """Path to annotation GTF file"""
    return self._anno_path

    @anno_path.setter
    def anno_path(self, value):
    value = value.rstrip("/")
    value = os.path.expanduser(value)
    assert os.path.isfile(value), "File {} doesn't exist".format(value)
    self._anno_path = value

    @property
    def vcf_path(self):
    """Path to annotation VCF file"""
    return self._vcf_path

    @vcf_path.setter
    def vcf_path(self, value):
    value = value.rstrip("/")
    value = os.path.expanduser(value)
    assert os.path.isfile(value), "File {} doesn't exist".format(value)
    self._vcf_path = value

    @property
    def out_path(self):
    return self._out_path

    @out_path.setter
    def out_path(self, value):
    value = rm_ext(value, "gz")
    value = value.rstrip("/")
    value = os.path.expanduser(value)
    self._out_path = value

    def get_transcript_snvs(self):
    """Generate a VCF file with transcriptomic coordinates
    Refernces:
    https://docs.python.org/3/library/subprocess.html
    https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed
    http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
    """

    # Extract transcripts from GTF file, sort
    _, transcripts_gtf = tempfile.mkstemp()
    command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'.format(self.anno_path,transcripts_gtf)
    _ = subprocess.run(command, check = True, shell = True)

    # Get only SNVs from VCF, to save some memory/time
    _, snv_vcf = tempfile.mkstemp()
    command = 'grep -P "(^#|TSA=SNV)" {} > {}'.format(self.vcf_path, snv_vcf)
    subprocess.run(command, check = True, shell = True)

    # Get such transcripts, that have an overlap with a known SNV
    # Exploits fact that both coordinates are chromosome-based
    # writes information from both files next to each other
    _, transcripts_vcf = tempfile.mkstemp()
    command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}"
    command = command.format(transcripts_gtf, snv_vcf, transcripts_vcf)
    subprocess.run(command, check = True, shell = True)

    # Pull out name of transcripts as obtained in previous step
    _, transcripts = tempfile.mkstemp()
    if self.do_add_transcript_version:
    command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\"; "
    command += "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}"
    else:
    command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\".*/\\1/' > {}"
    command = command.format(transcripts_vcf, transcripts)
    subprocess.run(command, check = True, shell = True)

    # Combine transcript name with the SNV information
    # Converts the coordinates to transcriptomic ones
    command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}"
    command = command.format(transcripts_vcf, transcripts, self.out_path)
    subprocess.run(command, shell = True, check = True)

    # Zip and index
    command = "bgzip --force {0} && tabix -p vcf {0}.gz".format(self.out_path)
    subprocess.run(command, shell = True, check = True)

    # Cleanup
    for f in [transcripts_gtf, snv_vcf, transcripts, transcripts_vcf]:
    os.remove(f)