Skip to content

Instantly share code, notes, and snippets.

@aaronsaunders
Created October 27, 2013 20:25
Show Gist options
  • Save aaronsaunders/7187482 to your computer and use it in GitHub Desktop.
Save aaronsaunders/7187482 to your computer and use it in GitHub Desktop.

Revisions

  1. Aaron Marc Saunders revised this gist Oct 27, 2013. No changes.
  2. Aaron Marc Saunders created this gist Oct 27, 2013.
    110 changes: 110 additions & 0 deletions biofileformat.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,110 @@
    """
    Mappings from file extensions to biopython types.
    Copied from Erik Matsens seqmagick https://github.com/fhcrc/seqmagick/
    """
    import argparse
    import contextlib
    import copy
    import functools
    import os
    import os.path
    import signal
    import sys
    import tempfile

    import bz2
    import gzip
    import os.path
    import sys

    # Define mappings in a dictionary with extension : BioPython_file_type.
    EXTENSION_TO_TYPE = {'.aln': 'clustal',
    '.afa': 'fasta',
    '.fa': 'fasta',
    '.faa': 'fasta',
    '.fas': 'fasta',
    '.fasta': 'fasta',
    '.fastq': 'fastq',
    '.fq': 'fastq',
    '.ffn': 'fasta',
    '.fna': 'fasta',
    '.frn': 'fasta',
    '.gb': 'genbank',
    '.gbk': 'genbank',
    '.needle': 'emboss',
    '.nex': 'nexus',
    '.phy': 'phylip',
    '.phylip': 'phylip',
    '.phyx': 'phylip-relaxed',
    '.qual': 'qual',
    '.sff': 'sff-trim',
    '.sth': 'stockholm',
    '.sto': 'stockholm',}

    COMPRESS_EXT = {'.bz2': bz2.BZ2File, '.gz': gzip.open, '.bz': bz2.BZ2File}


    class UnknownExtensionError(ValueError):
    pass


    def from_extension(extension):
    """
    Look up the BioPython file type corresponding with input extension.
    Look up is case insensitive.
    """
    if not extension.startswith('.'):
    raise ValueError("Extensions must begin with a period.")
    try:
    return EXTENSION_TO_TYPE[extension.lower()]
    except KeyError:
    raise UnknownExtensionError("seqmagick does not know how to handle " +
    "files with extensions like this: " + extension)


    def from_filename(file_name):
    """
    Look up the BioPython file type corresponding to an input file name.
    """
    base, extension = os.path.splitext(file_name)
    if extension in COMPRESS_EXT:
    # Compressed file
    extension = os.path.splitext(base)[1]
    return from_extension(extension)

    def from_handle(fh, stream_default='fasta'):
    """
    Look up the BioPython file type corresponding to a file-like object.
    For stdin, stdout, and stderr, ``stream_default`` is used.
    """
    if fh in (sys.stdin, sys.stdout, sys.stderr):
    return stream_default
    return from_filename(fh.name)


    class FileType(object):
    """
    Near clone of argparse.FileType, supporting gzip and bzip
    """
    def __init__(self, mode='r'):
    self.mode = mode
    self.ext_map = COMPRESS_EXT.copy()

    def _get_handle(self, file_path):
    ext = os.path.splitext(file_path)[1].lower()
    return self.ext_map.get(ext, open)(file_path, self.mode)

    def __call__(self, string):
    if string == '-':
    if 'r' in self.mode:
    return sys.stdin
    elif 'w' in self.mode:
    return sys.stdout
    else:
    raise ValueError("Invalid mode: {0}".format(string))
    else:
    return self._get_handle(string)
    72 changes: 72 additions & 0 deletions plot_lengths.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,72 @@
    from Bio.SeqIO import parse
    from toolbox import biofileformat
    import matplotlib.pyplot as plt
    import sys
    from collections import Counter
    import fileinput

    def get_lengths(fname, sub=False):
    """
    Parses a sequence file and returns a list of sequence lengths
    """
    with biofileformat.FileType('rb')(fname) as fh:
    seqformat = biofileformat.from_handle(fh)

    okformats = [ "fasta", "fastq" ]
    if seqformat not in okformats:
    print "takes only fasta/fastq w/wo compression"
    return

    if sub:
    lengths = []
    for n, rec in enumerate(parse(fh, seqformat)):
    sizes.append(len(rec))
    if n == (sub - 1):
    break
    else:
    lengths = [len(rec) for rec in parse(fh, seqformat)]

    return lengths

    def draw_histogram(lengths):
    """
    Parses a sequence file and returns a plot of sequence lengths.
    Optional argument to subset the file.
    """
    plt.hist(lengths)
    plt.title("%s\n%i sequences, range %i to %i bp" \
    % (fname, len(lengths), min(lengths),max(lengths)))
    plt.xlabel("Sequence length (bp)")
    plt.ylabel("Count")
    plt.show()

    return


    def plot_seqlength(fname, sub=False):
    draw_histogram(get_lengths(fname, sub=False))
    return

    def main(args):
    usage = "Usage: plot_lengths.py infile"

    if args[0] == "-h":
    print usage
    sys.exit()

    for fname in args:
    lengths = get_lengths(fname)
    counts = Counter()
    for l in lengths:
    counts[l] += 1
    outlines = [ '{}\t{}'.format( key, counts[key])
    for key in sorted(counts.keys()) ]

    sys.stdout.write('length\tcount\n')
    sys.stdout.write('\n'.join(outlines))
    sys.stdout.write('\n')

    if __name__ == '__main__':
    main(sys.argv[1:])