Created
October 27, 2013 20:25
-
-
Save aaronsaunders/7187482 to your computer and use it in GitHub Desktop.
Revisions
-
Aaron Marc Saunders revised this gist
Oct 27, 2013 . No changes.There are no files selected for viewing
-
Aaron Marc Saunders created this gist
Oct 27, 2013 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,110 @@ """ Mappings from file extensions to biopython types. Copied from Erik Matsens seqmagick https://github.com/fhcrc/seqmagick/ """ import argparse import contextlib import copy import functools import os import os.path import signal import sys import tempfile import bz2 import gzip import os.path import sys # Define mappings in a dictionary with extension : BioPython_file_type. EXTENSION_TO_TYPE = {'.aln': 'clustal', '.afa': 'fasta', '.fa': 'fasta', '.faa': 'fasta', '.fas': 'fasta', '.fasta': 'fasta', '.fastq': 'fastq', '.fq': 'fastq', '.ffn': 'fasta', '.fna': 'fasta', '.frn': 'fasta', '.gb': 'genbank', '.gbk': 'genbank', '.needle': 'emboss', '.nex': 'nexus', '.phy': 'phylip', '.phylip': 'phylip', '.phyx': 'phylip-relaxed', '.qual': 'qual', '.sff': 'sff-trim', '.sth': 'stockholm', '.sto': 'stockholm',} COMPRESS_EXT = {'.bz2': bz2.BZ2File, '.gz': gzip.open, '.bz': bz2.BZ2File} class UnknownExtensionError(ValueError): pass def from_extension(extension): """ Look up the BioPython file type corresponding with input extension. Look up is case insensitive. """ if not extension.startswith('.'): raise ValueError("Extensions must begin with a period.") try: return EXTENSION_TO_TYPE[extension.lower()] except KeyError: raise UnknownExtensionError("seqmagick does not know how to handle " + "files with extensions like this: " + extension) def from_filename(file_name): """ Look up the BioPython file type corresponding to an input file name. """ base, extension = os.path.splitext(file_name) if extension in COMPRESS_EXT: # Compressed file extension = os.path.splitext(base)[1] return from_extension(extension) def from_handle(fh, stream_default='fasta'): """ Look up the BioPython file type corresponding to a file-like object. For stdin, stdout, and stderr, ``stream_default`` is used. """ if fh in (sys.stdin, sys.stdout, sys.stderr): return stream_default return from_filename(fh.name) class FileType(object): """ Near clone of argparse.FileType, supporting gzip and bzip """ def __init__(self, mode='r'): self.mode = mode self.ext_map = COMPRESS_EXT.copy() def _get_handle(self, file_path): ext = os.path.splitext(file_path)[1].lower() return self.ext_map.get(ext, open)(file_path, self.mode) def __call__(self, string): if string == '-': if 'r' in self.mode: return sys.stdin elif 'w' in self.mode: return sys.stdout else: raise ValueError("Invalid mode: {0}".format(string)) else: return self._get_handle(string) This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,72 @@ from Bio.SeqIO import parse from toolbox import biofileformat import matplotlib.pyplot as plt import sys from collections import Counter import fileinput def get_lengths(fname, sub=False): """ Parses a sequence file and returns a list of sequence lengths """ with biofileformat.FileType('rb')(fname) as fh: seqformat = biofileformat.from_handle(fh) okformats = [ "fasta", "fastq" ] if seqformat not in okformats: print "takes only fasta/fastq w/wo compression" return if sub: lengths = [] for n, rec in enumerate(parse(fh, seqformat)): sizes.append(len(rec)) if n == (sub - 1): break else: lengths = [len(rec) for rec in parse(fh, seqformat)] return lengths def draw_histogram(lengths): """ Parses a sequence file and returns a plot of sequence lengths. Optional argument to subset the file. """ plt.hist(lengths) plt.title("%s\n%i sequences, range %i to %i bp" \ % (fname, len(lengths), min(lengths),max(lengths))) plt.xlabel("Sequence length (bp)") plt.ylabel("Count") plt.show() return def plot_seqlength(fname, sub=False): draw_histogram(get_lengths(fname, sub=False)) return def main(args): usage = "Usage: plot_lengths.py infile" if args[0] == "-h": print usage sys.exit() for fname in args: lengths = get_lengths(fname) counts = Counter() for l in lengths: counts[l] += 1 outlines = [ '{}\t{}'.format( key, counts[key]) for key in sorted(counts.keys()) ] sys.stdout.write('length\tcount\n') sys.stdout.write('\n'.join(outlines)) sys.stdout.write('\n') if __name__ == '__main__': main(sys.argv[1:])