#!/usr/bin/env python from sys import argv from operator import itemgetter from scipy.stats import rankdata from numpy import log from biom import load_table """ From the figure 1 approach in http://jem.rupress.org/content/209/2/365 Rank = # of times sequence appears Frequency of rank = how many of a given rank # of present, e.g., should be many rank 1 implication is that those with high rank will be active pool of B-cells. """ otu_table = load_table(argv[1]) ids = otu_table._sample_ids obs = otu_table._observation_ids exceptions = ["WT.sp4", "WT.LI4"] # outlier per_id_outfs = [x + ".txt" for x in ids] id_outf = [] for f in per_id_outfs: id_outf.append(open(f, "w")) combined_outf = open("combined_data.txt", "w") combined_filtered_outf = open("combined_data_no4.txt", "w") combined_counts = {} combined_counts_filtered = {} for curr_otu in obs: count = 0 count_filtered = 0 for id in ids: curr_val = otu_table.get_value_by_ids(curr_otu, id) count += curr_val if id not in exceptions: count_filtered += curr_val # Skip if zero count OTU, e.g. zero after rarefaction if count > 0: try: combined_counts[count] += 1 except KeyError: combined_counts[count] = 1 if count_filtered > 0: try: combined_counts_filtered[count_filtered] += 1 except KeyError: combined_counts_filtered[count_filtered] = 1 id_dicts = {} for id in ids: id_dicts[id] = {} for id in ids: for curr_otu in obs: curr_val = otu_table.get_value_by_ids(curr_otu, id) if curr_val > 0: try: id_dicts[id][curr_val] += 1 except KeyError: id_dicts[id][curr_val] = 1 sorted_per_sample = [] for id in ids: curr_l = sorted(id_dicts[id].iteritems(), key=itemgetter(0), reverse=False) sorted_per_sample.append(curr_l) combined_counts_sorted = sorted(combined_counts.iteritems(), key=itemgetter(0), reverse=False) combined_counts_filtered_sorted = sorted(combined_counts_filtered.iteritems(), key=itemgetter(0), reverse=False) combined_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n") for x in combined_counts_sorted: combined_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1]))) combined_filtered_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n") for x in combined_counts_filtered_sorted: combined_filtered_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1]))) for f in id_outf: f.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n") for f in range(len(id_outf)): for x in sorted_per_sample[f]: id_outf[f].write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1])))