#!/usr/bin/env python

from sys import argv

from operator import itemgetter
from scipy.stats import rankdata
from numpy import log

from biom import load_table

""" From the figure 1 approach in http://jem.rupress.org/content/209/2/365
    Rank = # of times sequence appears
    Frequency of rank = how many of a given rank # of present, e.g., should be many rank 1
    
    implication is that those with high rank will be active pool of B-cells.
    
    """

otu_table = load_table(argv[1])

ids = otu_table._sample_ids
obs = otu_table._observation_ids

exceptions = ["WT.sp4", "WT.LI4"] # outlier


per_id_outfs = [x + ".txt" for x in ids]

id_outf = []

for f in per_id_outfs:
    id_outf.append(open(f, "w"))

 
combined_outf = open("combined_data.txt", "w")
combined_filtered_outf = open("combined_data_no4.txt", "w")

combined_counts = {}
combined_counts_filtered = {}

for curr_otu in obs:
    count = 0
    count_filtered = 0
    for id in ids:
        curr_val = otu_table.get_value_by_ids(curr_otu, id)
        count += curr_val
        if id not in exceptions:
            count_filtered += curr_val
    # Skip if zero count OTU, e.g. zero after rarefaction
    if count > 0:
        try:
            combined_counts[count] += 1
        except KeyError:
            combined_counts[count] = 1
    if count_filtered > 0:
        try:
            combined_counts_filtered[count_filtered] += 1
        except KeyError:
            combined_counts_filtered[count_filtered] = 1
        
id_dicts = {}
for id in ids:
    id_dicts[id] = {}
        
for id in ids:

    for curr_otu in obs:
        curr_val = otu_table.get_value_by_ids(curr_otu, id)
        if curr_val > 0:
            try:
                id_dicts[id][curr_val] += 1
            except KeyError:
                id_dicts[id][curr_val] = 1
                
sorted_per_sample = []

for id in ids:
    curr_l = sorted(id_dicts[id].iteritems(), key=itemgetter(0), reverse=False)
    sorted_per_sample.append(curr_l)
    

combined_counts_sorted = sorted(combined_counts.iteritems(), key=itemgetter(0), reverse=False)
combined_counts_filtered_sorted = sorted(combined_counts_filtered.iteritems(), key=itemgetter(0), reverse=False)

combined_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n")
for x in combined_counts_sorted:
    combined_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1])))

combined_filtered_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n")
for x in combined_counts_filtered_sorted:
    combined_filtered_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1])))
    
for f in id_outf:
    f.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n")
    
for f in range(len(id_outf)):
    for x in sorted_per_sample[f]:
        id_outf[f].write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1])))