fedarko · April 17, 2019 22:18 · Apr 11, 2019 · Apr 11, 2019 · Apr 11, 2019 · Apr 11, 2019
diff --git a/poisson-cat.py b/poisson-cat.py
@@ -50,9 +50,11 @@ def poisson_cat(table, metadata, category, ref=None):
     C = N[idx].sum()
     D = N[~idx].sum()
 
-    diff = np.log((A * B * C)  / (B * B * D - A))
+    diff = pd.Series(np.log((A * B * C)  / (B * B * D - A)),
+                     table.ids(axis='observation'))
     return diff
 
+
 # This is for testing
 
 def random_block_table(reps, n_species,

diff --git a/poisson-cat → poisson-cat.py b/poisson-cat → poisson-cat.py
diff --git a/poisson-cate → poisson-cat b/poisson-cate → poisson-cat
diff --git a/poisson-cate b/poisson-cate
@@ -0,0 +1,182 @@
+import numpy as np
+import patsy
+import pandas as pd
+from biom import Table
+
+# This is the main function
+def poisson_cat(table, metadata, category, ref=None):
+    """ Poisson differential abundance.
+
+    Parameters
+    ----------
+    table : biom.Table
+        Sparse matrix of counts.
+    metadata : pd.DataFrame
+        Sample metadata file.
+    category : str
+        Metadata category of interest.
+    ref : str
+        Reference category.  If not specified, the first
+        category will be picked.
+
+    Returns
+    -------
+    differential : pd.DataFrame
+        Differentials of all of the variables of interest.
+
+    Notes
+    -----
+    This only works on a single categorical variable.  If you have continuous
+    variables, stop it.
+
+    This also assumes that the metadata and table have already been matched and
+    aligned.
+
+    TODO
+    ----
+    Extend this to multiple metadata categories, so that multiple columns
+    can be analyzed simultaneously
+    """
+    if ref is None:
+        ref = metadata[category].unique()[0]
+
+    idx = metadata[category] == ref
+    x = idx.astype(np.int64).reshape(-1, 1)
+    y = table.matrix_data
+
+    A = y @ ~x  # reference microbe counts
+    B = table.sum(axis='observation')
+    N = table.sum(axis='sample')
+    C = N[idx].sum()
+    D = N[~idx].sum()
+
+    diff = np.log((A * B * C)  / (B * B * D - A))
+    return diff
+
+# This is for testing
+
+def random_block_table(reps, n_species,
+                       species_mean=0,
+                       species_var=1.,
+                       effect_size=1,
+                       library_size=10000,
+                       microbe_total=100000, microbe_kappa=0.3,
+                       microbe_tau=0.1, sigma=0.5, seed=None):
+    """ Differential abundance analysis benchmarks.
+
+    The simulation here consists of 3 parts
+
+    Step 1: generate class probabilities using logistic distribution
+    Step 2: generate coefficients from normal distributions
+    Step 3: generate counts from species distributions
+
+    Parameters
+    ----------
+    reps : int
+        Number of replicate samples per test.
+    n_species : int
+        Number of species.
+    species_loc : float
+        Mean of the species prior.
+    species_variance : float
+        Variance of species log-fold differences
+    effect_size : int
+        The effect size difference between the feature abundances.
+    n_contaminants : int
+       Number of contaminant species.
+    sigma: float
+        Logistic error variance for class probabilities
+    library_size : np.array
+        A vector specifying the library sizes per sample.
+    template : np.array
+        A vector specifying feature abundances or relative proportions.
+
+    Returns
+    -------
+    generator of
+        pd.DataFrame
+           Ground truth tables.
+        pd.DataFrame
+           Metadata group categories, n_diff and effect_size
+        pd.Series
+           Species actually differentially abundant.
+    """
+    state = check_random_state(seed)
+    data = []
+
+    n = reps * 2
+    k = 2
+    labels = np.array([-effect_size] * (n // 2) + [effect_size] * (n // 2))
+    eps = np.random.logistic(loc=0, scale=sigma, size=n)
+    class_probs = labels + eps
+
+    X = np.hstack((np.ones((n, 1)), class_probs.reshape(-1, 1)))
+    B = np.random.normal(loc=species_mean, scale=species_var, size=(k, n_species))
+
+    ## Helper functions
+    # Convert microbial abundances to counts
+    def to_counts_f(x):
+        n = state.lognormal(np.log(library_size), microbe_tau)
+        p = x / x.sum()
+        return state.poisson(state.lognormal(np.log(n*p), microbe_kappa))
+
+    o_ids = ['F%d' % i for i in range(n_species)]
+    s_ids = ['S%d' % i for i in range(n)]
+
+    abs_table = pd.DataFrame(np.exp(X @ B) * microbe_total,
+                             index=s_ids,
+                             columns=o_ids)
+
+    rel_data = np.vstack(abs_table.apply(to_counts_f, axis=1))
+
+    rel_table = pd.DataFrame(rel_data,
+                             index=s_ids,
+                             columns=o_ids)
+
+    metadata = pd.DataFrame({'labels': labels})
+    metadata['effect_size'] = effect_size
+    metadata['microbe_total'] = microbe_total
+    metadata['class_logits'] = class_probs
+    metadata['intercept'] = 1
+    metadata.index = s_ids
+
+    ground_truth = pd.DataFrame({
+        'intercept': B[0, :],
+        'categorical': B[1, :]
+    }, index=o_ids)
+
+    return abs_table, rel_table, metadata, ground_truth
+
+if __name__ == "__main__":
+
+    import unittest
+
+    class TestPoissonCat(unittest.TestCase):
+
+        def setUp(self):
+            num_samples = 100
+            reps = 50
+            n_species = 200
+            np.random.seed(2)
+            self.res = random_block_table(
+                reps, n_species,
+                species_mean=0,
+                species_var=1,
+                microbe_kappa=0.7,
+                microbe_tau=0.7,
+                library_size=10000,
+                microbe_total=100000,
+                effect_size=1)
+
+        def test_poisson_cat(self):
+            abs_table, rel_table, metadata, ground_truth = self.res
+            table = Table(rel_table.T, rel_table.columns, rel_table.index)
+            res_diff = poisson_cat(table, metadata, category='label')
+            exp_diff = ground_truth['categorical']
+            # This may not work -- prepare to debug
+            from scipy.stats import pearsonr
+            r, p = pearsonr(res_diff, exp_diff)
+            self.assertGreater(r, 0.5)
+            self.assertLess(p, 0.01)
+
+    unittest.main()
No results found