|
|
@@ -0,0 +1,182 @@ |
|
|
import numpy as np |
|
|
import patsy |
|
|
import pandas as pd |
|
|
from biom import Table |
|
|
|
|
|
# This is the main function |
|
|
def poisson_cat(table, metadata, category, ref=None): |
|
|
""" Poisson differential abundance. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
table : biom.Table |
|
|
Sparse matrix of counts. |
|
|
metadata : pd.DataFrame |
|
|
Sample metadata file. |
|
|
category : str |
|
|
Metadata category of interest. |
|
|
ref : str |
|
|
Reference category. If not specified, the first |
|
|
category will be picked. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
differential : pd.DataFrame |
|
|
Differentials of all of the variables of interest. |
|
|
|
|
|
Notes |
|
|
----- |
|
|
This only works on a single categorical variable. If you have continuous |
|
|
variables, stop it. |
|
|
|
|
|
This also assumes that the metadata and table have already been matched and |
|
|
aligned. |
|
|
|
|
|
TODO |
|
|
---- |
|
|
Extend this to multiple metadata categories, so that multiple columns |
|
|
can be analyzed simultaneously |
|
|
""" |
|
|
if ref is None: |
|
|
ref = metadata[category].unique()[0] |
|
|
|
|
|
idx = metadata[category] == ref |
|
|
x = idx.astype(np.int64).reshape(-1, 1) |
|
|
y = table.matrix_data |
|
|
|
|
|
A = y @ ~x # reference microbe counts |
|
|
B = table.sum(axis='observation') |
|
|
N = table.sum(axis='sample') |
|
|
C = N[idx].sum() |
|
|
D = N[~idx].sum() |
|
|
|
|
|
diff = np.log((A * B * C) / (B * B * D - A)) |
|
|
return diff |
|
|
|
|
|
# This is for testing |
|
|
|
|
|
def random_block_table(reps, n_species, |
|
|
species_mean=0, |
|
|
species_var=1., |
|
|
effect_size=1, |
|
|
library_size=10000, |
|
|
microbe_total=100000, microbe_kappa=0.3, |
|
|
microbe_tau=0.1, sigma=0.5, seed=None): |
|
|
""" Differential abundance analysis benchmarks. |
|
|
|
|
|
The simulation here consists of 3 parts |
|
|
|
|
|
Step 1: generate class probabilities using logistic distribution |
|
|
Step 2: generate coefficients from normal distributions |
|
|
Step 3: generate counts from species distributions |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
reps : int |
|
|
Number of replicate samples per test. |
|
|
n_species : int |
|
|
Number of species. |
|
|
species_loc : float |
|
|
Mean of the species prior. |
|
|
species_variance : float |
|
|
Variance of species log-fold differences |
|
|
effect_size : int |
|
|
The effect size difference between the feature abundances. |
|
|
n_contaminants : int |
|
|
Number of contaminant species. |
|
|
sigma: float |
|
|
Logistic error variance for class probabilities |
|
|
library_size : np.array |
|
|
A vector specifying the library sizes per sample. |
|
|
template : np.array |
|
|
A vector specifying feature abundances or relative proportions. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
generator of |
|
|
pd.DataFrame |
|
|
Ground truth tables. |
|
|
pd.DataFrame |
|
|
Metadata group categories, n_diff and effect_size |
|
|
pd.Series |
|
|
Species actually differentially abundant. |
|
|
""" |
|
|
state = check_random_state(seed) |
|
|
data = [] |
|
|
|
|
|
n = reps * 2 |
|
|
k = 2 |
|
|
labels = np.array([-effect_size] * (n // 2) + [effect_size] * (n // 2)) |
|
|
eps = np.random.logistic(loc=0, scale=sigma, size=n) |
|
|
class_probs = labels + eps |
|
|
|
|
|
X = np.hstack((np.ones((n, 1)), class_probs.reshape(-1, 1))) |
|
|
B = np.random.normal(loc=species_mean, scale=species_var, size=(k, n_species)) |
|
|
|
|
|
## Helper functions |
|
|
# Convert microbial abundances to counts |
|
|
def to_counts_f(x): |
|
|
n = state.lognormal(np.log(library_size), microbe_tau) |
|
|
p = x / x.sum() |
|
|
return state.poisson(state.lognormal(np.log(n*p), microbe_kappa)) |
|
|
|
|
|
o_ids = ['F%d' % i for i in range(n_species)] |
|
|
s_ids = ['S%d' % i for i in range(n)] |
|
|
|
|
|
abs_table = pd.DataFrame(np.exp(X @ B) * microbe_total, |
|
|
index=s_ids, |
|
|
columns=o_ids) |
|
|
|
|
|
rel_data = np.vstack(abs_table.apply(to_counts_f, axis=1)) |
|
|
|
|
|
rel_table = pd.DataFrame(rel_data, |
|
|
index=s_ids, |
|
|
columns=o_ids) |
|
|
|
|
|
metadata = pd.DataFrame({'labels': labels}) |
|
|
metadata['effect_size'] = effect_size |
|
|
metadata['microbe_total'] = microbe_total |
|
|
metadata['class_logits'] = class_probs |
|
|
metadata['intercept'] = 1 |
|
|
metadata.index = s_ids |
|
|
|
|
|
ground_truth = pd.DataFrame({ |
|
|
'intercept': B[0, :], |
|
|
'categorical': B[1, :] |
|
|
}, index=o_ids) |
|
|
|
|
|
return abs_table, rel_table, metadata, ground_truth |
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
import unittest |
|
|
|
|
|
class TestPoissonCat(unittest.TestCase): |
|
|
|
|
|
def setUp(self): |
|
|
num_samples = 100 |
|
|
reps = 50 |
|
|
n_species = 200 |
|
|
np.random.seed(2) |
|
|
self.res = random_block_table( |
|
|
reps, n_species, |
|
|
species_mean=0, |
|
|
species_var=1, |
|
|
microbe_kappa=0.7, |
|
|
microbe_tau=0.7, |
|
|
library_size=10000, |
|
|
microbe_total=100000, |
|
|
effect_size=1) |
|
|
|
|
|
def test_poisson_cat(self): |
|
|
abs_table, rel_table, metadata, ground_truth = self.res |
|
|
table = Table(rel_table.T, rel_table.columns, rel_table.index) |
|
|
res_diff = poisson_cat(table, metadata, category='label') |
|
|
exp_diff = ground_truth['categorical'] |
|
|
# This may not work -- prepare to debug |
|
|
from scipy.stats import pearsonr |
|
|
r, p = pearsonr(res_diff, exp_diff) |
|
|
self.assertGreater(r, 0.5) |
|
|
self.assertLess(p, 0.01) |
|
|
|
|
|
unittest.main() |