"""
Having a 3 x (3) Mixed Design: 3 conditions as BETWEEN factor, 3 blocks as WITHIN factor.
There are multiple measurements in each block (trials).
There are 2 dependent variables: parallel & orthogonal
Since 'parallel' and 'orthogonal' measures have the same unit, 
it can be viewed as 1 outcome called 'projection' in 2 different directions.

The interest here is in the VARIABILITY (e.g. standard deviation or variance) of the data 
for parallel & orthogonal outcomes for blocks 1, 2, and 3.
The analysis should enable to make statements about whether conditions, blocks, direction 
have an effect on the outcome.

Bonus:
Each user has an individual level of experience in the task.
Users rate the difficulty of each block they performed.
"""
# %%
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import pymc3 as pm
import seaborn as sns
import theano.tensor as tt


#################################
#          Dummy Data           #
#################################
# %%
n_trials = 30  # by block.
n_users = 30
n_obs = 3 * n_trials * n_users

df = pd.DataFrame({'user': np.random.randint(n_users, size=n_obs),
                   'block': 1 + np.random.randint(3, size=n_obs),
                   'parallel': np.random.normal(0.0, 1, size=n_obs),
                   'orthogonal': np.random.normal(0.0, 1, size=n_obs)})

df.sort_values(by=['user', 'block'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Assign each user to a condition.
df['condition'] = df['user'].map(pd.Series(index=df['user'].unique(), data=np.random.choice(['A',  'B', 'C'], size=df['user'].nunique())))
# Experience for each user.
df['experience'] = df['user'].map(pd.Series(index=df['user'].unique(), data=np.random.choice([np.nan, 0, 1, 2, 3, 4], size=df['user'].nunique())))
# Give each block its own ID.
df['block_id'] = pd.factorize(df['user'].astype(str) + df['block'].astype(str))[0]
# Trials.
df['trial'] = df.groupby(['block_id'])['block'].transform(lambda x: np.arange(len(x)))
df = df[df['trial'] < n_trials]
# Categorical columns.
df[['user', 'condition', 'block_id', 'block']] = df[['user', 'condition', 'block_id', 'block']].astype('category')
# Give each block_id a rating.
df['rating'] = df['block_id'].map(pd.Series(index=df['block_id'].unique(), data=np.random.randint(5, size=df['block_id'].cat.categories.size)))
df[['parallel', 'orthogonal']] = df[['parallel', 'orthogonal']].astype(theano.config.floatX)

# Thin out rows to simulate more missing data.
mask = np.random.choice([*[True]*4, False], size=df.shape[0])
df = df[mask]
n_obs = len(df)
# Make sure mean of each block is zero as with real data.
df[['parallel', 'orthogonal']] = df.groupby(['user', 'block'])[['parallel', 'orthogonal']].transform(lambda x: x - x.mean())

# Scale block deviations.
scale_para = (9-3) * np.random.random_sample(df['block_id'].nunique()) + 3
scale_ortho = (7-1.5) * np.random.random_sample(df['block_id'].nunique()) + 1.5
scale = np.stack((scale_para, scale_ortho)).T
df[['parallel', 'orthogonal']] = df.groupby(['block_id'])['parallel', 'orthogonal'].apply(lambda x: x * scale[x.name])

# %%
# Reshape data into long format.
samples = df.melt(id_vars=['user', 'experience', 'condition', 'block_id', 'block', 'rating', 'trial'],
                  value_vars=['parallel', 'orthogonal'],
                  var_name='direction', value_name='projection')
samples['direction'] = samples['direction'].astype('category')

# %%
# I'm interested in the variance for each direction. The mean of the squared projection by block_id is that block's variance.
samples['log_projection_sq'] = samples['projection'].transform('square').transform('log')

# %%
# Vizualize the data.
g = sns.FacetGrid(samples, col='block', row='user', hue="direction")
g.despine(offset=10)
g = (g.map(sns.distplot, "log_projection_sq").add_legend(title="Projection", loc='upper right'))
for ax in g.axes.flat:
    ax.set_ylabel("Probability Density")
    ax.set_xlabel("log(Projection Length^2)")
plt.tight_layout()
plt.show()

# %%
#################################
#          Coordinates          #
#################################
conditions = samples['condition'].cat.categories.tolist()
n_conditions = samples['condition'].nunique()
condition_indices = df['condition'].cat.codes.values # Condition index for every single value.
condition_lookup = pd.Series(*pd.factorize(samples['condition'].cat.categories), name='condition_idx_lookup')

# We need to be able to index by user. But the user IDs of the sample don't start at 0. We create an index and a mapping between them.
users = samples['user'].cat.categories.tolist()
n_users = samples['user'].nunique()
user_indices = samples['user'].cat.codes.values  # User index for every single value.
user_lookup = pd.Series(*pd.factorize(samples['user'].cat.categories), name='user_idx_lookup')
# User level predictors. More options: age group, gender.
# gaming_exp is not on an interval scale (no equidistance between items), but ordinal.
experience = samples.drop_duplicates(subset="user")['experience']  # Shape = user_indices

# Block indices for 1,2,3.
blocks = samples['block'].cat.categories.tolist()
n_blocks = samples['block'].nunique()  # Same as using cat.categories.size
block_indices = samples['block'].cat.codes.values # Block index for every single value.
block_lookup = pd.Series(*pd.factorize(samples['block'].cat.categories), name='block_idx_lookup')
# Block ID indices. Shape = n_users * n_blocks.
block_ids = samples['block_id'].cat.categories.tolist()
n_block_ids = samples['block_id'].nunique()
block_id_indices = samples['block_id'].cat.codes.values # Block index for every single value.
block_id_lookup = pd.Series(*pd.factorize(samples['block_id'].cat.categories), name='block_id_idx_lookup')
# Block level predictors.
# Note: Ratings are correlated with block index.
ratings = samples.drop_duplicates(subset=["user", 'block'])['rating']  # Shape = block_id_indices

# Coded direction of projection: 0 = orthogonal, 1 = parallel.
directions = samples['direction'].cat.categories.tolist()
n_directions = samples['direction'].nunique()
direction_indices = samples['direction'].cat.codes.values
direction_lookup = pd.Series(*pd.factorize(samples['direction'].cat.categories), name='direction_idx_lookup')

coords = {'Direction': directions, 'Condition': conditions, 'User' :users, 'Block': blocks, 'Block_ID': block_ids, 'obs_id': np.arange(direction_indices
.size)}

#################################
#       Multi-Level Model       #
#################################
# %%
# Fixed effects. Commonly denoted as X.
X = patsy.dmatrix('1 + block * direction', samples, return_type='dataframe')
X = np.asarray(X)

# %%
# Design matrix for the random effects. Intercept and slope are modelled separately to have more control on the prior.
Z_intercept = patsy.dmatrix('0 + user', data=samples, return_type='dataframe')
Z_intercept = np.asarray(Z_intercept)

Z_slope = patsy.dmatrix('0 + user:direction', data=samples, return_type='dataframe')
Z_slope = np.asarray(Z_slope)

Z = np.concatenate((Z_intercept, Z_slope), axis=1)

print('Z_intercept has shape ({}, {})'.format(*Z_intercept.shape))
print('Z_slope has shape ({}, {})'.format(*Z_slope.shape))
print('Z has shape ({}, {})'.format(*Z.shape))

# %%
Y = np.asarray(samples['log_projection_sq'])

# %%
with pm.Model(coords=coords) as hlm_model:
    ## Fixed effect
    beta_X_intercept = pm.Normal('beta_X_ortho', mu=0, sd=10) # contrain it to positive values
    beta_X_slope_b2 = pm.Normal('beta_X_b2_ortho_offset', mu=0, sd=10)
    beta_X_slope_b3 = pm.Normal('beta_X_b3_ortho_offset', mu=0, sd=10)
    beta_X_slope_para = pm.Normal('beta_X_parallel_offset', mu=0, sd=10)
    beta_X_slope_b2_para = pm.Normal('beta_X_b2_parallel_offset', mu=0, sd=10)
    beta_X_slope_b3_para = pm.Normal('beta_X_b3_parallel_offset', mu=0, sd=10)
    beta_X = tt.stack([beta_X_intercept,
                       beta_X_slope_b2,
                       beta_X_slope_b3,
                       beta_X_slope_para,
                       beta_X_slope_b2_para,
                       beta_X_slope_b3_para
                      ])
    
    estimate_X = pm.math.dot(X, beta_X)
    
    ## Random effect
    # Non Centered version
    sigma_Z_intercept = pm.HalfNormal('sigma_Z_intercept', sd=10)
    gamma_Z_intercept_raw = pm.Normal('gamma_Z_intercept_raw', mu=0, sd=1, shape=Z_intercept.shape[1])
    gamma_Z_intercept = pm.Deterministic('gamma_Z_intercept', gamma_Z_intercept_raw * sigma_Z_intercept)

    sigma_Z_slope = pm.HalfNormal('sigma_Z_slope', sd=10)
    gamma_Z_slope_raw = pm.Normal('gamma_Z_slope_raw', mu=0, sd=1, shape=Z_slope.shape[1])
    gamma_Z_slope = pm.Deterministic('gamma_Z_slope', gamma_Z_slope_raw * sigma_Z_slope)
    
    estimate_Z = pm.math.dot(Z_intercept, gamma_Z_intercept) + pm.math.dot(Z_slope, gamma_Z_slope) 
    
    ## likelihood
    mu_estimate = pm.Deterministic('mu_estimate', estimate_X + estimate_Z)
    sigma_unexplained = pm.HalfNormal('sigma_unexplained', sd=10) # unexplained variability
    y_likelihood = pm.Normal('y_likelihood', mu=mu_estimate, sd=sigma_unexplained, observed=Y)

# %%
pm.model_to_graphviz(hlm_model)

# %%
with hlm_model:
    idata = pm.sample(2000, tune=2000, return_inferencedata=True)

# %%
az.summary(idata)
# %%
pm.traceplot(idata, varnames=['beta_X_ortho', 'beta_X_b2_ortho_offset', 'beta_X_b3_ortho_offset', 'beta_X_parallel_offset', 'beta_X_b2_parallel_offset', 'beta_X_b3_parallel_offset', 'gamma_Z_intercept', 'gamma_Z_slope', 'sigma_unexplained'])
# %%
pm.plot_posterior(idata, varnames=['beta_X_ortho', 'beta_X_b2_ortho_offset', 'beta_X_b3_ortho_offset', 'beta_X_parallel_offset', 'beta_X_b2_parallel_offset', 'beta_X_b3_parallel_offset', 'gamma_Z_intercept', 'gamma_Z_slope', 'sigma_unexplained'])