@rishabh135
Created October 21, 2024 16:42
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
# Function to simulate returns and signal
def simulate_returns_and_signal(m_symbols, n_samples, seed):
    # Parameters (n_samples is accepted for interface symmetry but not used here)
    r = 0              # Average return
    r_sig = 1          # Average return conditioned on signal
    sigma_alpha = 0.5  # Alpha standard deviation
    sigma_beta = 0.3   # Beta standard deviation
    rho = 0.4          # Correlation between alpha and beta

    # Means
    Mu = np.array([r, r_sig])

    # Covariance matrix: D @ R @ D with D = diag(sigmas), R the correlation matrix
    Cov = np.diag([sigma_alpha, sigma_beta]) @ np.array([[1, rho], [rho, 1]]) @ np.diag([sigma_alpha, sigma_beta])

    # Generate population equity data
    np.random.seed(seed)
    mvr = np.random.multivariate_normal(Mu, Cov, m_symbols)

    # Extract alpha and beta returns
    alpha_ret = mvr[:, 0]
    beta_ret = mvr[:, 1]
    return alpha_ret, beta_ret
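
# Optional sanity check (a minimal sketch, not called anywhere in this script):
# for large m_symbols, the empirical correlation between the simulated alpha
# and beta draws should be close to the configured rho = 0.4.
def check_simulated_correlation(alpha_ret, beta_ret):
    corr = np.corrcoef(alpha_ret, beta_ret)[0, 1]
    print(f"Empirical corr(alpha, beta): {corr:.3f} (target rho = 0.4)")
    return corr
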
# Function to generate train and test datasets
def generate_datasets(alpha_ret, beta_ret, m_symbols, n_samples_train, n_samples_test, seed_train, seed_test):
    sigma = 0.5  # Idiosyncratic return volatility

    # Training set: one uniform event draw per (symbol, sample)
    np.random.seed(seed_train)
    event_train = np.random.uniform(0, 1, n_samples_train * m_symbols)
    sym_id_train = np.repeat(np.arange(1, m_symbols + 1), n_samples_train)
    mu_train = alpha_ret[sym_id_train - 1] + beta_ret[sym_id_train - 1] * event_train
    ret_train = np.random.normal(mu_train, sigma, m_symbols * n_samples_train)
    train_df = pd.DataFrame({'symbol': sym_id_train, 'event': event_train, 'ret': ret_train})

    # Test set: same generative process, different seed and sample count
    np.random.seed(seed_test)
    event_test = np.random.uniform(0, 1, n_samples_test * m_symbols)
    sym_id_test = np.repeat(np.arange(1, m_symbols + 1), n_samples_test)
    mu_test = alpha_ret[sym_id_test - 1] + beta_ret[sym_id_test - 1] * event_test
    ret_test = np.random.normal(mu_test, sigma, m_symbols * n_samples_test)
    test_df = pd.DataFrame({'symbol': sym_id_test, 'event': event_test, 'ret': ret_test})

    return train_df, test_df
# Function to fit OLS model per symbol and evaluate MSE
def fit_ols_per_symbol(train_df, test_df):
    symbol_mse = []
    total_y = []
    total_y_hat = []
    rsq = []
    pval = []
    for symbol in train_df['symbol'].drop_duplicates().to_list():
        # Fit ret ~ event on this symbol's training data
        train_data = train_df.loc[train_df['symbol'] == symbol]
        y = train_data['ret'].to_numpy()
        X = sm.add_constant(train_data['event'].to_numpy())
        model = sm.OLS(y, X).fit()
        rsq.append(model.rsquared)
        pval.append(model.pvalues)

        # Evaluate on this symbol's test data
        test_data = test_df.loc[test_df['symbol'] == symbol]
        y = test_data['ret'].to_numpy()
        X = sm.add_constant(test_data['event'].to_numpy())
        y_hat = model.predict(exog=X)
        mse = np.square(y - y_hat).mean()
        total_y += list(y)
        total_y_hat += list(y_hat)
        symbol_mse.append(mse)

    # Measure MSE across the entire test set
    total_mse = np.square(np.array(total_y) - np.array(total_y_hat)).mean()
    return symbol_mse, total_mse
# Function to fit completely pooled OLS model and evaluate MSE
def fit_completely_pooled_ols(train_df, test_df):
    # Fit a single ret ~ event regression on the pooled training data
    y = train_df['ret'].to_numpy()
    X = sm.add_constant(train_df['event'].to_numpy())
    model = sm.OLS(y, X).fit()

    pooled_mse = []
    for symbol in test_df['symbol'].drop_duplicates().to_list():
        test_data = test_df.loc[test_df['symbol'] == symbol]
        y = test_data['ret'].to_numpy()
        X = sm.add_constant(test_data['event'].to_numpy())
        y_hat = model.predict(exog=X)
        mse = np.square(y - y_hat).mean()
        pooled_mse.append(mse)

    # Measure MSE across the entire test set
    X = sm.add_constant(test_df['event'].to_numpy())
    y_hat = model.predict(exog=X)
    y = test_df['ret'].to_numpy()
    total_mse = np.square(y - y_hat).mean()
    return pooled_mse, total_mse
# Function to fit mixed effects model and evaluate MSE
def fit_mixed_effects_model(train_df, test_df):
    # Random-intercept model: ret ~ event with a random intercept per symbol
    model = smf.mixedlm('ret ~ event', train_df, groups=train_df['symbol'])
    md = model.fit()

    # Extract the scalar random intercept for each symbol
    re = {symbol: float(md.random_effects[symbol].iloc[0]) for symbol in md.random_effects}

    test_df = test_df.copy()  # avoid mutating the caller's frame
    test_df['fe'] = md.predict(exog=test_df)   # fixed-effects prediction
    test_df['re'] = test_df['symbol'].map(re)  # symbol-specific random intercept
    test_df['ret_hat'] = test_df['fe'] + test_df['re']

    mse = []
    for symbol in test_df['symbol'].drop_duplicates().to_list():
        test_data = test_df.loc[test_df['symbol'] == symbol]
        sym_mse = np.square(test_data['ret'] - test_data['ret_hat']).mean()
        mse.append(sym_mse)

    # Measure MSE across the entire test set
    me_mse = np.square(test_df['ret'] - test_df['ret_hat']).mean()
    return mse, me_mse
# Main function
def main():
    m_symbols = 200
    n_samples_train = 50
    n_samples_test = 20
    seed_simulate = 60
    seed_train = 64
    seed_test = 32

    # Simulate returns and signal
    alpha_ret, beta_ret = simulate_returns_and_signal(m_symbols, n_samples_train, seed_simulate)

    # Generate train and test datasets
    train_df, test_df = generate_datasets(alpha_ret, beta_ret, m_symbols, n_samples_train, n_samples_test, seed_train, seed_test)

    # Fit OLS model per symbol and evaluate MSE
    symbol_mse, total_mse = fit_ols_per_symbol(train_df, test_df)
    print(f"Average MSE per symbol: {np.mean(symbol_mse)}")
    print(f"Total MSE across entire dataset (per-symbol models): {total_mse}")

    # Fit completely pooled OLS model and evaluate MSE
    pooled_mse, total_mse_pooled = fit_completely_pooled_ols(train_df, test_df)
    print(f"Average MSE per symbol (completely pooled): {np.mean(pooled_mse)}")
    print(f"Total MSE across entire dataset (completely pooled): {total_mse_pooled}")

    # Fit mixed effects model and evaluate MSE
    mse_mixed, total_mse_mixed = fit_mixed_effects_model(train_df, test_df)
    print(f"Average MSE per symbol (mixed effects): {np.mean(mse_mixed)}")
    print(f"Total MSE across entire dataset (mixed effects): {total_mse_mixed}")
if __name__ == "__main__":
    main()