import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt  # NOTE(review): imported but unused here; kept in case other code relies on it

# Observation-noise standard deviation shared by the train and test simulations.
SIGMA_NOISE = 0.5


def simulate_returns_and_signal(m_symbols, n_samples, seed):
    """Draw per-symbol (alpha, beta) pairs from a correlated bivariate normal.

    Parameters
    ----------
    m_symbols : int
        Number of symbols (one (alpha, beta) pair is drawn per symbol).
    n_samples : int
        Unused; kept for backward compatibility with existing callers.
    seed : int
        Seed for the global numpy RNG.

    Returns
    -------
    (alpha_ret, beta_ret) : tuple of np.ndarray, each shape (m_symbols,)
        Per-symbol intercept (alpha) and signal sensitivity (beta).
    """
    r = 0             # mean alpha (unconditional return)
    r_sig = 1         # mean beta (return conditioned on the signal)
    sigma_alpha = 0.5  # alpha standard deviation
    sigma_beta = 0.3   # beta standard deviation
    rho = 0.4          # correlation between alpha and beta

    mu = np.array([r, r_sig])
    # Covariance = D @ Corr @ D with D = diag(std devs).
    sd = np.diag([sigma_alpha, sigma_beta])
    cov = sd @ np.array([[1, rho], [rho, 1]]) @ sd

    np.random.seed(seed)
    mvr = np.random.multivariate_normal(mu, cov, m_symbols)
    return mvr[:, 0], mvr[:, 1]


def generate_datasets(alpha_ret, beta_ret, m_symbols, n_samples_train,
                      n_samples_test, seed_train, seed_test):
    """Simulate train and test panels of (symbol, event, ret) observations.

    For each observation: ret ~ Normal(alpha[s] + beta[s] * event, SIGMA_NOISE),
    with event ~ Uniform(0, 1) and s the 1-based symbol id.

    Returns
    -------
    (train_df, test_df) : tuple of pd.DataFrame
        Columns: 'symbol' (1-based id), 'event', 'ret'.
    """
    def _simulate_panel(n_samples, seed):
        # One-line helper: build a (m_symbols * n_samples)-row panel for one seed.
        np.random.seed(seed)
        event = np.random.uniform(0, 1, int(n_samples * m_symbols))
        sym_id = np.repeat(np.arange(1, m_symbols + 1), n_samples)
        mu = alpha_ret[sym_id - 1] + beta_ret[sym_id - 1] * event
        ret = np.random.normal(mu, SIGMA_NOISE, m_symbols * n_samples)
        return pd.DataFrame({'symbol': sym_id, 'event': event, 'ret': ret})

    train_df = _simulate_panel(n_samples_train, seed_train)
    test_df = _simulate_panel(n_samples_test, seed_test)
    return train_df, test_df


def fit_ols_per_symbol(train_df, test_df):
    """Fit one OLS model (ret ~ const + event) per symbol; evaluate on test data.

    BUGFIX(review): the original built y/X with `train_data.T.reshape(-1, 1)` and
    `sm.add_constant(train_data.T)`, which stacked the event and ret columns into
    a single response vector and produced a (2, n+1) design matrix — not a
    regression of ret on event. It also subtracted a (n,) prediction from a
    (n, 1) target, broadcasting to an (n, n) matrix and corrupting every MSE.
    Both are fixed by regressing 1-D `ret` on a constant plus 1-D `event`.

    Returns
    -------
    (symbol_mse, total_mse) : (list of float, float)
        Per-symbol test MSE and the MSE pooled over all test rows.
    """
    symbol_mse = []
    total_y = []
    total_y_hat = []
    rsq = []   # kept as in the original for inspection/debugging; not returned
    pval = []
    for symbol in train_df['symbol'].drop_duplicates().to_list():
        train_data = train_df.loc[train_df['symbol'] == symbol]
        y = train_data['ret'].to_numpy()
        X = sm.add_constant(train_data['event'].to_numpy())
        model = sm.OLS(y, X).fit()
        rsq.append(model.rsquared)
        pval.append(model.pvalues)

        test_data = test_df.loc[test_df['symbol'] == symbol]
        y_test = test_data['ret'].to_numpy()
        X_test = sm.add_constant(test_data['event'].to_numpy())
        y_hat = model.predict(exog=X_test)

        symbol_mse.append(np.square(y_test - y_hat).mean())
        total_y.extend(y_test)
        total_y_hat.extend(y_hat)

    # MSE across the entire test set (all symbols pooled).
    total_mse = np.square(np.array(total_y) - np.array(total_y_hat)).mean()
    return symbol_mse, total_mse


def fit_completely_pooled_ols(train_df, test_df):
    """Fit a single OLS model (ret ~ const + event) on all symbols pooled.

    BUGFIX(review): same design-matrix bug as `fit_ols_per_symbol` — the
    original's `.T.reshape(-1, 1)` / `sm.add_constant(train_data.T)` did not
    regress ret on event, and (n, 1) − (n,) broadcasting inflated every MSE
    into a mean over an (n, n) matrix. Fixed with 1-D arrays throughout.

    Returns
    -------
    (pooled_mse, total_mse) : (list of float, float)
        Per-symbol test MSE of the pooled model, and the overall test MSE.
    """
    y = train_df['ret'].to_numpy()
    X = sm.add_constant(train_df['event'].to_numpy())
    model = sm.OLS(y, X).fit()

    pooled_mse = []
    for symbol in test_df['symbol'].drop_duplicates().to_list():
        test_data = test_df.loc[test_df['symbol'] == symbol]
        y_test = test_data['ret'].to_numpy()
        X_test = sm.add_constant(test_data['event'].to_numpy())
        y_hat = model.predict(exog=X_test)
        pooled_mse.append(np.square(y_test - y_hat).mean())

    # MSE across the entire test set.
    X_all = sm.add_constant(test_df['event'].to_numpy())
    y_hat_all = model.predict(X_all)
    total_mse = np.square(test_df['ret'].to_numpy() - y_hat_all).mean()
    return pooled_mse, total_mse


def fit_mixed_effects_model(train_df, test_df):
    """Fit a random-intercept mixed model (ret ~ event, grouped by symbol).

    Predictions on the test set combine the fixed-effects prediction with each
    symbol's estimated random intercept.

    BUGFIX(review): `md.random_effects[symbol]` is a pandas Series (the random
    intercept under its group label), so the original stored Series objects in
    the 're' column and `fe + re` did not yield scalar predictions. The scalar
    intercept is now extracted with `.iloc[0]`. The input `test_df` is also
    copied instead of mutated in place, so the caller's frame is untouched.

    Returns
    -------
    (mse, me_mse) : (list of float, float)
        Per-symbol test MSE and the overall test MSE.
    """
    model = smf.mixedlm('ret ~ event', train_df, groups=train_df['symbol'])
    md = model.fit()

    # Scalar random intercept per symbol (default re_formula is intercept-only).
    rand_intercepts = {symbol: effects.iloc[0]
                       for symbol, effects in md.random_effects.items()}

    test_df = test_df.copy()  # avoid side effects on the caller's DataFrame
    test_df['fe'] = md.predict(exog=test_df)
    test_df['re'] = test_df['symbol'].map(rand_intercepts)
    test_df['ret_hat'] = test_df['fe'] + test_df['re']

    mse = []
    for symbol in test_df['symbol'].drop_duplicates().to_list():
        sub = test_df.loc[test_df['symbol'] == symbol]
        mse.append(np.square(sub['ret'] - sub['ret_hat']).mean())

    # MSE across the entire test set.
    me_mse = np.square(test_df['ret'] - test_df['ret_hat']).mean()
    return mse, me_mse


def main():
    """Compare per-symbol OLS, pooled OLS, and mixed-effects models by test MSE."""
    m_symbols = 200
    n_samples_train = 50
    n_samples_test = 20
    seed_simulate = 60
    seed_train = 64
    seed_test = 32

    # Simulate per-symbol alpha/beta and build train/test panels.
    alpha_ret, beta_ret = simulate_returns_and_signal(
        m_symbols, n_samples_train, seed_simulate)
    train_df, test_df = generate_datasets(
        alpha_ret, beta_ret, m_symbols, n_samples_train, n_samples_test,
        seed_train, seed_test)

    # Per-symbol (no-pooling) OLS.
    symbol_mse, total_mse = fit_ols_per_symbol(train_df, test_df)
    print(f"Average MSE per symbol: {np.mean(symbol_mse)}")
    print(f"Total MSE across entire dataset (per-symbol models): {total_mse}")

    # Completely pooled OLS.
    pooled_mse, total_mse_pooled = fit_completely_pooled_ols(train_df, test_df)
    print(f"Average MSE per symbol (completely pooled): {np.mean(pooled_mse)}")
    print(f"Total MSE across entire dataset (completely pooled): {total_mse_pooled}")

    # Mixed-effects (partial pooling).
    mse_mixed, total_mse_mixed = fit_mixed_effects_model(train_df, test_df)
    print(f"Average MSE per symbol (mixed effects): {np.mean(mse_mixed)}")
    print(f"Total MSE across entire dataset (mixed effects): {total_mse_mixed}")


if __name__ == "__main__":
    main()