Created
October 21, 2024 16:42
-
-
Save rishabh135/306699e18a9dbc402fbe7868dd1b428f to your computer and use it in GitHub Desktop.
Revisions
-
rishabh135 created this gist
Oct 21, 2024 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,160 @@ import numpy as np import pandas as pd import statsmodels.api as sm import statsmodels.formula.api as smf import matplotlib.pyplot as plt # Function to simulate returns and signal def simulate_returns_and_signal(m_symbols, n_samples, seed): # Parameters r = 0 # Average return r_sig = 1 # Average return conditioned on signal sigma_alpha = 0.5 # Alpha standard deviation sigma_beta = 0.3 # Beta standard deviation rho = 0.4 # Correlation between alpha and beta # Means Mu = np.array([r, r_sig]) # Covariance matrix Cov = np.diag([sigma_alpha, sigma_beta]) @ np.array([[1, rho], [rho, 1]]) @ np.diag([sigma_alpha, sigma_beta]) # Generate population equity data np.random.seed(seed) mvr = np.random.multivariate_normal(Mu, Cov, m_symbols) # Extract alpha and beta returns alpha_ret = mvr[:, 0] beta_ret = mvr[:, 1] return alpha_ret, beta_ret # Function to generate train and test datasets def generate_datasets(alpha_ret, beta_ret, m_symbols, n_samples_train, n_samples_test, seed_train, seed_test): np.random.seed(seed_train) event_train = np.random.uniform(0, 1, int(n_samples_train * m_symbols)) sym_id_train = np.repeat(list(range(1, m_symbols + 1)), n_samples_train) mu_train = alpha_ret[sym_id_train - 1] + beta_ret[sym_id_train - 1] * event_train sigma = 0.5 ret_train = np.random.normal(mu_train, sigma, m_symbols * n_samples_train) train_df = pd.DataFrame({'symbol': sym_id_train, 'event': event_train, 'ret': ret_train}) np.random.seed(seed_test) event_test = np.random.uniform(0, 1, int(n_samples_test * m_symbols)) sym_id_test = np.repeat(list(range(1, m_symbols + 1)), n_samples_test) mu_test = alpha_ret[sym_id_test - 1] + beta_ret[sym_id_test - 1] * event_test ret_test = np.random.normal(mu_test, sigma, m_symbols * n_samples_test) test_df = pd.DataFrame({'symbol': sym_id_test, 'event': event_test, 'ret': ret_test}) return train_df, test_df # Function to fit OLS model per symbol and evaluate MSE def fit_ols_per_symbol(train_df, test_df): symbol_mse = [] total_y = [] total_y_hat = [] rsq = [] pval = [] for symbol in train_df['symbol'].drop_duplicates().to_list(): train_data = train_df.loc[train_df['symbol'] == symbol][['event', 'ret']].to_numpy() y = train_data.T.reshape(-1, 1) X = sm.add_constant(train_data.T) model = sm.OLS(y, X).fit() rsq.append(model.rsquared) pval.append(model.pvalues) test_data = test_df.loc[test_df['symbol'] == symbol][['event', 'ret']].to_numpy() y = test_data.T.reshape(-1, 1) X = sm.add_constant(test_data.T) y_hat = model.predict(exog=X) mse = np.square(y - y_hat).mean() total_y += [k for k in y] total_y_hat += [k for k in y_hat] symbol_mse.append(mse) # Measure MSE across entire dataset total_mse = np.square(np.array(total_y) - np.array(total_y_hat)).mean() return symbol_mse, total_mse # Function to fit completely pooled OLS model and evaluate MSE def fit_completely_pooled_ols(train_df, test_df): train_data = train_df[['event', 'ret']].to_numpy() y = train_data.T.reshape(-1, 1) X = sm.add_constant(train_data.T) model = sm.OLS(y, X).fit() pooled_mse = [] for symbol in test_df['symbol'].drop_duplicates().to_list(): test_data = test_df.loc[test_df['symbol'] == symbol][['event', 'ret']].to_numpy() y = test_data.T.reshape(-1, 1) X = sm.add_constant(test_data.T) y_hat = model.predict(exog=X) mse = np.square(y - y_hat).mean() pooled_mse.append(mse) # Measure MSE across entire dataset X = sm.add_constant(test_df['event'].to_numpy()) y_hat = model.predict(X) y = test_df['ret'].values.reshape(-1, 1) total_mse = np.square(y - y_hat).mean() return pooled_mse, total_mse # Function to fit mixed effects model and evaluate MSE def fit_mixed_effects_model(train_df, test_df): model = smf.mixedlm('ret ~ event', train_df, groups=train_df['symbol']) md = model.fit() re = {symbol: md.random_effects[symbol] for symbol in md.random_effects} test_df['fe'] = md.predict(exog=test_df) test_df['re'] = test_df['symbol'].map(re) test_df['ret_hat'] = test_df['fe'] + test_df['re'] mse = [] for symbol in test_df['symbol'].drop_duplicates().to_list(): test_data = test_df.loc[test_df['symbol'] == symbol] sym_mse = np.square(test_data['ret'] - test_data['ret_hat']).mean() mse.append(sym_mse) # Measure MSE across entire dataset me_mse = np.square(test_df['ret'] - test_df['ret_hat']).mean() return mse, me_mse # Main function def main(): m_symbols = 200 n_samples_train = 50 n_samples_test = 20 seed_simulate = 60 seed_train = 64 seed_test = 32 # Simulate returns and signal alpha_ret, beta_ret = simulate_returns_and_signal(m_symbols, n_samples_train, seed_simulate) # Generate train and test datasets train_df, test_df = generate_datasets(alpha_ret, beta_ret, m_symbols, n_samples_train, n_samples_test, seed_train, seed_test) # Fit OLS model per symbol and evaluate MSE symbol_mse, total_mse = fit_ols_per_symbol(train_df, test_df) print(f"Average MSE per symbol: {np.mean(symbol_mse)}") print(f"Total MSE across entire dataset (per-symbol models): {total_mse}") # Fit completely pooled OLS model and evaluate MSE pooled_mse, total_mse_pooled = fit_completely_pooled_ols(train_df, test_df) print(f"Average MSE per symbol (completely pooled): {np.mean(pooled_mse)}") print(f"Total MSE across entire dataset (completely pooled): {total_mse_pooled}") # Fit mixed effects model and evaluate MSE mse_mixed, total_mse_mixed = fit_mixed_effects_model(train_df, test_df) print(f"Average MSE per symbol (mixed effects): {np.mean(mse_mixed)}") print(f"Total MSE across entire dataset (mixed effects): {total_mse_mixed}") if __name__ == "__main__": main()