Created
October 21, 2024 16:42
-
-
Save rishabh135/306699e18a9dbc402fbe7868dd1b428f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import pandas as pd | |
| import statsmodels.api as sm | |
| import statsmodels.formula.api as smf | |
| import matplotlib.pyplot as plt | |
| # Function to simulate returns and signal | |
| def simulate_returns_and_signal(m_symbols, n_samples, seed): | |
| # Parameters | |
| r = 0 # Average return | |
| r_sig = 1 # Average return conditioned on signal | |
| sigma_alpha = 0.5 # Alpha standard deviation | |
| sigma_beta = 0.3 # Beta standard deviation | |
| rho = 0.4 # Correlation between alpha and beta | |
| # Means | |
| Mu = np.array([r, r_sig]) | |
| # Covariance matrix | |
| Cov = np.diag([sigma_alpha, sigma_beta]) @ np.array([[1, rho], [rho, 1]]) @ np.diag([sigma_alpha, sigma_beta]) | |
| # Generate population equity data | |
| np.random.seed(seed) | |
| mvr = np.random.multivariate_normal(Mu, Cov, m_symbols) | |
| # Extract alpha and beta returns | |
| alpha_ret = mvr[:, 0] | |
| beta_ret = mvr[:, 1] | |
| return alpha_ret, beta_ret | |
| # Function to generate train and test datasets | |
| def generate_datasets(alpha_ret, beta_ret, m_symbols, n_samples_train, n_samples_test, seed_train, seed_test): | |
| np.random.seed(seed_train) | |
| event_train = np.random.uniform(0, 1, int(n_samples_train * m_symbols)) | |
| sym_id_train = np.repeat(list(range(1, m_symbols + 1)), n_samples_train) | |
| mu_train = alpha_ret[sym_id_train - 1] + beta_ret[sym_id_train - 1] * event_train | |
| sigma = 0.5 | |
| ret_train = np.random.normal(mu_train, sigma, m_symbols * n_samples_train) | |
| train_df = pd.DataFrame({'symbol': sym_id_train, 'event': event_train, 'ret': ret_train}) | |
| np.random.seed(seed_test) | |
| event_test = np.random.uniform(0, 1, int(n_samples_test * m_symbols)) | |
| sym_id_test = np.repeat(list(range(1, m_symbols + 1)), n_samples_test) | |
| mu_test = alpha_ret[sym_id_test - 1] + beta_ret[sym_id_test - 1] * event_test | |
| ret_test = np.random.normal(mu_test, sigma, m_symbols * n_samples_test) | |
| test_df = pd.DataFrame({'symbol': sym_id_test, 'event': event_test, 'ret': ret_test}) | |
| return train_df, test_df | |
| # Function to fit OLS model per symbol and evaluate MSE | |
| def fit_ols_per_symbol(train_df, test_df): | |
| symbol_mse = [] | |
| total_y = [] | |
| total_y_hat = [] | |
| rsq = [] | |
| pval = [] | |
| for symbol in train_df['symbol'].drop_duplicates().to_list(): | |
| train_data = train_df.loc[train_df['symbol'] == symbol][['event', 'ret']].to_numpy() | |
| y = train_data.T.reshape(-1, 1) | |
| X = sm.add_constant(train_data.T) | |
| model = sm.OLS(y, X).fit() | |
| rsq.append(model.rsquared) | |
| pval.append(model.pvalues) | |
| test_data = test_df.loc[test_df['symbol'] == symbol][['event', 'ret']].to_numpy() | |
| y = test_data.T.reshape(-1, 1) | |
| X = sm.add_constant(test_data.T) | |
| y_hat = model.predict(exog=X) | |
| mse = np.square(y - y_hat).mean() | |
| total_y += [k for k in y] | |
| total_y_hat += [k for k in y_hat] | |
| symbol_mse.append(mse) | |
| # Measure MSE across entire dataset | |
| total_mse = np.square(np.array(total_y) - np.array(total_y_hat)).mean() | |
| return symbol_mse, total_mse | |
| # Function to fit completely pooled OLS model and evaluate MSE | |
| def fit_completely_pooled_ols(train_df, test_df): | |
| train_data = train_df[['event', 'ret']].to_numpy() | |
| y = train_data.T.reshape(-1, 1) | |
| X = sm.add_constant(train_data.T) | |
| model = sm.OLS(y, X).fit() | |
| pooled_mse = [] | |
| for symbol in test_df['symbol'].drop_duplicates().to_list(): | |
| test_data = test_df.loc[test_df['symbol'] == symbol][['event', 'ret']].to_numpy() | |
| y = test_data.T.reshape(-1, 1) | |
| X = sm.add_constant(test_data.T) | |
| y_hat = model.predict(exog=X) | |
| mse = np.square(y - y_hat).mean() | |
| pooled_mse.append(mse) | |
| # Measure MSE across entire dataset | |
| X = sm.add_constant(test_df['event'].to_numpy()) | |
| y_hat = model.predict(X) | |
| y = test_df['ret'].values.reshape(-1, 1) | |
| total_mse = np.square(y - y_hat).mean() | |
| return pooled_mse, total_mse | |
| # Function to fit mixed effects model and evaluate MSE | |
| def fit_mixed_effects_model(train_df, test_df): | |
| model = smf.mixedlm('ret ~ event', train_df, groups=train_df['symbol']) | |
| md = model.fit() | |
| re = {symbol: md.random_effects[symbol] for symbol in md.random_effects} | |
| test_df['fe'] = md.predict(exog=test_df) | |
| test_df['re'] = test_df['symbol'].map(re) | |
| test_df['ret_hat'] = test_df['fe'] + test_df['re'] | |
| mse = [] | |
| for symbol in test_df['symbol'].drop_duplicates().to_list(): | |
| test_data = test_df.loc[test_df['symbol'] == symbol] | |
| sym_mse = np.square(test_data['ret'] - test_data['ret_hat']).mean() | |
| mse.append(sym_mse) | |
| # Measure MSE across entire dataset | |
| me_mse = np.square(test_df['ret'] - test_df['ret_hat']).mean() | |
| return mse, me_mse | |
| # Main function | |
| def main(): | |
| m_symbols = 200 | |
| n_samples_train = 50 | |
| n_samples_test = 20 | |
| seed_simulate = 60 | |
| seed_train = 64 | |
| seed_test = 32 | |
| # Simulate returns and signal | |
| alpha_ret, beta_ret = simulate_returns_and_signal(m_symbols, n_samples_train, seed_simulate) | |
| # Generate train and test datasets | |
| train_df, test_df = generate_datasets(alpha_ret, beta_ret, m_symbols, n_samples_train, n_samples_test, seed_train, seed_test) | |
| # Fit OLS model per symbol and evaluate MSE | |
| symbol_mse, total_mse = fit_ols_per_symbol(train_df, test_df) | |
| print(f"Average MSE per symbol: {np.mean(symbol_mse)}") | |
| print(f"Total MSE across entire dataset (per-symbol models): {total_mse}") | |
| # Fit completely pooled OLS model and evaluate MSE | |
| pooled_mse, total_mse_pooled = fit_completely_pooled_ols(train_df, test_df) | |
| print(f"Average MSE per symbol (completely pooled): {np.mean(pooled_mse)}") | |
| print(f"Total MSE across entire dataset (completely pooled): {total_mse_pooled}") | |
| # Fit mixed effects model and evaluate MSE | |
| mse_mixed, total_mse_mixed = fit_mixed_effects_model(train_df, test_df) | |
| print(f"Average MSE per symbol (mixed effects): {np.mean(mse_mixed)}") | |
| print(f"Total MSE across entire dataset (mixed effects): {total_mse_mixed}") | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment