import numpy as np
import sklearn.metrics
import xgboost as xgb


def cross_validate_xgboost(train_data, train_output,
                           n_folds, param_grid,
                           type_dict,
                           fixed_param_dict={'objective': 'binary:logistic', 'eval_metric': ['auc']},
                           metric_func_dict={'auc': sklearn.metrics.roc_auc_score},
                           other_metrics_dict=None, keep_data=True, **kwargs):
"""
Perform k-fold cross-validation with xgboost hyperparameters
Get the average performance across folds and save all of the results
for easier calibration (Platt Scaling/Isotonic Regression)
Parameters
---------------
train_data (pd.DataFrame or np.array):
A matrix that contains 1 row per observation and 1 column per feature
train_output (pd.DataFrame or pd.Series or np.array):
An array-like that contains the outcome of interest as a binary
indicator
param_grid (OrderedDict):
An Ordered Dict where the keys are the parameter of interest,
and the value is a list containing all possible parameter settings that
need to be tested. The reason this parameter is an ordered dict is so
that the inner loop can keep track of which parameter is being set. In
python 3, this should not be an issue, since dictionaries have implicit
orderings when calling .keys(), but to be safe, an Ordered Dict is
required.
type_dict (Dict):
A dictionary whose keys are the same as param_grid, and the values are
either int or float (the base python functions). These are used to
coerce the parameters downstream
metric_func_dict (Dict):
key: the name of the metric as a string
value: A function that takes in arguments (y_true, y_pred) and computes some
metric to be used to select the best cross-validated parameters.
Default is sklearn.metrics.roc_auc_score
other_metrics_dict (Dict):
A dictionary with the same structure as `metric_func_dict`. These metrics will
not be used to determine the best parameters for the model.
**kwargs:
Each key is an argument to the xgboost model that has
only 1 value. These values will be passed every time the xgboost model is
run. `objective` and `eval_metric` are two parameters that need to be set.
Returns
--------
A tuple consisting of:
results_dict
best_settings
final_model_uncalibrated
keep_dict
"""
    # Shuffle the row indices and split them into n_folds validation folds
    indices = np.arange(0, train_data.shape[0])
    indices = np.random.permutation(indices)
    indices_list = np.array_split(indices, n_folds)
    # Build up a dict to save results: one list entry per (fold, setting) pair
    results_dict = {}
    for item in param_grid:
        results_dict[item] = []
    results_dict['best_iteration'] = []
    for key in metric_func_dict:
        results_dict[key] = []
    if other_metrics_dict:
        for key in other_metrics_dict:
            results_dict[key] = []
    # Initialize the container for out-of-fold predictions (None when keep_data is False)
    keep_dict = {'true': [], 'pred': []} if keep_data else None
    # Build up the expanded grid of parameter values (one row per parameter combination)
    expanded_grid = np.array(np.meshgrid(*param_grid.values())).T.reshape(-1, len(param_grid))
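    # For example (hypothetical values), param_grid = OrderedDict([('max_depth', [3, 5]),
    # ('eta', [0.1, 0.3])]) expands to the four rows [3, 0.1], [3, 0.3], [5, 0.1], [5, 0.3],
    # i.e. one row per combination of hyperparameter values, stored as floats.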
    # Implement cross-validation
    for ind, fold in enumerate(indices_list):
        validation_fold = train_data.iloc[fold, :]
        training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind])
        training_fold = train_data.iloc[training_indices, :]
        validation_fold_output = train_output.iloc[fold]
        train_fold_output = train_output.iloc[training_indices]
        ## Train the model with each parameter setting:
        ## for each fold, fit a model for every setting and store the results
        ## in results_dict under the same keys
        for setting in range(0, expanded_grid.shape[0]):
            # Create the current setting parameter dict, coercing each value to its type
            current_parameter_dict = {}
            for index, (key, value) in enumerate(param_grid.items()):
                current_parameter_dict[key] = type_dict[key](expanded_grid[setting][index])
                results_dict[key].append(current_parameter_dict[key])
            current_parameter_dict.update(fixed_param_dict)
            X_train = xgb.DMatrix(training_fold, label=train_fold_output)
            X_test = xgb.DMatrix(validation_fold, label=validation_fold_output)
            temp_model = xgb.train(current_parameter_dict, X_train,
                                   evals=[(X_train, 'train'), (X_test, 'test')], **kwargs)
            # Now that the model is fit, evaluate on the validation fold
            temp_pred = temp_model.predict(X_test)
            # Compute the metric of interest (default is AUC) and append the result
            for key in metric_func_dict:
                fold_result = metric_func_dict[key](validation_fold_output, temp_pred)
                results_dict[key].append(fold_result)
            if other_metrics_dict:
                for key, func in other_metrics_dict.items():
                    results_dict[key].append(func(validation_fold_output, temp_pred))
            # Append best_iteration (requires early_stopping_rounds in **kwargs)
            results_dict['best_iteration'].append(temp_model.best_iteration)
    def _find_best_settings(_result_dict, _param_dict):
        """
        Find the best hyperparameter settings given results_dict: take the
        highest value of the metric in metric_func_dict and return the elements
        of param_grid that correspond to that value. Note that each list in
        results_dict holds one entry per (fold, setting) pair, so the maximum
        picks the single best entry; if more than one metric is supplied, the
        last key iterated determines max_index.
        """
        for key in metric_func_dict:
            max_index = _result_dict[key].index(max(_result_dict[key]))
        final_setting_dict = {}
        for key in _param_dict:
            final_setting_dict[key] = _result_dict[key][max_index]
        # Add the best iteration (recorded when early_stopping_rounds was provided)
        final_setting_dict['best_iteration'] = _result_dict['best_iteration'][max_index]
        return final_setting_dict
    best_settings = _find_best_settings(results_dict, param_grid)
    best_settings.update(fixed_param_dict)
    # Note: xgboost's best_iteration is zero-based, so depending on the version you
    # may want best_iteration + 1 boosting rounds when retraining
    number_boost_rounds = best_settings.pop('best_iteration')
    if keep_data:
        # Re-run the folds with the best settings to collect out-of-fold predictions
        for ind, fold in enumerate(indices_list):
            validation_fold = train_data.iloc[fold, :]
            training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind])
            training_fold = train_data.iloc[training_indices, :]
            validation_fold_output = train_output.iloc[fold]
            train_fold_output = train_output.iloc[training_indices]
            X_train = xgb.DMatrix(training_fold, label=train_fold_output)
            X_test = xgb.DMatrix(validation_fold, label=validation_fold_output)
            # Early stopping is disabled here; train for the best number of rounds found above
            temp_model = xgb.train(best_settings, X_train, num_boost_round=number_boost_rounds,
                                   early_stopping_rounds=None)
            # Now that the model is fit, store the out-of-fold truths and predictions
            temp_pred = temp_model.predict(X_test)
            keep_dict['true'] += list(validation_fold_output.values)
            keep_dict['pred'] += list(temp_pred)
    # Retrain the model on all of the training data with the best settings
    train_df = xgb.DMatrix(train_data, label=train_output)
    final_model_uncalibrated = xgb.train(best_settings, train_df, num_boost_round=number_boost_rounds,
                                         early_stopping_rounds=None)
    return results_dict, best_settings, final_model_uncalibrated, keep_dict
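

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch with synthetic data. The column names, grid
# values, and keyword arguments below are hypothetical and only illustrate the
# expected shape of each input.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from collections import OrderedDict

    import pandas as pd

    rng = np.random.RandomState(42)
    X = pd.DataFrame(rng.normal(size=(500, 5)), columns=['x%d' % i for i in range(5)])
    y = pd.Series((X['x0'] + rng.normal(scale=0.5, size=500) > 0).astype(int))

    # One list of candidate values per hyperparameter, in a fixed order
    param_grid = OrderedDict([('max_depth', [3, 5]),
                              ('eta', [0.1, 0.3])])
    # Coercion functions so xgboost receives the right types after the float meshgrid
    type_dict = {'max_depth': int, 'eta': float}

    results_dict, best_settings, final_model, keep_dict = cross_validate_xgboost(
        X, y, n_folds=5, param_grid=param_grid, type_dict=type_dict,
        # passed through **kwargs to xgb.train; early_stopping_rounds makes
        # best_iteration available for each fit
        num_boost_round=200, early_stopping_rounds=10, verbose_eval=False)

    print(best_settings)
    print(sklearn.metrics.roc_auc_score(keep_dict['true'], keep_dict['pred']))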