def cross_validate_xgboost(train_data, train_output, n_folds, param_grid, type_dict, fixed_param_dict = {'objective': 'binary:logistic', 'eval_metric': ['auc']}, metric_func_dict = {'auc': sklearn.metrics.roc_auc_score}, other_metrics_dict = None, keep_data = True, **kwargs): """ Perform k-fold cross-validation with xgboost hyperparameters Get the average performance across folds and save all of the results for easier calibration (Platt Scaling/Isotonic Regression) Parameters --------------- train_data (pd.DataFrame or np.array): A matrix that contains 1 row per observation and 1 column per feature train_output (pd.DataFrame or pd.Series or np.array): An array-like that contains the outcome of interest as a binary indicator param_grid (OrderedDict): An Ordered Dict where the keys are the parameter of interest, and the value is a list containing all possible parameter settings that need to be tested. The reason this parameter is an ordered dict is so that the inner loop can keep track of which parameter is being set. In python 3, this should not be an issue, since dictionaries have implicit orderings when calling .keys(), but to be safe, an Ordered Dict is required. type_dict (Dict): A dictionary whose keys are the same as param_grid, and the values are either int or float (the base python functions). These are used to coerce the parameters downstream metric_func_dict (Dict): key: the name of the metric as a string value: A function that takes in arguments (y_true, y_pred) and computes some metric to be used to select the best cross-validated parameters. Default is sklearn.metrics.roc_auc_score other_metrics_dict (Dict): A dictionary with the same structure as `metric_func_dict`. These metrics will not be used to determine the best parameters for the model. **kwargs: Each key is an argument to the xgboost model that has only 1 value. These values will be passed every time the xgboost model is run. `objective` and `eval_metric` are two parameters that need to be set. Returns -------- A tuple consisting of: results_dict best_settings final_model_uncalibrated keep_dict """ # Set up indices to keep track of training and validation folds indices = np.arange(0, train_data.shape[0]) indices = np.random.permutation(indices) indices_list = np.array_split(indices, n_folds) # Build up an OrderedDict to save results results_dict = {} for item in param_grid: results_dict[item] = [] results_dict['best_iteration'] = [] for key in metric_func_dict: results_dict[key] = [] if other_metrics_dict: for key in other_metrics_dict: results_dict[key] = [] # Initialize the data to keep if keep_data: keep_dict = {'true': [], 'pred': []} # Build up the expanded grid of parameter values expanded_grid = np.array(np.meshgrid(*param_grid.values())).T.reshape(-1, len(param_grid)) # Implement Cross-validation for ind, fold in enumerate(indices_list): validation_fold = train_data.iloc[fold, :] training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind]) training_fold = train_data.iloc[training_indices, :] validation_fold_output = train_output.iloc[fold] train_fold_output = train_output.iloc[training_indices] ## Train the model with parameters ## For each fold, fit all of the models with all parameter settings ## Store the results in another dictionary with the same keys ## as the result for setting in range(0, expanded_grid.shape[0]): # Create the current setting parameter dict current_parameter_dict = {} for index, (key, value) in enumerate(param_grid.items()): current_parameter_dict[key] = type_dict[key](expanded_grid[setting][index]) results_dict[key].append(current_parameter_dict[key]) current_parameter_dict.update(fixed_param_dict) X_train = xgb.DMatrix(training_fold, label = train_fold_output) X_test = xgb.DMatrix(validation_fold, label = validation_fold_output) temp_model = xgb.train(current_parameter_dict, X_train, evals = [(X_train, 'train'), (X_test, 'test')], **kwargs) # Now that the model is fit, evaluate the metric temp_pred = temp_model.predict(X_test) # Compute the metric of interest: Default is AUC # Append result for key in metric_func_dict: fold_result = metric_func_dict[key](validation_fold_output, temp_pred) results_dict[key].append(fold_result) if other_metrics_dict: for key, func in other_metrics_dict.items(): results_dict[key].append(func(validation_fold_output, temp_pred)) # Append best_iteration results_dict['best_iteration'].append(temp_model.best_iteration) def _find_best_settings(_result_dict, _param_dict): """ Now, we want to find the best settings of the hyperparameters given by results_dict We want the highest value of the metric in metric_func_dict and to return the elements of param_grid that correspond to that value. """ for key in metric_func_dict: max_index = _result_dict[key].index(max(_result_dict[key])) final_setting_dict = {} for key in _param_dict: final_setting_dict[key] = _result_dict[key][max_index] # Add the best iteration (with early_stopping_rounds provided) final_setting_dict['best_iteration'] = _result_dict['best_iteration'][max_index] return final_setting_dict best_settings = _find_best_settings(results_dict, param_grid) best_settings.update(fixed_param_dict) number_boost_rounds = best_settings.pop('best_iteration') if keep_data: for ind, fold in enumerate(indices_list): validation_fold = train_data.iloc[fold, :] training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind]) training_fold = train_data.iloc[training_indices, :] validation_fold_output = train_output.iloc[fold] train_fold_output = train_output.iloc[training_indices] X_train = xgb.DMatrix(training_fold, label = train_fold_output) X_test = xgb.DMatrix(validation_fold, label = validation_fold_output) temp_model = xgb.train(best_settings, X_train, num_boost_round = number_boost_rounds, early_stopping_rounds = None) # Now that the model is fit, evaluate the metric temp_pred = temp_model.predict(X_test) keep_dict['true'] += list(validation_fold_output.values) keep_dict['pred'] += list(temp_pred) # Retrain model with best settings ====================== train_df = xgb.DMatrix(train_data, label = train_output) final_model_uncalibrated = xgb.train(best_settings, train_df, num_boost_round = number_boost_rounds, early_stopping_rounds = None) return results_dict, best_settings, final_model_uncalibrated, keep_dict