import numpy as np
import sklearn.metrics
import xgboost as xgb


def cross_validate_xgboost(train_data, train_output,
                           n_folds, param_grid,
                           type_dict,
                           fixed_param_dict={'objective': 'binary:logistic', 'eval_metric': ['auc']},
                           metric_func_dict={'auc': sklearn.metrics.roc_auc_score},
                           other_metrics_dict=None, keep_data=True, **kwargs):
    """
    Perform k-fold cross-validation over a grid of xgboost hyperparameters.

    Computes the performance on each fold and saves all of the out-of-fold
    predictions for easier calibration (Platt scaling / isotonic regression).

    Parameters
    ----------
    train_data (pd.DataFrame):
        A matrix that contains 1 row per observation and 1 column per feature.
    train_output (pd.DataFrame or pd.Series):
        An array-like that contains the outcome of interest as a binary
        indicator.
    n_folds (int):
        The number of cross-validation folds.
    param_grid (OrderedDict):
        An OrderedDict whose keys are the parameters of interest and whose
        values are lists of all parameter settings to be tested. An ordered
        dict is used so that the inner loop can keep track of which parameter
        is being set. In Python 3.7+ this should not be an issue, since plain
        dictionaries preserve insertion order when calling .keys(), but to be
        safe an OrderedDict is required.
    type_dict (Dict):
        A dictionary whose keys are the same as param_grid and whose values
        are either int or float (the built-in Python functions). These are
        used to coerce the parameters downstream, since the expanded grid is
        stored as floats.
    fixed_param_dict (Dict):
        Parameters passed unchanged to every xgboost fit, e.g. `objective`
        and `eval_metric`.
    metric_func_dict (Dict):
        key: the name of the metric as a string.
        value: a function with signature (y_true, y_pred) that computes the
        metric used to select the best cross-validated parameters.
        Default is sklearn.metrics.roc_auc_score.
    other_metrics_dict (Dict):
        A dictionary with the same structure as `metric_func_dict`. These
        metrics are recorded but not used to determine the best parameters.
    keep_data (bool):
        If True, refit the best model on each fold and keep the out-of-fold
        predictions for calibration.
    **kwargs:
        Additional single-valued arguments passed to `xgb.train` on every
        fit, e.g. `num_boost_round` and `early_stopping_rounds`.

    Returns
    -------
    A tuple consisting of:
        results_dict
        best_settings
        final_model_uncalibrated
        keep_dict
    """
    # Set up indices to keep track of training and validation folds
    indices = np.arange(0, train_data.shape[0])
    indices = np.random.permutation(indices)
    indices_list = np.array_split(indices, n_folds)
    # Build up a dict to save results: one entry per (fold, setting) pair
    results_dict = {}
    for item in param_grid:
        results_dict[item] = []
    results_dict['best_iteration'] = []
    for key in metric_func_dict:
        results_dict[key] = []
    if other_metrics_dict:
        for key in other_metrics_dict:
            results_dict[key] = []
    # Initialize the data to keep (None when keep_data is False, so the
    # final return statement never references an undefined name)
    keep_dict = {'true': [], 'pred': []} if keep_data else None
    # Build up the expanded grid of parameter values
    expanded_grid = np.array(np.meshgrid(*param_grid.values())).T.reshape(-1, len(param_grid))
| for ind, fold in enumerate(indices_list): | |
| validation_fold = train_data.iloc[fold, :] | |
| training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind]) | |
| training_fold = train_data.iloc[training_indices, :] | |
| validation_fold_output = train_output.iloc[fold] | |
| train_fold_output = train_output.iloc[training_indices] | |
| ## Train the model with parameters | |
| ## For each fold, fit all of the models with all parameter settings | |
| ## Store the results in another dictionary with the same keys | |
| ## as the result | |
| for setting in range(0, expanded_grid.shape[0]): | |
| # Create the current setting parameter dict | |
| current_parameter_dict = {} | |
| for index, (key, value) in enumerate(param_grid.items()): | |
| current_parameter_dict[key] = type_dict[key](expanded_grid[setting][index]) | |
| results_dict[key].append(current_parameter_dict[key]) | |
| current_parameter_dict.update(fixed_param_dict) | |
| X_train = xgb.DMatrix(training_fold, label = train_fold_output) | |
| X_test = xgb.DMatrix(validation_fold, label = validation_fold_output) | |
| temp_model = xgb.train(current_parameter_dict, X_train, evals = [(X_train, 'train'), (X_test, 'test')], **kwargs) | |
| # Now that the model is fit, evaluate the metric | |
| temp_pred = temp_model.predict(X_test) | |
| # Compute the metric of interest: Default is AUC | |
| # Append result | |
| for key in metric_func_dict: | |
| fold_result = metric_func_dict[key](validation_fold_output, temp_pred) | |
| results_dict[key].append(fold_result) | |
| if other_metrics_dict: | |
| for key, func in other_metrics_dict.items(): | |
| results_dict[key].append(func(validation_fold_output, temp_pred)) | |
| # Append best_iteration | |
| results_dict['best_iteration'].append(temp_model.best_iteration) | |
    def _find_best_settings(_result_dict, _param_dict):
        """
        Find the best hyperparameter settings recorded in results_dict: take
        the highest value of the metric in metric_func_dict and return the
        elements of param_grid that correspond to that value. (Note: if
        metric_func_dict contains more than one metric, only the last key
        iterated over is used to pick the best settings.)
        """
        for key in metric_func_dict:
            max_index = _result_dict[key].index(max(_result_dict[key]))
        final_setting_dict = {}
        for key in _param_dict:
            final_setting_dict[key] = _result_dict[key][max_index]
        # Add the best iteration (available when early_stopping_rounds is provided)
        final_setting_dict['best_iteration'] = _result_dict['best_iteration'][max_index]
        return final_setting_dict

    best_settings = _find_best_settings(results_dict, param_grid)
    best_settings.update(fixed_param_dict)
    number_boost_rounds = best_settings.pop('best_iteration')
    if keep_data:
        for ind, fold in enumerate(indices_list):
            validation_fold = train_data.iloc[fold, :]
            training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind])
            training_fold = train_data.iloc[training_indices, :]
            validation_fold_output = train_output.iloc[fold]
            train_fold_output = train_output.iloc[training_indices]
            X_train = xgb.DMatrix(training_fold, label=train_fold_output)
            X_test = xgb.DMatrix(validation_fold, label=validation_fold_output)
            temp_model = xgb.train(best_settings, X_train, num_boost_round=number_boost_rounds, early_stopping_rounds=None)
            # Now that the model is fit, keep the out-of-fold predictions
            temp_pred = temp_model.predict(X_test)
            keep_dict['true'] += list(validation_fold_output.values)
            keep_dict['pred'] += list(temp_pred)
    # Retrain model with best settings ======================
    train_df = xgb.DMatrix(train_data, label=train_output)
    final_model_uncalibrated = xgb.train(best_settings, train_df, num_boost_round=number_boost_rounds, early_stopping_rounds=None)
    return results_dict, best_settings, final_model_uncalibrated, keep_dict
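
A minimal usage sketch. The data, column names, grid values, and fold count below are illustrative assumptions, not part of the original gist; they only show how the parameters fit together.

from collections import OrderedDict
import pandas as pd
import sklearn.metrics

# Hypothetical toy data: two features and a binary outcome
X = pd.DataFrame({'age': [23, 45, 31, 52, 38, 61, 27, 49],
                  'income': [40, 85, 60, 120, 75, 95, 33, 110]})
y = pd.Series([0, 1, 0, 1, 0, 1, 0, 1])

# Grid of hyperparameters to search, plus the types used to coerce them
param_grid = OrderedDict([('max_depth', [2, 3]), ('eta', [0.1, 0.3])])
type_dict = {'max_depth': int, 'eta': float}

results_dict, best_settings, final_model, keep_dict = cross_validate_xgboost(
    X, y,
    n_folds=2,
    param_grid=param_grid,
    type_dict=type_dict,
    metric_func_dict={'auc': sklearn.metrics.roc_auc_score},
    num_boost_round=50,           # passed through **kwargs to xgb.train
    early_stopping_rounds=5,      # needed so best_iteration is defined
)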
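
The out-of-fold predictions in keep_dict can then feed the calibration step mentioned in the docstring. Below is a minimal isotonic-regression sketch; the variable names continue the toy example above, and sklearn's IsotonicRegression is just one possible calibrator (Platt scaling via a logistic fit would work the same way).

from sklearn.isotonic import IsotonicRegression

# Fit a calibrator on the out-of-fold predictions collected by keep_dict
calibrator = IsotonicRegression(out_of_bounds='clip')
calibrator.fit(keep_dict['pred'], keep_dict['true'])

# Calibrated probabilities for new data: score with the uncalibrated model,
# then map its raw scores through the calibrator
new_dmatrix = xgb.DMatrix(X)              # reusing the toy X above for illustration
raw_scores = final_model.predict(new_dmatrix)
calibrated_probs = calibrator.predict(raw_scores)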