def cross_validate_xgboost(train_data, train_output, 
                           n_folds, param_grid, 
                           type_dict, 
                           fixed_param_dict = {'objective': 'binary:logistic', 'eval_metric': ['auc']}, 
                           metric_func_dict = {'auc': sklearn.metrics.roc_auc_score}, 
                           other_metrics_dict = None, keep_data = True, **kwargs):

    """ 
    Perform k-fold cross-validation with xgboost hyperparameters
    Get the average performance across folds and save all of the results
    for easier calibration (Platt Scaling/Isotonic Regression)

    Parameters
    ---------------
    train_data (pd.DataFrame or np.array):
        A matrix that contains 1 row per observation and 1 column per feature

    train_output (pd.DataFrame or pd.Series or np.array):
        An array-like that contains the outcome of interest as a binary 
        indicator

    param_grid (OrderedDict): 
        An Ordered Dict where the keys are the parameter of interest,
        and the value is a list containing all possible parameter settings that
        need to be tested. The reason this parameter is an ordered dict is so 
        that the inner loop can keep track of which parameter is being set. In 
        python 3, this should not be an issue, since dictionaries have implicit 
        orderings when calling .keys(), but to be safe, an Ordered Dict is 
        required.

    type_dict (Dict): 
        A dictionary whose keys are the same as param_grid, and the values are
        either int or float (the base python functions). These are used to 
        coerce the parameters downstream

    metric_func_dict (Dict): 
        key: the name of the metric as a string
        value: A function that takes in arguments (y_true, y_pred) and computes some 
        metric to be used to select the best cross-validated parameters.
        Default is sklearn.metrics.roc_auc_score

    other_metrics_dict (Dict):
        A dictionary with the same structure as `metric_func_dict`. These metrics will 
        not be used to determine the best parameters for the model.

    **kwargs: 
        Each key is an argument to the xgboost model that has 
        only 1 value. These values will be passed every time the xgboost model is 
        run. `objective` and `eval_metric` are two parameters that need to be set.
    
    Returns
    --------
    A tuple consisting of:

    results_dict
    best_settings
    final_model_uncalibrated
    keep_dict
    """

    # Set up indices to keep track of training and validation folds
    indices = np.arange(0, train_data.shape[0])
    indices = np.random.permutation(indices)
    indices_list = np.array_split(indices, n_folds)

    # Build up an OrderedDict to save results
    results_dict = {}
    for item in param_grid:
        results_dict[item] = []
    results_dict['best_iteration'] = []
    
    for key in metric_func_dict:
        results_dict[key] = []
    if other_metrics_dict:
        for key in other_metrics_dict:
            results_dict[key] = []
    
    # Initialize the data to keep
    if keep_data:
        keep_dict = {'true': [], 'pred': []}

    # Build up the expanded grid of parameter values
    expanded_grid = np.array(np.meshgrid(*param_grid.values())).T.reshape(-1, len(param_grid))
    
    # Implement Cross-validation
    for ind, fold in enumerate(indices_list):
        validation_fold = train_data.iloc[fold, :]
        training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind])
        training_fold = train_data.iloc[training_indices, :]

        validation_fold_output = train_output.iloc[fold]
        train_fold_output = train_output.iloc[training_indices]

        ## Train the model with parameters
        ## For each fold, fit all of the models with all parameter settings
        ## Store the results in another dictionary with the same keys
        ## as the result

        for setting in range(0, expanded_grid.shape[0]):
            # Create the current setting parameter dict
            current_parameter_dict = {}
            for index, (key, value) in enumerate(param_grid.items()):
                current_parameter_dict[key] = type_dict[key](expanded_grid[setting][index])
                results_dict[key].append(current_parameter_dict[key])
            current_parameter_dict.update(fixed_param_dict)

            X_train = xgb.DMatrix(training_fold, label = train_fold_output)
            X_test = xgb.DMatrix(validation_fold, label = validation_fold_output)

            temp_model = xgb.train(current_parameter_dict, X_train, evals = [(X_train, 'train'), (X_test, 'test')], **kwargs)
            # Now that the model is fit, evaluate the metric
            temp_pred = temp_model.predict(X_test)
            
            # Compute the metric of interest: Default is AUC
            # Append result
            for key in metric_func_dict:
                fold_result = metric_func_dict[key](validation_fold_output, temp_pred)
                results_dict[key].append(fold_result)
            
            if other_metrics_dict:
                for key, func in other_metrics_dict.items():
                    results_dict[key].append(func(validation_fold_output, temp_pred))
            # Append best_iteration
            results_dict['best_iteration'].append(temp_model.best_iteration)
    def _find_best_settings(_result_dict, _param_dict):
        """
        Now, we want to find the best settings of the hyperparameters given by results_dict
        We want the highest value of the metric in metric_func_dict and to return
        the elements of param_grid that correspond to that value.

        """
        for key in metric_func_dict:
            max_index = _result_dict[key].index(max(_result_dict[key]))
        
        final_setting_dict = {}
        for key in _param_dict:
            final_setting_dict[key] = _result_dict[key][max_index]
        # Add the best iteration (with early_stopping_rounds provided)
        final_setting_dict['best_iteration'] = _result_dict['best_iteration'][max_index]
        return final_setting_dict

    best_settings = _find_best_settings(results_dict, param_grid)
    best_settings.update(fixed_param_dict)
    number_boost_rounds = best_settings.pop('best_iteration')

    if keep_data:
        for ind, fold in enumerate(indices_list):
            validation_fold = train_data.iloc[fold, :]
            training_indices = np.concatenate([indices_list[f] for f in range(0, len(indices_list)) if f != ind])
            training_fold = train_data.iloc[training_indices, :]

            validation_fold_output = train_output.iloc[fold]
            train_fold_output = train_output.iloc[training_indices]

            X_train = xgb.DMatrix(training_fold, label = train_fold_output)
            X_test = xgb.DMatrix(validation_fold, label = validation_fold_output)

            temp_model = xgb.train(best_settings, X_train, num_boost_round = number_boost_rounds, early_stopping_rounds = None)
            
            # Now that the model is fit, evaluate the metric
            temp_pred = temp_model.predict(X_test)
            
            keep_dict['true'] += list(validation_fold_output.values)
            keep_dict['pred'] += list(temp_pred)

    # Retrain model with best settings ======================
    train_df = xgb.DMatrix(train_data, label = train_output)

    final_model_uncalibrated = xgb.train(best_settings, train_df, num_boost_round = number_boost_rounds, early_stopping_rounds = None)

    return results_dict, best_settings, final_model_uncalibrated, keep_dict