In [1]:
# !conda install -yc conda-forge xgboost
import xgboost as xgb
import sklearn.datasets
import sklearn.metrics
import sklearn.feature_selection
import sklearn.feature_extraction
import sklearn.cross_validation
import sklearn.model_selection
import tqdm



In [14]:
# Display the installed xgboost version so the recorded results can be tied to it.
xgb.__version__

'0.6'

In [15]:
# Load the Boston housing data. Despite its name, `df` is the dict-like object
# returned by load_boston() (accessed by string key), not a DataFrame.
df = sklearn.datasets.load_boston()

# Show which fields are available, then the names of the feature columns.
for field in (df.keys(), df['feature_names']):
    print(field)

['data', 'feature_names', 'DESCR', 'target']
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [3]:
# Pull the feature matrix and the regression target out of the loaded dataset.
X, y = df['data'], df['target']

In [5]:
# Hold out part of the data for testing (train_test_split's default split size).
# The seed is fixed so the split -- and therefore every MSE reported in the
# following cells -- is the same after a kernel restart.
x_tr, x_te, y_tr, y_te = sklearn.model_selection.train_test_split(
    X, y, random_state=42
)

In [6]:
# Peek at the first ten training targets as a quick sanity check on the split.
y_tr[:10]

array([ 33.8,  23.7,  20.5,  12.8,  50. ,  17.4,   8.8,  17.8,  26.4,  18.2])

## One shot learning
Train with all the training data. Only one iteration over the dataset.

In [7]:
# Train one booster on the whole training split with a single xgb.train call,
# then score it on the held-out split.
# NOTE(review): xgboost documents this parameter as 'updater'; the 'update' key
# is kept verbatim here but may be silently ignored -- confirm before relying
# on the 'refresh' behaviour.
one_shot_params = {
    'update':'refresh',
    'process_type': 'update',
    'refresh_leaf': True,
    'silent': False,
}
dtrain_all = xgb.DMatrix(x_tr, y_tr)
one_shot_model = xgb.train(one_shot_params, dtrain=dtrain_all)

dtest_all = xgb.DMatrix(x_te)
y_pr = one_shot_model.predict(dtest_all)

# Last expression of the cell: mean squared error on the test split.
sklearn.metrics.mean_squared_error(y_te, y_pr)

9.7116581444822518

In [8]:
# Show the first ten test-set predictions currently held in `y_pr`.
y_pr[:10]

array([ 13.41692734,  10.1039257 ,  26.06602859,  14.74896526,
        19.46399117,  22.82827187,  21.09622765,  18.83269501,
        27.70256996,  34.56838226], dtype=float32)

## One shot iterative training
Exploit the `xgb_model` parameter of `xgb.train` to iterate over the training data multiple times.

In [12]:
# Call xgb.train repeatedly on the full training split, feeding the booster
# from the previous pass back in through `xgb_model` so training continues
# instead of restarting. On the first pass `xgb_model` is None.
iteration = 5

# NOTE(review): xgboost documents this parameter as 'updater'; the 'update' key
# is kept verbatim but may be silently ignored -- confirm.
itr_params = {
    'update':'refresh',
    'process_type': 'update',
    'refresh_leaf': True,
    'silent': False,
}

# Neither the data nor the parameters change between passes, so build the
# DMatrix objects once instead of on every iteration.
dtrain_full = xgb.DMatrix(x_tr, y_tr)
dtest_full = xgb.DMatrix(x_te)

one_shot_model_itr = None
for i in range(iteration):
    one_shot_model_itr = xgb.train(
        itr_params, dtrain=dtrain_full, xgb_model=one_shot_model_itr
    )
    # Report the test-set error after each additional pass.
    y_pr = one_shot_model_itr.predict(dtest_full)
    print('Iteration {}: {}'.format(i, sklearn.metrics.mean_squared_error(y_te, y_pr)))

Iteration 0: 9.71165814448
Iteration 1: 7.90938546712
Iteration 2: 7.83283545287
Iteration 3: 7.90989805123
Iteration 4: 7.93978549112


So an xgboost model can improve when it is trained over the same data multiple times, although in the run above the test MSE stops improving after the first few iterations.

# Iterative Incremental Learning

In [10]:
# Mini-batch training: split the training data into fixed-size batches and keep
# continuing the same booster (via `xgb_model`) one batch at a time, for several
# passes over all batches. The test-set MSE is printed after every pass.
batch_size = 50
iterations = 25

# NOTE(review): xgboost documents the updater parameter as 'updater'; the
# 'update' key is kept verbatim but may be silently ignored -- confirm.
incremental_params = {
    'learning_rate': 0.007,
    'update':'refresh',
    'process_type': 'update',
    'refresh_leaf': True,
    #'reg_lambda': 3,  # L2
    'reg_alpha': 3,  # L1
    'silent': False,
}

# The batches and the test matrix are identical on every pass, so build them
# once up front instead of once per batch per pass.
dtest = xgb.DMatrix(x_te)
batches = [
    xgb.DMatrix(x_tr[start:start + batch_size], y_tr[start:start + batch_size])
    for start in range(0, len(x_tr), batch_size)
]

model = None
for i in range(iterations):
    for batch in batches:
        model = xgb.train(incremental_params, dtrain=batch, xgb_model=model)

    # Only the prediction made after the last batch of a pass is reported,
    # so predict once per pass rather than after every batch.
    y_pr = model.predict(dtest)
    print('MSE itr@{}: {}'.format(i, sklearn.metrics.mean_squared_error(y_te, y_pr)))

y_pr = model.predict(dtest)
print('MSE at the end: {}'.format(sklearn.metrics.mean_squared_error(y_te, y_pr)))

MSE itr@0: 239.680186067
MSE itr@1: 111.044669451
MSE itr@2: 57.7185741392
MSE itr@3: 35.7994472176
MSE itr@4: 26.2178656072
MSE itr@5: 20.3012679934
MSE itr@6: 17.0486683066
MSE itr@7: 14.9458533528
MSE itr@8: 13.5863551796
MSE itr@9: 12.5722084078
MSE itr@10: 12.0621747382
MSE itr@11: 11.8287598733
MSE itr@12: 11.6878301253
MSE itr@13: 11.4897400114
MSE itr@14: 11.4627225743
MSE itr@15: 11.5417849176
MSE itr@16: 11.4022054245
MSE itr@17: 11.2675483456
MSE itr@18: 11.3866442707
MSE itr@19: 11.3504530668
MSE itr@20: 11.3818182553
MSE itr@21: 11.5099846894
MSE itr@22: 11.5365974758
MSE itr@23: 11.7541341329
MSE itr@24: 11.9677214525
MSE at the end: 11.9677214525


## Conclusion
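MSE drops steeply over the early iterations and then levels off, drifting up slightly in the last few iterations of the run above. Hence, the xgboost model is learning incrementally, though additional passes stop helping once the error has plateaued.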