In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import zipfile
import cPickle as pickle
import sklearn
import xgboost as xgb
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV

In [2]:
# Read the training set straight out of the zipped CSV. Both the archive and
# the member stream are closed as soon as the frame has been parsed.
with zipfile.ZipFile('train.csv.zip') as z:
    with z.open('train.csv') as train_csv:
        df = pd.read_csv(train_csv)

In [3]:
# Summary statistics of var3, inspected before cleaning the column.
df.loc[:, 'var3'].describe()


Out[3]:
count     76020.000000
mean      -1523.199277
std       39033.462364
min     -999999.000000
25%           2.000000
50%           2.000000
75%           2.000000
max         238.000000
Name: var3, dtype: float64

In [3]:
# -999999 in var3 looks like a missing-value sentinel (it is the column
# minimum); overwrite it with 2, the value found at every quartile.
df.loc[df['var3'] == -999999, 'var3'] = 2

In [4]:
# Load the pickled list of column names to discard and remove those columns
# from the training frame.
# The file is opened in binary mode: pickle streams are bytes, and text mode
# breaks on Windows (newline translation) and on Python 3.
# NOTE(review): pickle.load can execute arbitrary code -- only load a dump
# that was produced locally / comes from a trusted source.
with open('dropped_features.dump', 'rb') as f:
    dropped_features = pickle.load(f)
df.drop(dropped_features, axis=1, inplace=True)

In [26]:
# Labels come from the TARGET column; the feature matrix is every remaining
# column except ID and TARGET.
y = df['TARGET']
x = df.drop(labels=['ID', 'TARGET'], axis=1)

In [13]:
#selectK = SelectKBest(f_classif, k=100)
#selectK.fit(x, y)
#x_sel = selectK.transform(x)

In [42]:
# The SelectKBest step is commented out, so the "selected" matrix is simply
# the full feature matrix.
x_sel = x

# Hyper-parameters of the submission model, gathered in one place.
sub_clf_params = {
    'missing': np.nan,
    'max_depth': 5,
    'n_estimators': 250,
    'learning_rate': 0.03,
    'nthread': 4,
    'subsample': 0.95,
    'colsample_bytree': 0.85,
    'seed': 4242,
}
sub_clf = xgb.XGBClassifier(**sub_clf_params)

In [35]:
# 4-fold stratified cross-validation of the submission model, scored by ROC AUC.
# The folds are shuffled, so random_state is pinned (same seed as the seeded
# splitters in the grid-search cells) to make the reported score reproducible.
cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=1027)
scores = cross_validation.cross_val_score(sub_clf, x_sel, y, cv=cv, scoring='roc_auc')
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))


Auc: 0.839 (+/- 0.003)

In [43]:
# Hold out 30% of the rows as a validation set. The split is stratified on
# the target, like the other train/validation splits in this notebook, so
# both parts keep the same class proportions.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x_sel, y, test_size=0.3, stratify=y, random_state=1027)
# Fit with early stopping: boosting halts once the validation AUC has not
# improved for 20 rounds.
sub_clf.fit(x_train, y_train, early_stopping_rounds=20, eval_metric="auc",
            eval_set=[(x_test, y_test)], verbose=True)


Out[43]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=250, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.95)

In [34]:
# Build the submission file from the competition test set.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)
# Apply the same pre-processing as on the training frame: replace the
# -999999 sentinel in var3, then drop the discarded columns.
test.replace(to_replace={'var3': {-999999: 2}}, inplace=True)
test.drop(dropped_features, axis=1, inplace=True)
# Use a dedicated name so the validation split held in x_test / y_test is
# not overwritten with rows that have no labels.
x_submit = test.drop(['ID'], axis=1)
y_pred = sub_clf.predict_proba(x_submit)
# Column 1 of predict_proba is the probability of the positive class.
submission = pd.DataFrame({"ID": test.ID, "TARGET": y_pred[:, 1]})
submission.to_csv("submission.csv", index=False)

In [36]:
# Two-candidate grid: only n_estimators varies, every other parameter is
# pinned to a single value.
xgb_params = dict(
    learning_rate=[0.04],
    n_estimators=[350, 200],
    max_depth=[5],
    subsample=[0.95],
    colsample_bytree=[0.85],
    seed=[4242],
)
# Seeded, shuffled 8-fold stratified splitter.
cv = cross_validation.StratifiedKFold(y, n_folds=8, shuffle=True, random_state=1027)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=1,
)

In [38]:
# Run the grid search on the full feature matrix and labels.
grid.fit(X=x_sel, y=y)


Fitting 8 folds for each of 2 candidates, totalling 16 fits
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.844070 - 1.6min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.858414 - 1.5min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.847333 - 1.9min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.817342 - 1.7min
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  6.6min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.820589 - 1.5min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.843786 - 1.6min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.839579 - 1.8min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=350, subsample=0.95, seed=4242, max_depth=5, score=0.838620 - 1.6min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.845221 - 1.2min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.857566 -  56.0s
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.846977 -  49.8s
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.817106 -  52.2s
[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed: 11.6min
[Parallel(n_jobs=1)]: Done  12 tasks       | elapsed: 17.0min
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.820825 -  50.5s
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.844130 -  50.5s
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.839860 -  52.9s
[CV] colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5 
[CV]  colsample_bytree=0.85, learning_rate=0.04, n_estimators=200, subsample=0.95, seed=4242, max_depth=5, score=0.838832 -  50.6s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 20.4min finished
Out[38]:
GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=8, shuffle=True, random_state=1027),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'colsample_bytree': [0.85], 'learning_rate': [0.04], 'n_estimators': [350, 200], 'subsample': [0.95], 'seed': [4242], 'max_depth': [5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=10)

In [39]:
# Display the parameter combination the grid search selected as best.
grid.best_params_


Out[39]:
{'colsample_bytree': 0.85,
 'learning_rate': 0.04,
 'max_depth': 5,
 'n_estimators': 200,
 'seed': 4242,
 'subsample': 0.95}

In [37]:
import sys
class flushfile(object):
    """File-like wrapper that flushes the wrapped stream after every write.

    ``write`` and ``flush`` are implemented here; every other attribute is
    delegated to the wrapped object.
    """

    def __init__(self, f):
        # f: the stream to wrap; it must provide write() and flush().
        self.f = f

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If 'f' itself is
        # missing (instance created without __init__), raise instead of
        # recursing through self.f forever.
        if name == 'f':
            raise AttributeError(name)
        # getattr() honours the wrapped object's own __getattr__, so one
        # proxy can wrap another; object.__getattribute__ would bypass it.
        return getattr(self.f, name)

    def write(self, x):
        # Write-through followed by an immediate flush.
        self.f.write(x)
        self.f.flush()

    def flush(self):
        self.f.flush()
# Replace the interpreter's stdout with the auto-flushing wrapper --
# presumably so the verbose grid-search logs show up promptly (TODO confirm).
# NOTE(review): re-running this cell wraps the already-wrapped stream again.
sys.stdout = flushfile(sys.stdout)

Grid Search #1


In [ ]:
# Coarse sweep: tree depth x number of boosting rounds at a fixed learning
# rate of 0.1 (3 x 3 = 9 candidates).
xgb_params = dict(
    learning_rate=[0.1],
    n_estimators=[100, 500, 1000],
    max_depth=[3, 7, 11],
)
# Stratified splitter with the library's default settings.
cv = cross_validation.StratifiedKFold(y)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=1,
)

In [ ]:
# Run grid search #1 on the full feature matrix and labels.
grid.fit(X=x_sel, y=y)

In [ ]:
# Display the parameter combination grid search #1 selected as best.
grid.best_params_

Grid Search #2


In [ ]:
# Second sweep: smaller learning rate (0.05) with shallower trees and fewer
# boosting rounds than the first sweep (3 x 3 = 9 candidates).
xgb_params = dict(
    learning_rate=[0.05],
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
)
# Stratified splitter with the library's default settings.
cv = cross_validation.StratifiedKFold(y)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=1,
)

In [ ]:
# Run grid search #2 on the full feature matrix and labels.
grid.fit(X=x_sel, y=y)

In [ ]:
# Display the parameter combination grid search #2 selected as best.
grid.best_params_

Grid Search #3


In [ ]:
# Third sweep: adds row and column subsampling to the search space
# (3 x 3 x 4 x 5 = 180 candidates) with a fixed model seed.
xgb_params = dict(
    learning_rate=[0.05],
    n_estimators=[200, 300, 400],
    max_depth=[4, 5, 6],
    subsample=[0.9, 0.925, 0.95, 0.975],
    colsample_bytree=[0.8, 0.825, 0.85, 0.875, 0.9],
    seed=[1126],
)
# Stratified splitter with the library's default settings.
cv = cross_validation.StratifiedKFold(y)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=1,
)

In [ ]:
# Run grid search #3 on the full feature matrix and labels.
grid.fit(X=x_sel, y=y)

In [ ]:
# Keep the estimator that GridSearchCV refit with the winning parameters and
# show those parameters. print is written as a call, matching the other
# print(...) uses in this notebook, so the cell is valid on Python 2 and 3.
best_clf = grid.best_estimator_
print(grid.best_params_)

In [ ]:
# 4-fold stratified cross-validation of the grid-search winner, scored by ROC AUC.
# The folds are shuffled, so random_state is pinned (same seed as the seeded
# splitters in the grid-search cells) to make the reported score reproducible.
cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=1027)
scores = cross_validation.cross_val_score(best_clf, x_sel, y, cv=cv, scoring='roc_auc')
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))

Early Stopping


In [ ]:
# Stratified 70/30 train/validation split used for early stopping.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x_sel,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1125,
)

In [ ]:
# Refit the grid-search winner on the training part, monitoring AUC on the
# validation part and stopping after 25 rounds without improvement.
best_clf.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric="auc",
    early_stopping_rounds=25,
    verbose=True,
)

Manual Tuning and Submission


In [ ]:
# Show the winning grid-search parameters again for reference. print is
# written as a call so the cell is valid on both Python 2 and Python 3.
print(grid.best_params_)

In [ ]:
# Hand-picked model, fitted on the training part with early stopping on the
# validation part (stops after 30 rounds without AUC improvement).
clf = xgb.XGBClassifier(
                max_depth = 4,
                n_estimators=200,
                learning_rate=0.045,
                nthread=4,
                subsample=0.975,
                colsample_bytree=0.875,
                seed=1126
)
clf.fit(x_train, y_train, early_stopping_rounds=30, eval_metric="auc", eval_set=[(x_test, y_test)], verbose=True)

# Build the submission features exactly like the training features. The
# SelectKBest transformer (selectK) is never fitted in this notebook, so the
# test frame gets the same steps the training frame got: var3 sentinel
# replacement, drop of the discarded columns, then removal of ID.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)
test.replace(to_replace={'var3': {-999999: 2}}, inplace=True)
test.drop(dropped_features, axis=1, inplace=True)
sel_test = test.drop(['ID'], axis=1)
# Predict with the model fitted in this cell; it was previously fitted but
# never used, and a different estimator produced the submission.
y_pred = clf.predict_proba(sel_test)
# Column 1 of predict_proba is the probability of the positive class.
submission = pd.DataFrame({"ID":test.ID, "TARGET":y_pred[:,1]})
submission.to_csv("submission.csv", index=False)

Grid Search #4


In [17]:
# Fine-grained sweep at max_depth=4: 5 learning rates x 4 round counts x
# 4 subsample ratios x 3 column ratios = 240 candidates, run on all cores.
xgb_params = dict(
    learning_rate=[0.040, 0.045, 0.050, 0.055, 0.060],
    n_estimators=[175, 200, 225, 250],
    max_depth=[4],
    subsample=[0.975, 0.980, 0.985, 0.990],
    colsample_bytree=[0.8625, 0.875, 0.8875],
    seed=[1027],
)
# Seeded, shuffled 8-fold stratified splitter.
cv = cross_validation.StratifiedKFold(y, n_folds=8, shuffle=True, random_state=1027)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=-1,
)

In [18]:
# Run grid search #4 on the full feature matrix and labels.
grid.fit(X=x_sel, y=y)


Fitting 8 folds for each of 240 candidates, totalling 1920 fits
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 33.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 38.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 42.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 47.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 52.0min
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 58.1min
[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed: 64.7min
[Parallel(n_jobs=-1)]: Done 257 tasks      | elapsed: 72.0min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 77.1min
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed: 83.2min
[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed: 90.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 98.2min
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed: 106.8min
[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed: 113.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 120.6min
[Parallel(n_jobs=-1)]: Done 473 tasks      | elapsed: 129.5min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed: 139.6min
[Parallel(n_jobs=-1)]: Done 537 tasks      | elapsed: 147.6min
[Parallel(n_jobs=-1)]: Done 570 tasks      | elapsed: 155.7min
[Parallel(n_jobs=-1)]: Done 605 tasks      | elapsed: 165.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 177.0min
[Parallel(n_jobs=-1)]: Done 677 tasks      | elapsed: 185.8min
[Parallel(n_jobs=-1)]: Done 714 tasks      | elapsed: 195.7min
[Parallel(n_jobs=-1)]: Done 753 tasks      | elapsed: 207.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 218.0min
[Parallel(n_jobs=-1)]: Done 833 tasks      | elapsed: 228.6min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed: 240.7min
[Parallel(n_jobs=-1)]: Done 917 tasks      | elapsed: 252.8min
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed: 263.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 277.5min
[Parallel(n_jobs=-1)]: Done 1050 tasks      | elapsed: 289.1min
[Parallel(n_jobs=-1)]: Done 1097 tasks      | elapsed: 301.6min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 315.9min
[Parallel(n_jobs=-1)]: Done 1193 tasks      | elapsed: 328.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 341.9min
[Parallel(n_jobs=-1)]: Done 1293 tasks      | elapsed: 356.9min
[Parallel(n_jobs=-1)]: Done 1344 tasks      | elapsed: 369.9min
[Parallel(n_jobs=-1)]: Done 1397 tasks      | elapsed: 386.6min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 399.7min
[Parallel(n_jobs=-1)]: Done 1505 tasks      | elapsed: 415.5min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed: 431.0min
[Parallel(n_jobs=-1)]: Done 1617 tasks      | elapsed: 446.6min
[Parallel(n_jobs=-1)]: Done 1674 tasks      | elapsed: 463.7min
[Parallel(n_jobs=-1)]: Done 1733 tasks      | elapsed: 478.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 497.1min
[Parallel(n_jobs=-1)]: Done 1853 tasks      | elapsed: 512.1min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 532.3min finished
Out[18]:
GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=8, shuffle=True, random_state=1027),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'colsample_bytree': [0.8625, 0.875, 0.8875], 'learning_rate': [0.04, 0.045, 0.05, 0.055, 0.06], 'n_estimators': [175, 200, 225, 250], 'subsample': [0.975, 0.98, 0.985, 0.99], 'seed': [1027], 'max_depth': [4]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=10)

In [19]:
# Display the parameter combination grid search #4 selected as best.
grid.best_params_


Out[19]:
{'colsample_bytree': 0.875,
 'learning_rate': 0.045,
 'max_depth': 4,
 'n_estimators': 250,
 'seed': 1027,
 'subsample': 0.985}

Final Model and Submission


In [40]:
# Stratified 70/30 train/validation split for checking the final model.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x_sel,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1126,
)

In [41]:
# Hand-tuned submission model: learning rate, seed, subsample and column
# ratios match the grid-search #4 winner, while max_depth (5) and
# n_estimators (350) are set manually.
sub_clf_params = {
    'max_depth': 5,
    'n_estimators': 350,
    'learning_rate': 0.045,
    'nthread': 4,
    'subsample': 0.985,
    'colsample_bytree': 0.875,
    'seed': 1027,
}
sub_clf = xgb.XGBClassifier(**sub_clf_params)

In [42]:
# 4-fold stratified cross-validation on the training part only, scored by ROC AUC.
# The folds are shuffled, so random_state is pinned (same seed as the seeded
# splitters in the grid-search cells) to make the reported score reproducible.
cv = cross_validation.StratifiedKFold(y_train, n_folds=4, shuffle=True, random_state=1027)
scores = cross_validation.cross_val_score(sub_clf, x_train, y_train, cv=cv, scoring='roc_auc')
print("Auc: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))


Auc: 0.836 (+/- 0.005)

In [44]:
# Train the final model on all labelled rows.
sub_clf.fit(x_sel, y, verbose=True)

# Build the submission features exactly like the training features. The
# SelectKBest transformer (selectK) is never fitted in this notebook, so the
# test frame gets the same steps the training frame got: var3 sentinel
# replacement, drop of the discarded columns, then removal of ID.
with zipfile.ZipFile('test.csv.zip') as test_zip:
    with test_zip.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)
test.replace(to_replace={'var3': {-999999: 2}}, inplace=True)
test.drop(dropped_features, axis=1, inplace=True)
sel_test = test.drop(['ID'], axis=1)
y_pred = sub_clf.predict_proba(sel_test)
# Column 1 of predict_proba is the probability of the positive class.
submission = pd.DataFrame({"ID":test.ID, "TARGET":y_pred[:,1]})
submission.to_csv("submission.csv", index=False)

Grid Search #5


In [ ]:
# Single-candidate grid: every parameter is pinned to one value, so this
# search just cross-validates that one configuration.
xgb_params = dict(
    learning_rate=[0.045],
    n_estimators=[250],
    max_depth=[4],
    subsample=[0.985],
    colsample_bytree=[0.875],
    seed=[1027],
)
# Seeded, shuffled 8-fold stratified splitter.
cv = cross_validation.StratifiedKFold(y, n_folds=8, shuffle=True, random_state=1027)
clf = xgb.XGBClassifier()
grid = GridSearchCV(
    estimator=clf,
    param_grid=xgb_params,
    scoring='roc_auc',
    cv=cv,
    verbose=10,
    n_jobs=-1,
)