In [1]:
pwd


Out[1]:
u'/Users/martiom/kaggle'

In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing as pre
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.calibration import CalibratedClassifierCV

In [5]:
clf = RandomForestClassifier()

In [9]:
import xgboost
import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

In [10]:
RandomizedSearchCV?

In [4]:
train = pd.read_csv('train.csv')
train.replace([' ', '', -1], np.nan, inplace = True)

In [25]:
from collections import defaultdict
typed = defaultdict(int)
prefix = defaultdict(int)

for c in train.columns:
    typed[train[c].dtype] += 1
    prefix[c[:5]] += 1
print dict(typed)
print dict(prefix)


typed = defaultdict(int)
prefix = defaultdict(int)
ul = defaultdict(int)

for c in train.columns[54:68]:
    typed[train[c].dtype] += 1
    prefix[c[:5]] += 1
    ul[len(train[c].unique())] += 1
for c in train.columns[109:113]:
    typed[train[c].dtype] += 1
    prefix[c[:5]] += 1
for c in train.columns[169:183]:
    typed[train[c].dtype] += 1
    prefix[c[:5]] += 1
for c in train.columns[244:248]:
    typed[train[c].dtype] += 1
    prefix[c[:5]] += 1
for c in train.columns[294:299]:
    typed[train[c].dtype] += 1
    prefix[c[:5]] += 1

print dict(typed)
print dict(prefix)
print dict(ul)


{dtype('O'): 28, dtype('int64'): 121, dtype('float64'): 150}
{'Prope': 47, 'Quote': 2, 'Cover': 16, 'Sales': 17, 'Field': 7, 'Perso': 83, 'Geogr': 126, 'Origi': 1}
{dtype('O'): 8, dtype('int64'): 18, dtype('float64'): 15}
{'Prope': 4, 'Perso': 18, 'Geogr': 19}
{66: 1, 4: 1, 5: 2, 7: 1, 13: 1, 14: 3, 50: 1, 22: 1, 57: 1, 61: 1, 30: 1}

Load data Extract labels Remove labels and quote number Convert date to datetime


In [10]:
execfile('preprocessing.py')

In [16]:
# keep only good rows
chosen_indices = range(0, 53) + range(68, 109) + range(113, 169) + range(183, 234) + range(248, 294)
train_chosen = train.iloc[:, chosen_indices]
print train_chosen.shape, train.shape


(260753, 247) (260753, 299)

convert date. extract year, month, day, weekday from date and drop date


In [133]:
X, y = convert_data(train, cat2vectors = False, normalize_numeric = False)
print X.shape, y.shape


QuoteNumber skipped
Original_Quote_Date skipped
QuoteConversion_Flag skipped
(260753, 285) (260753,)
  • label encoder vs one hot encoder
  • median calculation
  • data normalization (without median)

In [142]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.1, random_state=888)
#Xvalidation, Xtest, yvalidation, ytest = train_test_split(Xtest, ytest, test_size = 0.7, random_state = 8)
print Xtrain.shape, Xvalidation.shape, Xtest.shape


(234677, 285) (23467, 285) (26076, 285)

In [75]:
print xgb.cv?

In [1]:
def check_xboost(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest, num_boost_root=30):
    dtrain = xgb.DMatrix(Xtrain, ytrain)
    clf = xgb.XGBClassifier(learning_rate=0.02, max_depth = 10, 
                            subsample =0.85, colsample_bytree = 0.66)
    booster_params = clf.get_params()
    booster_params['eval_metric'] = "auc"
    booster_params['eval_set'] = ((Xvalidation, yvalidation), (Xtrain, ytrain))
    v = xgb.cv(booster_params, dtrain, num_boost_round = num_boost_round,
              early_stopping_rounds = 150
              )
    print v

In [140]:
def check_classifier(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest, clf):
    print Xtrain.shape, ytrain.shape, Xvalidation.shape, yvalidation.shape
    clf.fit(Xtrain, ytrain,
           #eval_metric = "auc", 
            #early_stopping_rounds = 150,
            #eval_set=((Xtrain, ytrain), (Xvalidation, yvalidation), ),
            #verbose = False
           )
    print clf.score(Xtrain, ytrain)
    print clf.score(Xtest, ytest)

In [110]:
clf.fit?

In [143]:
%%timeit -n 1 -r 1
dc = lambda: xgb.XGBClassifier(n_estimators=100,learning_rate=0.02, max_depth = 10, subsample =0.85, 
                               colsample_bytree = 0.66)
clf = dc()
check_classifier(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest,clf )
#check_xboost(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest)

clf = dc()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain,  Xvalidation, yvalidation, Xtest, ytest,clfbag )

clf = dc()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain,  Xvalidation, yvalidation, Xtest, ytest,clf_isotonic )


(234677, 285) (234677,) (23467, 285) (23467,)
0.928915061979
0.923531216444
(234677, 285) (234677,) (23467, 285) (23467,)
0.928382414979
0.923109372603
(234677, 285) (234677,) (23467, 285) (23467,)
0.929268739587
0.922649179322
1 loops, best of 1: 1h 24min 30s per loop

In [176]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest,clf_isotonic )


0.917019012282
0.916204866637

In [177]:
clf = RandomForestClassifier(n_estimators=111, n_jobs=-1)
check_classifier(Xtrain, ytrain, Xtest, ytest,clf )

clf = RandomForestClassifier(n_estimators=111, n_jobs=-1)
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest,clfbag )

clf = RandomForestClassifier(n_estimators=111, n_jobs=-1)
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest,clf_isotonic )


0.993024994966
0.899292439263
0.9801200372
0.90690494909
0.998686493897
0.909397710495

In [178]:
clf = RandomForestClassifier(n_estimators=111, n_jobs=-1)
check_classifier(Xtrain, ytrain, Xtest, ytest,clf )

clf = RandomForestClassifier(n_estimators=111, n_jobs=-1)
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest,clfbag )

clf = RandomForestClassifier(n_estimators=111, n_jobs=-1)
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest,clf_isotonic )


1.0
0.910433165232
0.984261895859
0.909551111196
1.0
0.914958485935

In [182]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
check_classifier(Xtrain, ytrain, Xtest, ytest,clf )

clf = GaussianNB()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest,clfbag )

clf = GaussianNB()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest,clf_isotonic )


0.416597156307
0.413510766812
0.40460302394
0.400682633123
0.810346976539
0.812812793619

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier
dc = lambda: KNeighborsClassifier(7)

clf = dc()
check_classifier(Xtrain, ytrain, Xtest, ytest,clf )

clf = dc()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest,clfbag )

clf = dc()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest,clf_isotonic )

In [ ]:


In [ ]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
clf1 = LogisticRegression(random_state=1)
clf1_isotonic = CalibratedClassifierCV(clf1, cv=5, method='isotonic')

clf2 = RandomForestClassifier(n_estimators = 111)
clf2_isotonic = CalibratedClassifierCV(clf2, cv=5, method='isotonic')

clf3 = GaussianNB()
clf3_isotonic = CalibratedClassifierCV(clf3, cv=5, method='isotonic')

eclf = VotingClassifier(estimators=[('lr', clf1_isotonic), 
                                    ('rf', clf2_isotonic), 
                                    ('gnb', clf3_isotonic)], voting='soft')
check_classifier(Xtrain, ytrain, Xtest, ytest,eclf )

In [ ]:
import sklearn
sklearn.__file__