In [1]:
pwd
Out[1]:
In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing as pre
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.calibration import CalibratedClassifierCV
In [5]:
# Default-configured random forest bound to the scratch name `clf`.
# Later cells in this notebook rebind `clf` to other estimators.
clf = RandomForestClassifier()
In [9]:
import xgboost
import numpy as np
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
In [10]:
# Interactive docstring lookup (IPython `?` syntax) for RandomizedSearchCV.
# It produces no data and can be removed before sharing the notebook.
RandomizedSearchCV?
In [4]:
# Read the raw training table from 'train.csv' and mark the placeholder values
# (a single space, the empty string and -1) as missing in every column.
# The replacement is chained so `train` is bound once, already cleaned.
train = pd.read_csv('train.csv').replace([' ', '', -1], np.nan)
In [25]:
from collections import defaultdict


def _census(column_names):
    """Count dtypes and 5-character name prefixes over the given `train` columns.

    Returns a pair of defaultdict(int): (dtype -> count, prefix -> count).
    """
    dtype_counts = defaultdict(int)
    prefix_counts = defaultdict(int)
    for name in column_names:
        dtype_counts[train[name].dtype] += 1
        prefix_counts[name[:5]] += 1
    return dtype_counts, prefix_counts


# Pass 1: census over every column of the frame.
typed, prefix = _census(train.columns)
print(dict(typed))
print(dict(prefix))

# Pass 2: the same census restricted to five positional column windows
# (half-open [start, stop) ranges, visited in this order).
windows = [(54, 68), (109, 113), (169, 183), (244, 248), (294, 299)]
windowed_names = []
for start, stop in windows:
    windowed_names.extend(train.columns[start:stop])
typed, prefix = _census(windowed_names)

# Histogram of distinct-value counts (NaN counts as a value). As before, this
# is tallied for the first window only, not for the other four.
ul = defaultdict(int)
for name in train.columns[54:68]:
    ul[len(train[name].unique())] += 1

print(dict(typed))
print(dict(prefix))
print(dict(ul))
Preprocessing steps:

- Load the data.
- Extract the labels.
- Remove the labels and the quote number from the features.
- Convert the date column to datetime.
In [10]:
# Run preprocessing.py inside the notebook namespace so the names it defines
# become available here. `execfile` exists only in Python 2.
# NOTE(review): presumably this is where `convert_data` (used below) comes
# from -- confirm against preprocessing.py.
execfile('preprocessing.py')
In [16]:
# Keep only the selected *columns* (all rows are retained): five half-open
# positional ranges stitched together into one integer index array.
# NOTE(review): columns 53 and 234-243 are excluded here but are not part of
# the windows profiled in the census cell above -- confirm the boundaries.
chosen_indices = np.r_[0:53, 68:109, 113:169, 183:234, 248:294]
train_chosen = train.iloc[:, chosen_indices]
print("%s %s" % (train_chosen.shape, train.shape))
Convert the date column: extract the year, month, day and weekday from it, then drop the original date column.
In [133]:
# Build the feature matrix and label vector with both optional transforms
# (categorical vectorisation, numeric normalisation) switched off.
# NOTE(review): this consumes the full `train` frame, not the column-reduced
# `train_chosen` built in the previous cell -- confirm that is intended.
X, y = convert_data(train, normalize_numeric=False, cat2vectors=False)
print("%s %s" % (X.shape, y.shape))
In [142]:
# Stage 1: hold out 10% of the rows; the remaining 90% is the training set.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.1, random_state=888)
# Stage 2: split the hold-out into a validation part (30%) and a final test
# part (70%). This line used to be commented out while `Xvalidation` was still
# printed below and passed to check_classifier, so the cell only worked with
# stale kernel state. Both seeds are fixed and the cell starts from X, y each
# time, so re-running it is idempotent.
Xvalidation, Xtest, yvalidation, ytest = train_test_split(Xtest, ytest, test_size=0.7, random_state=8)
print("%s %s %s" % (Xtrain.shape, Xvalidation.shape, Xtest.shape))
In [75]:
# Interactive help lookup for xgb.cv via IPython's trailing `?`.
# NOTE(review): the leading `print` looks accidental; `xgb.cv?` alone is the
# usual form. This is a leftover exploration cell and can be removed.
print xgb.cv?
In [1]:
def check_xboost(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest, num_boost_root=30):
    """Cross-validate an XGBoost model on the training data and print the result.

    Parameters
    ----------
    Xtrain, ytrain : training features and labels; wrapped in an xgb.DMatrix.
    Xvalidation, yvalidation, Xtest, ytest : accepted so the signature mirrors
        check_classifier, but unused here: xgb.cv evaluates on its own folds
        of the training data.
    num_boost_root : int, number of boosting rounds handed to xgb.cv. The
        name is a historical typo for "num_boost_round" and is kept so that
        existing keyword callers keep working.

    Returns
    -------
    None. The evaluation history returned by xgb.cv is printed.
    """
    dtrain = xgb.DMatrix(Xtrain, ytrain)
    clf = xgb.XGBClassifier(learning_rate=0.02, max_depth=10,
                            subsample=0.85, colsample_bytree=0.66)
    # Take only the booster-level parameters from the sklearn wrapper.
    booster_params = clf.get_xgb_params()
    booster_params['eval_metric'] = "auc"
    # `eval_set` is an argument of the sklearn-style fit(), not a booster
    # parameter, so it is no longer put into the params handed to xgb.cv.
    # The body previously read the undefined name `num_boost_round`, which
    # raised NameError; it now uses the actual parameter.
    # With the default 30 rounds, the 150-round patience can never trigger.
    v = xgb.cv(booster_params, dtrain, num_boost_round=num_boost_root,
               early_stopping_rounds=150)
    print(v)
In [140]:
def check_classifier(Xtrain, ytrain, Xvalidation, yvalidation, Xtest=None, ytest=None, clf=None):
    """Fit `clf` on the training data and print train and test scores.

    Two positional call forms are accepted, because both are used in this
    notebook:

    * seven arguments:
      (Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest, clf)
    * five arguments (no validation split):
      (Xtrain, ytrain, Xtest, ytest, clf)

    The forms are told apart purely by arity. In the five-argument form the
    classifier arrives in the `Xtest` slot while `ytest` and `clf` are unset.

    Prints, in order: the shapes of the training data (plus the validation
    data when given) on one line, then clf.score on the training data, then
    clf.score on the test data. The validation data is only used for the
    shape printout. Returns None.

    Raises
    ------
    TypeError
        If no classifier was supplied in either call form.
    """
    if clf is None and ytest is None and Xtest is not None:
        # Five-argument form: shift the arguments into their real roles.
        clf = Xtest
        Xtest, ytest = Xvalidation, yvalidation
        Xvalidation = yvalidation = None
    if clf is None:
        raise TypeError("check_classifier() requires a classifier as its last argument")

    shapes = [Xtrain.shape, ytrain.shape]
    if Xvalidation is not None:
        shapes.extend([Xvalidation.shape, yvalidation.shape])
    print(" ".join(str(shape) for shape in shapes))

    # Plain fit. XGBoost-specific options (eval_metric, early_stopping_rounds,
    # eval_set, verbose) are not passed, so any estimator with the
    # scikit-learn fit/score interface can be used.
    clf.fit(Xtrain, ytrain)
    print(clf.score(Xtrain, ytrain))
    print(clf.score(Xtest, ytest))
In [110]:
# Interactive docstring lookup (IPython `?` syntax) for the `fit` method of
# whatever estimator `clf` is currently bound to. Leftover exploration cell.
clf.fit?
In [143]:
%%timeit -n 1 -r 1
dc = lambda: xgb.XGBClassifier(n_estimators=100,learning_rate=0.02, max_depth = 10, subsample =0.85,
colsample_bytree = 0.66)
clf = dc()
check_classifier(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest,clf )
#check_xboost(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest)
clf = dc()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest,clfbag )
clf = dc()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xvalidation, yvalidation, Xtest, ytest,clf_isotonic )
In [176]:
from sklearn.linear_model import LogisticRegression
# Logistic regression wrapped in 5-fold isotonic probability calibration.
# check_classifier is called with five positional arguments here
# (train data, test data, classifier) -- no validation split.
calibration_options = dict(cv=5, method='isotonic')
clf = LogisticRegression()
clf_isotonic = CalibratedClassifierCV(clf, **calibration_options)
check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)
In [177]:
# Random forest in three flavours: plain, 5-member bagging ensemble, and
# 5-fold isotonic calibration. Each flavour starts from its own unfitted
# forest. check_classifier is called with five positional arguments
# (train data, test data, classifier).
def _new_forest():
    """Return an unfitted 111-tree random forest using all CPU cores."""
    return RandomForestClassifier(n_estimators=111, n_jobs=-1)

clf = _new_forest()
check_classifier(Xtrain, ytrain, Xtest, ytest, clf)

clf = _new_forest()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest, clfbag)

clf = _new_forest()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)
In [178]:
# NOTE(review): this cell repeats the preceding random-forest cell verbatim.
# No seed is set on the forests, so the two runs can score differently;
# keep it only if a second, independent run is wanted.
forest_options = dict(n_estimators=111, n_jobs=-1)

clf = RandomForestClassifier(**forest_options)
check_classifier(Xtrain, ytrain, Xtest, ytest, clf)

clf = RandomForestClassifier(**forest_options)
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest, clfbag)

clf = RandomForestClassifier(**forest_options)
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)
In [182]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes in three flavours: plain, 5-member bagging ensemble,
# and 5-fold isotonic calibration. A new GaussianNB is created for each.
# check_classifier is called with five positional arguments
# (train data, test data, classifier).
clf = GaussianNB()
check_classifier(Xtrain, ytrain, Xtest, ytest, clf)

clf = GaussianNB()
clfbag = BaggingClassifier(base_estimator=clf, n_estimators=5) if False else BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest, clfbag)

clf = GaussianNB()
isotonic_options = {'cv': 5, 'method': 'isotonic'}
clf_isotonic = CalibratedClassifierCV(clf, **isotonic_options)
check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier
# 7-nearest-neighbours in three flavours: plain, 5-member bagging ensemble,
# and 5-fold isotonic calibration. check_classifier is called with five
# positional arguments (train data, test data, classifier).
def dc():
    """Return a fresh, unfitted 7-nearest-neighbours classifier."""
    return KNeighborsClassifier(n_neighbors=7)

clf = dc()
check_classifier(Xtrain, ytrain, Xtest, ytest, clf)

clf = dc()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest, clfbag)

clf = dc()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)
In [ ]:
In [ ]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
# Soft-voting ensemble over three isotonic-calibrated base models:
# logistic regression, a 111-tree random forest and Gaussian naive Bayes.
def _calibrated(model):
    """Wrap `model` in 5-fold isotonic probability calibration."""
    return CalibratedClassifierCV(model, cv=5, method='isotonic')

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=111)
clf3 = GaussianNB()
clf1_isotonic, clf2_isotonic, clf3_isotonic = map(_calibrated, (clf1, clf2, clf3))

members = [('lr', clf1_isotonic), ('rf', clf2_isotonic), ('gnb', clf3_isotonic)]
eclf = VotingClassifier(estimators=members, voting='soft')
# Five positional arguments: train data, test data, classifier.
check_classifier(Xtrain, ytrain, Xtest, ytest, eclf)
In [ ]:
# Display the filesystem path of the imported scikit-learn package.
# NOTE(review): presumably used to check which installation the kernel
# picked up -- environment check only, not part of the analysis.
import sklearn
sklearn.__file__