In [32]:
import pandas as pd
In [33]:
def read_and_split(file_name, response_col: str = 'response', n_features: int = 10):
    """
    Read a CSV file and split it into a target vector and a feature matrix.

    Input:
        file_name: filename (or anything ``pd.read_csv`` accepts) containing csv data
        response_col: name of the column holding the binary class labels
        n_features: the first ``n_features`` columns (by position) are used as
            the independent variables
    Output: vector y of binary classes, matrix X of independent variables
    Raises: KeyError if ``response_col`` is not a column of the file
    """
    data = pd.read_csv(file_name)
    y = data[response_col]
    # Features are selected purely by position.
    # NOTE(review): if the response column sits inside the first n_features
    # columns it will also end up in X (target leakage) -- confirm the CSV layout.
    X = data.iloc[:, 0:n_features]
    return y, X
In [34]:
# Load both splits through the same helper; each call yields (target, features).
(y_train, X_train), (y_test, X_test) = (
    read_and_split(csv_name) for csv_name in ('train.csv', 'test.csv')
)
In [35]:
# X_train, y_train, X_test and y_test are all global variables; benchmark() reads
# them directly instead of taking them as arguments.
# `result` is one more global: it stores the modeling results, keyed first by
# model name and then by metric ('train_time', 'test_time', 'AUC').
from collections import defaultdict
# defaultdict(dict) lets benchmark() write result[name][metric] without first
# creating the inner dict for a new model name.
result = defaultdict(dict)
In [36]:
def benchmark(estimator, name):
    """
    Fit ``estimator`` on the training split, score it on the test split and
    record the outcome in the global ``result`` mapping.

    Input:
        estimator: classifier exposing ``fit(X, y)`` and ``predict_proba(X)``
        name: key under which the metrics are stored in ``result``
    Output: None. Side effect: sets ``result[name]['train_time']``,
        ``result[name]['test_time']`` (both in seconds) and ``result[name]['AUC']``.

    Reads the globals X_train, y_train, X_test and y_test.
    """
    # Kept as function-local imports so the cell stays self-contained.
    import time
    from sklearn.metrics import roc_auc_score

    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for timing short durations. Note that it measures elapsed
    # wall-clock time (time.process_time() would give CPU time instead).
    start = time.perf_counter()
    estimator.fit(X_train, y_train)
    result[name]['train_time'] = time.perf_counter() - start

    start = time.perf_counter()
    prob = estimator.predict_proba(X_test)
    result[name]['test_time'] = time.perf_counter() - start

    # Column 1 of predict_proba is used as the score for the AUC.
    # NOTE(review): assumes a binary problem whose positive class is the second
    # entry of estimator.classes_ -- confirm against the data.
    result[name]['AUC'] = roc_auc_score(y_test, prob[:, 1])
In [37]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
# Fixed seed so the randomized estimators produce the same numbers on every
# Restart & Run All.
SEED = 42

##### baseline
# uniform random guessing
# (a 'stratified' baseline, i.e. predicting according to the training class
# distribution, is not benchmarked here)
benchmark(DummyClassifier(strategy='uniform', random_state=SEED), name='DummyUniform')

##### linear model
# Label spelling fixed ('Logisitic' -> 'Logistic'); it is the row name shown in
# the results table.
benchmark(LogisticRegression(random_state=SEED), 'Logistic Regression')

##### ensemble methods
benchmark(ExtraTreesClassifier(random_state=SEED), 'Extra Trees')
benchmark(AdaBoostClassifier(random_state=SEED), 'AdaBoost')
benchmark(RandomForestClassifier(random_state=SEED), 'RandomForest')
benchmark(GradientBoostingClassifier(random_state=SEED), 'GradientBoosting')
In [38]:
# `result` is keyed model -> metric, so transposing gives one row per model
# and one column per metric.
metric_columns = ['train_time', 'test_time', 'AUC']
result_df = pd.DataFrame(data=result).T
# Best-scoring model first; this expression is the cell's displayed output.
result_df.loc[:, metric_columns].sort_values(by='AUC', ascending=False)
Out[38]: