Set up pandas


In [32]:
import pandas as pd

Make a function to read data


In [33]:
def read_and_split(file_name, response_col='response', n_features=10):
    """
    Read a csv file and split it into a response vector and a feature matrix.

    Input:
        file_name:    filename containing csv data (handed unchanged to
                      pandas.read_csv)
        response_col: name of the column holding the classes
                      (default 'response')
        n_features:   number of leading columns kept as independent
                      variables (default 10)
    Output: vector y of classes (presumably binary - confirm against the
            data), matrix X of independent variables
    Raises: KeyError if `response_col` is not a column of the csv data
    """
    data = pd.read_csv(file_name)
    y = data[response_col]
    # Features are picked by position, not by name.
    # NOTE(review): if the response column is among the first `n_features`
    # columns it is included in X as well - confirm the file layout.
    X = data.iloc[:, :n_features]
    return y, X

Read in training/test data


In [34]:
# Training split: response vector and feature matrix from train.csv
y_train, X_train = read_and_split(file_name='train.csv')
# Test split: same layout, read from test.csv
y_test, X_test = read_and_split(file_name='test.csv')

Make a benchmark function


In [35]:
# benchmark() below reads X_train, y_train, X_test and y_test as global variables.
# `result` is one more global: it stores the modeling results, keyed by model
# name, with one inner dict of measurements per model (created on first access).

from collections import defaultdict
result = defaultdict(dict)

In [36]:
def benchmark(estimator,name):
    """
    Fit `estimator` on the training data, score it on the test data and
    record the timings and the AUC under result[name].

    Input:  estimator - object exposing fit(X, y) and predict_proba(X)
            name      - key under which the measurements are stored
    Output: None; fills result[name]['train_time'], result[name]['test_time']
            (both in seconds) and result[name]['AUC']

    Reads the globals X_train, y_train, X_test, y_test and writes to the
    global `result`.
    """
    import time
    from sklearn.metrics import roc_auc_score

    # time.clock() was removed in Python 3.8. Prefer perf_counter() (elapsed
    # wall-clock time) and only fall back to clock() on interpreters that
    # do not provide it; `or` keeps time.clock from being looked up otherwise.
    timer = getattr(time, 'perf_counter', None) or time.clock

    start = timer()
    estimator.fit(X_train, y_train)
    result[name]['train_time'] = timer() - start

    start = timer()
    prob = estimator.predict_proba(X_test)
    result[name]['test_time'] = timer() - start

    # Column 1 of predict_proba is used as the score; presumably the
    # probability of the positive class - confirm against estimator.classes_.
    result[name]['AUC'] = roc_auc_score(y_test, prob[:,1])

Test models


In [37]:
from sklearn.dummy import DummyClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

# Fixed seed so the randomized estimators give the same numbers on every
# Restart & Run All.
SEED = 42

# (label, estimator) pairs, benchmarked in the order listed.
models = [
    ##### baseline
    # uniform random predictions
    ('DummyUniform', DummyClassifier(strategy='uniform', random_state=SEED)),

    ##### linear model
    # NOTE(review): the label is misspelled ("Logisitic"); it is kept as-is
    # because it is the key stored in `result` and shown in the results table.
    ('Logisitic Regression', LogisticRegression(random_state=SEED)),

    ##### ensemble methods
    ('Extra Trees', ExtraTreesClassifier(random_state=SEED)),
    ('AdaBoost', AdaBoostClassifier(random_state=SEED)),
    ('RandomForest', RandomForestClassifier(random_state=SEED)),
    ('GradientBoosting', GradientBoostingClassifier(random_state=SEED)),
]

for model_name, model in models:
    benchmark(model, name=model_name)

Collate results


In [38]:
# `result` maps model name -> {metric: value}; transposing puts one model per
# row and one metric per column.
result_df = pd.DataFrame(result).transpose()
# Cell output: the three metrics, ranked with the highest AUC first.
result_df.loc[:, ['train_time', 'test_time', 'AUC']].sort_values(by='AUC', ascending=False)


Out[38]:
train_time test_time AUC
GradientBoosting 1.58 0.01 0.951147
AdaBoost 0.82 0.01 0.939967
RandomForest 0.43 0.00 0.908641
Extra Trees 2.75 0.00 0.898670
DummyUniform 0.00 0.00 0.500000
Logisitic Regression 0.02 0.33 0.462054