notebook.community

Edit and run



In [161]:

    
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV as gs
from sklearn import metrics as metrics
from sklearn.preprocessing import StandardScaler



In [162]:

    
data = np.genfromtxt("data_2.csv", skip_header=1, delimiter = ',')



In [163]:

    
data.shape









    Out[163]:





(19668L, 5L)



In [164]:

    
print data[1,:]









    



[ 0.          0.01995469  0.10625038  0.          2.17108361]



In [165]:

    
y=data[:,0]
X=data[:,1::]



In [166]:

    
X = StandardScaler().fit_transform(X)



In [167]:

    
print X.shape
print y.shape









    



(19668L, 4L)
(19668L,)



In [168]:

    
print y









    



[ 0.  0.  1. ...,  0.  0.  0.]



In [169]:

    
print X









    



[[-0.6323869  -0.6536458  -0.03282931 -0.17082653]
 [-0.68942098 -0.26924731 -0.03282931  0.4394025 ]
 [-0.71015893  0.73618884 -0.03282931 -0.1674353 ]
 ..., 
 [-0.69404982  1.31850572 -0.03282931 -0.10803752]
 [ 0.56301405  2.12994268 -0.03282931 -0.1682032 ]
 [-0.69753024  0.40583136 -0.02800921 -0.17082653]]



In [185]:

    
# split X and y into train and test datasets

(X_train, X_test, y_train, y_test) = cv.train_test_split(X, y, test_size=.03)



In [186]:

    
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model.fit(X_train, y_train)









    Out[186]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)



In [187]:

    
#Predicted class label per sample
#X_new = [[ .5,  .5]]
#model.predict(X_new)



In [188]:

    
#Returns the probability of the sample for each class in the model
model.predict_proba(X_test)









    Out[188]:





array([[  9.99999995e-01,   4.68858342e-09],
       [  9.26577235e-01,   7.34227651e-02],
       [  8.53217690e-01,   1.46782310e-01],
       ..., 
       [  9.99927737e-01,   7.22629732e-05],
       [  9.99994606e-01,   5.39445152e-06],
       [  1.00000000e+00,   7.25040449e-30]])



In [189]:

    
model.predict_proba(X_train).shape









    Out[189]:





(19077L, 2L)



In [190]:

    
#mean accuracy on the given test data and labels
model.score(X_test, y_test)









    Out[190]:





0.90186125211505919



In [191]:

    
#Confidence scores per (sample, class) combination.
model.decision_function(X_train)









    Out[191]:





array([-11.50496716, -13.12875648,  -8.93719379, ..., -12.43386602,
       -17.95817063,  -6.82778259])



In [192]:

    
#For each pair of train and test set, a prediction score
cross_val_score(model, X, y)









    Out[192]:





array([ 0.88012811,  0.88956681,  0.88207475])



In [193]:

    
grid = gs(model, {'C': np.logspace(-5, 5, 50)})
grid.fit(X_train, y_train)
grid.best_params_









    Out[193]:





{'C': 910.29817799152272}



In [194]:

    
cv.cross_val_score(grid.best_estimator_, X, y)









    Out[194]:





array([ 0.86014946,  0.91534472,  0.88802441])



In [195]:

    
model.predict(X_test)









    Out[195]:





array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.])



In [196]:

    
y_pred = model.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, y_pred)
print 'Recall: ', metrics.recall_score(y_test, y_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, y_pred)
print 'F1', metrics.f1_score(y_test, y_pred, average='binary')









    



Precision:  0.76
Recall:  0.267605633803
Accuracy 0.901861252115
F1 0.395833333333



In [199]:

    
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)









    Out[199]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [200]:

    
yrfc_pred = rfc.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, yrfc_pred)
print 'Recall: ', metrics.recall_score(y_test, yrfc_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, yrfc_pred)
print 'F1', metrics.f1_score(y_test, yrfc_pred, average='binary')









    



Precision:  0.68115942029
Recall:  0.661971830986
Accuracy 0.922165820643
F1 0.671428571429



In [ ]: