In [20]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv            # pre-0.18 module; sklearn.model_selection in newer releases
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score  # pre-0.18 module path
from sklearn.grid_search import GridSearchCV as gs    # pre-0.18 module path
from sklearn import metrics

In [2]:
# load the module-level student records; fields genfromtxt cannot parse arrive as NaN
data = np.genfromtxt("qry_students_per_moduleNOTEXT.csv", skip_header=1, delimiter=',')

In [3]:
data.shape


Out[3]:
(62592L, 11L)

In [4]:
print data[1,:]


[             nan              nan   3.91928520e+02   4.79800000e+03
   8.16858108e-02   2.10000000e+01   0.00000000e+00   0.00000000e+00
   0.00000000e+00   2.00000000e+00   0.00000000e+00]
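
genfromtxt turns any field it cannot parse into NaN, which is why the first two columns print as nan. A minimal sanity check (assuming the feature and target columns selected in the next cell) that the columns actually used for modelling are fully numeric:

In [ ]:
# sketch: confirm the columns used below contain no NaN values
print 'NaNs in feature columns:', np.isnan(data[:, 2:9]).any()
print 'NaNs in target column:  ', np.isnan(data[:, 10]).any()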

In [5]:
# target label is the last column; the seven feature columns are 2-8
y = data[:, 10]
X = data[:, 2:9]

In [6]:
# split X and y into train and test datasets

(X_train, X_test, y_train, y_test) = cv.train_test_split(X, y, test_size=.05)
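
train_test_split here is unseeded, so the exact split changes between runs; passing random_state would pin it down. A small sketch (assuming the target takes only 0/1 values, as the binary metrics further down suggest) for checking how balanced the two classes are in each split:

In [ ]:
# sketch: class counts in the train/test splits, for context on the accuracy scores below
print 'train class counts:', np.bincount(y_train.astype(int))
print 'test class counts: ', np.bincount(y_test.astype(int))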

In [7]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model.fit(X_train, y_train)


Out[7]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)
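
The fitted model exposes one weight per feature plus an intercept; a minimal sketch for inspecting them (note the features are on very different scales, so coefficient magnitudes are not directly comparable without standardizing):

In [ ]:
# sketch: fitted weights (shape (1, 7)) and intercept of the logistic regression
print 'coefficients:', model.coef_
print 'intercept:   ', model.intercept_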

In [8]:
# predicted class label for a single hand-made example with the same seven features
X_new = [[100, 40000, 0.034, 70, 0, 0, 1]]
model.predict(X_new)


Out[8]:
array([ 0.])

In [9]:
#Returns the probability of the sample for each class in the model
model.predict_proba(X_test)


Out[9]:
array([[ 0.54835697,  0.45164303],
       [ 0.54173843,  0.45826157],
       [ 0.55106329,  0.44893671],
       ..., 
       [ 0.54558176,  0.45441824],
       [ 0.41508689,  0.58491311],
       [ 0.53324762,  0.46675238]])
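
predict is equivalent to cutting these positive-class probabilities at 0.5; a sketch of applying a different threshold (the 0.4 value is purely illustrative) to trade precision against recall:

In [ ]:
# sketch: count predicted positives at the default and at an illustrative lower cut-off
proba_pos = model.predict_proba(X_test)[:, 1]
print 'predicted positives at 0.5:', (proba_pos >= 0.5).sum()
print 'predicted positives at 0.4:', (proba_pos >= 0.4).sum()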

In [10]:
model.predict_proba(X_train).shape


Out[10]:
(59462L, 2L)

In [11]:
#mean accuracy on the given test data and labels
model.score(X_test, y_test)


Out[11]:
0.73035143769968047
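
Accuracy alone hides how the errors are distributed across the two classes; a minimal sketch of the confusion matrix on the same test split:

In [ ]:
# sketch: rows are true classes, columns are predicted classes
print metrics.confusion_matrix(y_test, model.predict(X_test))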

In [12]:
#Confidence scores per (sample, class) combination.
model.decision_function(X_train)


Out[12]:
array([-0.19642636,  0.10597453, -0.00395081, ..., -0.19429536,
       -0.20496776, -0.09952749])

In [13]:
# accuracy score for each cross-validation fold (3 stratified folds by default)
cross_val_score(model, X, y)


Out[13]:
array([ 0.74506327,  0.74539877,  0.65821511])
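
Called like this, cross_val_score uses 3 stratified folds scored by accuracy; a sketch of requesting more folds and a different metric (both choices here are illustrative):

In [ ]:
# sketch: 10-fold cross-validation scored by ROC AUC instead of accuracy
print cross_val_score(model, X, y, cv=10, scoring='roc_auc')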

In [15]:
# grid-search the regularization strength C over a log-spaced range
grid = gs(model, {'C': np.logspace(-5, 5, 50)})
grid.fit(X_train, y_train)
grid.best_params_


Out[15]:
{'C': 4.0949150623804272e-05}
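
The grid object also records the cross-validated score of the winning setting and of every candidate C; a sketch (grid_scores_ is the pre-0.18 attribute, newer versions expose cv_results_ instead):

In [ ]:
# sketch: best cross-validated accuracy and the first few candidate scores
print grid.best_score_
print grid.grid_scores_[:3]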

In [16]:
# cross-validate the best estimator found by the grid search on the full dataset
cv.cross_val_score(grid.best_estimator_, X, y)


Out[16]:
array([ 0.7483704 ,  0.75469709,  0.63952262])

In [26]:
y_pred = model.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, y_pred)
print 'Recall: ', metrics.recall_score(y_test, y_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, y_pred)
print 'F1', metrics.f1_score(y_test, y_pred, average='binary')


Precision:  0.602395209581
Recall:  0.495566502463
Accuracy 0.7303514377
F1 0.543783783784

In [27]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)


Out[27]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
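
The random forest exposes a relative importance per feature column; a minimal sketch:

In [ ]:
# sketch: relative importance of each of the 7 feature columns (values sum to 1)
print rfc.feature_importances_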

In [29]:
yrfc_pred = rfc.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, yrfc_pred)
print 'Recall: ', metrics.recall_score(y_test, yrfc_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, yrfc_pred)
print 'F1', metrics.f1_score(y_test, yrfc_pred, average='binary')


Precision:  0.614430665163
Recall:  0.536945812808
Accuracy 0.740575079872
F1 0.573080967403
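
A threshold-free comparison of the two fitted models on the held-out split; a sketch using ROC AUC on the predicted probabilities:

In [ ]:
# sketch: compare both classifiers with a threshold-independent metric
print 'LogReg ROC AUC:', metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print 'RF ROC AUC:    ', metrics.roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])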
