In [20]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv            # pre-0.18 module; sklearn.model_selection in newer releases
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score  # pre-0.18 module path
from sklearn.grid_search import GridSearchCV as gs    # pre-0.18 module path
from sklearn import metrics

In [2]:
# load the module-level student records; fields genfromtxt cannot parse arrive as NaN
data = np.genfromtxt("qry_students_per_moduleNOTEXT.csv", skip_header=1, delimiter=',')

In [3]:
data.shape


Out[3]:
(62592L, 11L)

In [4]:
print data[1,:]


[             nan              nan   3.91928520e+02   4.79800000e+03
   8.16858108e-02   2.10000000e+01   0.00000000e+00   0.00000000e+00
   0.00000000e+00   2.00000000e+00   0.00000000e+00]
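
genfromtxt turns any field it cannot parse into NaN, which is why the first two columns print as nan. A minimal sanity check (assuming the feature and target columns selected in the next cell) that the columns actually used for modelling are fully numeric:

In [ ]:
# sketch: confirm the columns used below contain no NaN values
print 'NaNs in feature columns:', np.isnan(data[:, 2:9]).any()
print 'NaNs in target column:  ', np.isnan(data[:, 10]).any()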

In [5]:
# target label is the last column; the seven feature columns are 2-8
y = data[:, 10]
X = data[:, 2:9]

In [6]:
# split X and y into train and test datasets

(X_train, X_test, y_train, y_test) = cv.train_test_split(X, y, test_size=.05)
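
train_test_split here is unseeded, so the exact split changes between runs; passing random_state would pin it down. A small sketch (assuming the target takes only 0/1 values, as the binary metrics further down suggest) for checking how balanced the two classes are in each split:

In [ ]:
# sketch: class counts in the train/test splits, for context on the accuracy scores below
print 'train class counts:', np.bincount(y_train.astype(int))
print 'test class counts: ', np.bincount(y_test.astype(int))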

In [7]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model.fit(X_train, y_train)


Out[7]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)
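
The fitted model exposes one weight per feature plus an intercept; a minimal sketch for inspecting them (note the features are on very different scales, so coefficient magnitudes are not directly comparable without standardizing):

In [ ]:
# sketch: fitted weights (shape (1, 7)) and intercept of the logistic regression
print 'coefficients:', model.coef_
print 'intercept:   ', model.intercept_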

In [8]:
# predicted class label for a single hand-made example with the same seven features
X_new = [[100, 40000, 0.034, 70, 0, 0, 1]]
model.predict(X_new)


Out[8]:
array([ 0.])

In [9]:
#Returns the probability of the sample for each class in the model
model.predict_proba(X_test)


Out[9]:
array([[ 0.54835697,  0.45164303],
       [ 0.54173843,  0.45826157],
       [ 0.55106329,  0.44893671],
       ..., 
       [ 0.54558176,  0.45441824],
       [ 0.41508689,  0.58491311],
       [ 0.53324762,  0.46675238]])
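
predict is equivalent to cutting these positive-class probabilities at 0.5; a sketch of applying a different threshold (the 0.4 value is purely illustrative) to trade precision against recall:

In [ ]:
# sketch: count predicted positives at the default and at an illustrative lower cut-off
proba_pos = model.predict_proba(X_test)[:, 1]
print 'predicted positives at 0.5:', (proba_pos >= 0.5).sum()
print 'predicted positives at 0.4:', (proba_pos >= 0.4).sum()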

In [10]:
model.predict_proba(X_train).shape


Out[10]:
(59462L, 2L)

In [11]:
#mean accuracy on the given test data and labels
model.score(X_test, y_test)


Out[11]:
0.73035143769968047
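
Accuracy alone hides how the errors are distributed across the two classes; a minimal sketch of the confusion matrix on the same test split:

In [ ]:
# sketch: rows are true classes, columns are predicted classes
print metrics.confusion_matrix(y_test, model.predict(X_test))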

In [12]:
#Confidence scores per (sample, class) combination.
model.decision_function(X_train)


Out[12]:
array([-0.19642636,  0.10597453, -0.00395081, ..., -0.19429536,
       -0.20496776, -0.09952749])

In [13]:
# accuracy score for each cross-validation fold (3 stratified folds by default)
cross_val_score(model, X, y)


Out[13]:
array([ 0.74506327,  0.74539877,  0.65821511])
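
Called like this, cross_val_score uses 3 stratified folds scored by accuracy; a sketch of requesting more folds and a different metric (both choices here are illustrative):

In [ ]:
# sketch: 10-fold cross-validation scored by ROC AUC instead of accuracy
print cross_val_score(model, X, y, cv=10, scoring='roc_auc')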

In [15]:
# grid-search the regularization strength C over a log-spaced range
grid = gs(model, {'C': np.logspace(-5, 5, 50)})
grid.fit(X_train, y_train)
grid.best_params_


Out[15]:
{'C': 4.0949150623804272e-05}
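
The grid object also records the cross-validated score of the winning setting and of every candidate C; a sketch (grid_scores_ is the pre-0.18 attribute, newer versions expose cv_results_ instead):

In [ ]:
# sketch: best cross-validated accuracy and the first few candidate scores
print grid.best_score_
print grid.grid_scores_[:3]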

In [16]:
# cross-validate the best estimator found by the grid search on the full dataset
cv.cross_val_score(grid.best_estimator_, X, y)


Out[16]:
array([ 0.7483704 ,  0.75469709,  0.63952262])

In [26]:
y_pred = model.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, y_pred)
print 'Recall: ', metrics.recall_score(y_test, y_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, y_pred)
print 'F1', metrics.f1_score(y_test, y_pred, average='binary')


Precision:  0.602395209581
Recall:  0.495566502463
Accuracy 0.7303514377
F1 0.543783783784

In [27]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)


Out[27]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
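
The random forest exposes a relative importance per feature column; a minimal sketch:

In [ ]:
# sketch: relative importance of each of the 7 feature columns (values sum to 1)
print rfc.feature_importances_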

In [29]:
yrfc_pred = rfc.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, yrfc_pred)
print 'Recall: ', metrics.recall_score(y_test, yrfc_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, yrfc_pred)
print 'F1', metrics.f1_score(y_test, yrfc_pred, average='binary')


Precision:  0.614430665163
Recall:  0.536945812808
Accuracy 0.740575079872
F1 0.573080967403
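
A threshold-free comparison of the two fitted models on the held-out split; a sketch using ROC AUC on the predicted probabilities:

In [ ]:
# sketch: compare both classifiers with a threshold-independent metric
print 'LogReg ROC AUC:', metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print 'RF ROC AUC:    ', metrics.roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])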
