In [20]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV as gs
from sklearn import metrics as metrics
In [2]:
# Read the comma-separated file into a NumPy array, skipping its first (header) line.
data = np.genfromtxt(
    "qry_students_per_moduleNOTEXT.csv",
    delimiter=',',
    skip_header=1,
)
In [3]:
# Dimensions of the loaded array, shown as the cell output.
np.shape(data)
Out[3]:
In [4]:
# Show the row at index 1 (the second row of the array) as a quick sanity check.
print(data[1])
In [5]:
# Features are columns 2 through 8 (the slice end, 9, is exclusive);
# the value to predict is column 10.
X = data[:, 2:9]
y = data[:, 10]
In [6]:
# Split X and y into train and test datasets, holding out 5% of the rows for testing.
# random_state fixes the shuffle so a fresh Restart & Run All reproduces the same
# partition, and therefore the same fitted models and scores in the cells below.
(X_train, X_test, y_train, y_test) = cv.train_test_split(
    X, y, test_size=.05, random_state=42
)
In [7]:
# Create a logistic regression with default settings and train it on the
# training split; the fitted estimator is the cell output.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)
Out[7]:
In [8]:
# Predict the class label for one hand-written sample
# (seven values, one for each of the seven feature columns selected into X).
X_new = [
    [100, 40000, 0.034, 70, 0, 0, 1],
]
model.predict(X=X_new)
Out[8]:
In [9]:
# Per-class probabilities for every sample in the held-out test split.
model.predict_proba(X=X_test)
Out[9]:
In [10]:
# Shape of the probability matrix computed for the training split.
np.shape(model.predict_proba(X_train))
Out[10]:
In [11]:
# Mean accuracy of the fitted model on the held-out test samples and labels.
model.score(X=X_test, y=y_test)
Out[11]:
In [12]:
# Confidence scores the fitted model assigns to the training samples.
model.decision_function(X=X_train)
Out[12]:
In [13]:
# Cross-validate the model on the full dataset: one score per train/test fold,
# using the default folds and scorer.
cross_val_score(estimator=model, X=X, y=y)
Out[13]:
In [15]:
# Grid-search the model's C parameter over 50 log-spaced values from 1e-5 to 1e5,
# fitting on the training split only, then show the best setting found.
grid = gs(
    estimator=model,
    param_grid={'C': np.logspace(-5, 5, num=50)},
)
grid.fit(X=X_train, y=y_train)
grid.best_params_
Out[15]:
In [16]:
# Cross-validate the estimator selected by the grid search on the full dataset.
cv.cross_val_score(estimator=grid.best_estimator_, X=X, y=y)
Out[16]:
In [26]:
y_pred = model.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, y_pred)
print 'Recall: ', metrics.recall_score(y_test, y_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, y_pred)
print 'F1', metrics.f1_score(y_test, y_pred, average='binary')
In [27]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the same training split as a comparison model.
# random_state pins the forest's internal randomness so the fitted model, and the
# metrics computed from it, are identical on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
Out[27]:
In [29]:
yrfc_pred = rfc.predict(X_test)
print 'Precision: ', metrics.precision_score(y_test, yrfc_pred)
print 'Recall: ', metrics.recall_score(y_test, y_pred, average='binary')
print 'Accuracy', metrics.accuracy_score(y_test, y_pred)
print 'F1', metrics.f1_score(y_test, y_pred, average='binary')
In [ ]: