In [1]:
    
# Import libraries
import numpy as np
import pandas as pd
    
In [2]:
    
# Read student data
student_data = pd.read_csv("student-data.csv")
print "Student data read successfully!"
# Note: The last column 'passed' is the target/label; all other columns are features
#student_data.head()
    
    
Now, can you find out the following facts about the dataset: the total number of students, the number of feature columns, how many students passed and how many failed, and the graduation rate of the class?
Use the code block below to compute these values. Instructions/steps are marked using TODOs.
In [3]:
    
# TODO: Compute desired values - replace each '?' with an appropriate expression/function call
n_students = student_data.shape[0]          # one row per student
n_features = student_data.shape[1] - 1      # every column except the 'passed' target
n_passed   = student_data.loc[student_data['passed'] == 'yes', 'passed'].count()
n_failed   = student_data.loc[student_data['passed'] == 'no', 'passed'].count()
grad_rate  = (n_passed * 100.0) / (n_passed + n_failed)   # percentage of students who passed
print "Total number of students: {}".format(n_students)
print "Number of students who passed: {}".format(n_passed)
print "Number of students who failed: {}".format(n_failed)
print "Number of features: {}".format(n_features)
print "Graduation rate of the class: {:.2f}%".format(grad_rate)
    
    
In this section, we will prepare the data for modeling, training, and testing.
It is often the case that the data you obtain contains non-numeric features. This can be a problem, since most machine learning algorithms expect numeric input for their computations.
Let's first separate our data into feature and target columns, and see if any features are non-numeric.
Note: For this dataset, the last column ('passed') is the target or label we are trying to predict.
In [4]:
    
# Extract feature (X) and target (y) columns
feature_cols = list(student_data.columns[:-1])  # all columns but last are features
target_col = student_data.columns[-1]  # last column is the target/label
print "Feature column(s):-\n{}".format(feature_cols)
print "Target column: {}".format(target_col)
X_all = student_data[feature_cols]  # feature values for all students
y_all = student_data[target_col]  # corresponding targets/labels
print "\nFeature values:-"
print X_all.head()  # print the first 5 rows
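# Optional check (not in the original cell): list each column's dtype to see
# which features are non-numeric ('object' columns will need conversion)
print X_all.dtypes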
    
    
As you can see, there are several non-numeric columns that need to be converted! Many of them are simply yes/no, e.g. internet. These can be reasonably converted into 1/0 (binary) values.
Other columns, like Mjob and Fjob, have more than two values, and are known as categorical variables. The recommended way to handle such a column is to create as many columns as there are possible values (e.g. Fjob_teacher, Fjob_other, Fjob_services, etc.), and assign 1 to the column matching the original value and 0 to all the others.
These generated columns are sometimes called dummy variables, and we will use the pandas.get_dummies() function to perform this transformation.
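As a quick illustration (a toy example with made-up values, not part of the dataset code), get_dummies turns one categorical column into one indicator column per value:

# Toy example only; the 'Fjob' values below are illustrative
example = pd.Series(['teacher', 'services', 'teacher'], name='Fjob')
print pd.get_dummies(example, prefix='Fjob')
# Produces columns like 'Fjob_services' and 'Fjob_teacher', with a 1 in the
# column matching each row's original value and 0 elsewhere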
In [5]:
    
# Preprocess feature columns
def preprocess_features(X):
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty
    # Check each column
    for col, col_data in X.iteritems():
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
        # Note: This should change the data type for yes/no columns to int
        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'
        outX = outX.join(col_data)  # collect column(s) in output dataframe
    return outX
X_all = preprocess_features(X_all)
print "Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns))
    
    
In [6]:
    
# First, decide how many training vs test samples you want
num_all = student_data.shape[0]  # same as len(student_data)
num_train = 300  # about 75% of the data
num_test = num_all - num_train
# TODO: Then, select features (X) and corresponding labels (y) for the training and test sets
# Note: Shuffle the data or randomly select samples to avoid any bias due to ordering in the dataset
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_all, y_all, train_size=num_train, test_size=num_test, random_state=100)
print "Training set: {} samples".format(X_train.shape[0])
print "Test set: {} samples".format(X_test.shape[0])
# Note: If you need a validation set, extract it from within training data
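# Optional sketch (not part of the original cell): a validation set could be
# carved out of the training data with a further split; the names X_tr, X_val,
# y_tr, y_val below are illustrative only.
# X_tr, X_val, y_tr, y_val = cross_validation.train_test_split(
#     X_train, y_train, test_size=0.2, random_state=100)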
    
    
Choose 3 supervised learning models that are available in scikit-learn and appropriate for this problem. For each model, produce a table showing training time, prediction time, F1 score on the training set, and F1 score on the test set, for each training set size.
Note: You need to produce 3 such tables, one for each model.
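A minimal sketch (not part of the original notebook) of one way to collect those numbers into such a table: record each run as a dict and build a pandas DataFrame from the list. The evaluate helper and the rows name below are illustrative only.

import time
from sklearn.metrics import f1_score

def evaluate(clf, X_tr, y_tr, X_te, y_te):
    # Time the fit, time the predictions, and compute F1 on both sets
    start = time.time()
    clf.fit(X_tr, y_tr)
    train_time = time.time() - start
    start = time.time()
    pred_tr, pred_te = clf.predict(X_tr), clf.predict(X_te)
    pred_time = time.time() - start
    return {'model': clf.__class__.__name__, 'size': len(X_tr),
            'train_time': train_time, 'pred_time': pred_time,
            'f1_train': f1_score(y_tr, pred_tr, pos_label='yes'),
            'f1_test': f1_score(y_te, pred_te, pos_label='yes')}

# Once a classifier clf is instantiated (see the cells below), one table could be built with:
# rows = [evaluate(clf, X_train[:n], y_train[:n], X_test, y_test) for n in (100, 200, 300)]
# print pd.DataFrame(rows)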
In [7]:
    
# Train a model
import time
def train_classifier(clf, X_train, y_train):
    print "Training {}...".format(clf.__class__.__name__)
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print "Done!\nTraining time (secs): {:.3f}".format(end - start)
    
In [8]:
    
# Predict on training set and compute F1 score
from sklearn.metrics import f1_score
def predict_labels(clf, features, target):
    print "Predicting labels using {}...".format(clf.__class__.__name__)
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print "Done!\nPrediction time (secs): {:.3f}".format(end - start)
    # Treat 'yes' (student passed) as the positive class when computing F1
    return f1_score(target.values, y_pred, pos_label='yes')
    
In [9]:
    
# Train and predict using different training set sizes
def train_predict(clf, X_train, y_train, X_test, y_test):
    print "------------------------------------------"
    print "Training set size: {}".format(len(X_train))
    train_classifier(clf, X_train, y_train)
    print "F1 score for training set: {}".format(predict_labels(clf, X_train, y_train))
    print "F1 score for test set: {}".format(predict_labels(clf, X_test, y_test))
    
In [10]:
    
# TODO: Choose a model, import it and instantiate an object
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
    
In [11]:
    
clf_NB = GaussianNB()                                 # Gaussian Naive Bayes
clf_DT = DecisionTreeClassifier(min_samples_split=5)  # require at least 5 samples to split a node
clf_SV = svm.SVC()                                    # Support Vector Classifier (RBF kernel by default)
    
In [12]:
    
# Evaluate each classifier on increasing training set sizes; every call to
# train_predict re-fits the model from scratch on the given subset
for clf in [clf_NB, clf_DT, clf_SV]:
    for size in [100, 200, 300]:
        train_predict(clf, X_train[:size], y_train[:size], X_test, y_test)
    
    
In [13]:
    
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import f1_score, make_scorer

# Score candidate parameter settings by F1, with 'yes' as the positive class
f1_scorer = make_scorer(f1_score, pos_label="yes")

# Parameter grid for the SVC
parameters = {'gamma': [1e-2, 1e-4, 1e-6],
              'C': [10, 100, 200, 300, 500, 700],
              'kernel': ['rbf', 'sigmoid', 'poly']}

# 10 stratified random splits of the training data, holding out 10% each time
ssscv = StratifiedShuffleSplit(y_train, n_iter=10, test_size=0.1)
grid = GridSearchCV(svm.SVC(), parameters, cv=ssscv, scoring=f1_scorer)
grid.fit(X_train, y_train)

# Evaluate the best estimator found by the grid search on the held-out test set
best = grid.best_estimator_
y_pred = best.predict(X_test)
print "F1 score: {}".format(f1_score(y_test, y_pred, pos_label='yes'))
print "Best params: {}".format(grid.best_params_)