In [1]:
# Import libraries
import numpy as np
import pandas as pd
In [2]:
# Read student data
student_data = pd.read_csv("student-data.csv")
print "Student data read successfully!"
# Note: The last column 'passed' is the target/label; all others are feature columns
Now, can you find out the following facts about the dataset?
Use the code block below to compute these values. Instructions/steps are marked using TODOs.
In [3]:
# TODO: Compute desired values - replace each '?' with an appropriate expression/function call
n_students = student_data.shape[0]
n_features = student_data.shape[1]-1
y_df = student_data['passed']
n_passed = y_df[y_df=='yes'].shape[0]
n_failed = n_students - n_passed
grad_rate = 100.0 * n_passed / n_students
print "Total number of students: {}".format(n_students)
print "Number of students who passed: {}".format(n_passed)
print "Number of students who failed: {}".format(n_failed)
print "Number of features: {}".format(n_features)
print "Graduation rate of the class: {:.2f}%".format(grad_rate)
In this section, we will prepare the data for modeling, training and testing.
It is often the case that the data you obtain contains non-numeric features. This can be a problem, as most machine learning algorithms expect numeric data to perform computations with.
Let's first separate our data into feature and target columns, and see if any features are non-numeric.
Note: For this dataset, the last column ('passed') is the target or label we are trying to predict.
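Before running the cell below, it can help to confirm which columns pandas actually parsed as non-numeric. A minimal check (a sketch, separate from the graded cells; it assumes the student_data DataFrame loaded above) is to filter on each column's dtype:

# Sketch: list columns pandas parsed as strings (dtype 'object')
non_numeric_cols = student_data.columns[student_data.dtypes == object]
print "Non-numeric columns: {}".format(list(non_numeric_cols))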
In [4]:
# %%capture
# Extract feature (X) and target (y) columns
feature_cols = list(student_data.columns[:-1]) # all columns but last are features
target_col = student_data.columns[-1] # last column is the target/label
print "Feature column(s):-\n{}".format(feature_cols)
print "Target column: {}".format(target_col)
X_all = student_data[feature_cols] # feature values for all students
y_all = student_data[target_col] # corresponding targets/labels
print "\nFeature values:-"
print X_all.head() # print the first 5 rows
As you can see, there are several non-numeric columns that need to be converted! Many of them are simply yes/no, e.g. internet. These can be reasonably converted into 1/0 (binary) values.
Other columns, like Mjob and Fjob, have more than two values, and are known as categorical variables. The recommended way to handle such a column is to create as many columns as there are possible values (e.g. Fjob_teacher, Fjob_other, Fjob_services, etc.), and assign a 1 to one of them and 0 to all others.
These generated columns are sometimes called dummy variables, and we will use the pandas.get_dummies() function to perform this transformation.
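As a toy illustration of how pandas.get_dummies() expands a categorical column (the values below are made up for demonstration, not drawn from student-data.csv):

# Hypothetical example: a categorical column with three values
jobs = pd.Series(['teacher', 'services', 'other'], name='Fjob')
print pd.get_dummies(jobs, prefix='Fjob')
# => one 0/1 column per category: Fjob_other, Fjob_services, Fjob_teacher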
In [5]:
# Preprocess feature columns
def preprocess_features(X):
    # output dataframe, initially empty
    outX = pd.DataFrame(index=X.index)
    # Check each column
    for col, col_data in X.iteritems():
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
            # Note: This should change the data type for yes/no columns to int
        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'
        outX = outX.join(col_data)  # collect column(s) in output dataframe
    return outX
X_all = preprocess_features(X_all)
# X_all = pd.get_dummies(X_all)
print "Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns))
In [6]:
from sklearn.cross_validation import train_test_split
# First, decide how many training vs test samples you want
num_all = student_data.shape[0] # same as len(student_data)
num_train = 300 # about 75% of the data
num_test = num_all - num_train
# TODO: Then, select features (X) and corresponding labels (y) for the training and test sets
# Note: Shuffle the data or randomly select samples to avoid any bias due to ordering in the dataset
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=num_train, random_state=11)
# Preserve this train/test split for final evaluation of model F1 score
X_train_initial, X_test_initial, y_train_initial, y_test_initial = X_train, X_test, y_train, y_test
print "Training set: {} samples".format(X_train.shape[0])
print "Test set: {} samples".format(X_test.shape[0])
# Note: If you need a validation set, extract it from within training data
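If a validation set were needed, one option (a sketch, not used in the rest of this notebook; the 0.25 fraction is illustrative) is a second split carved out of the training data:

# Sketch: hold out 25% of the training data for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=11)
print "Training/validation: {}/{} samples".format(X_tr.shape[0], X_val.shape[0])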
Choose 3 supervised learning models that are available in scikit-learn and appropriate for this problem. For each model, produce a table showing training time, prediction time, F1 score on the training set, and F1 score on the test set, for each training set size.
Note: You need to produce 3 such tables - one for each model.
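One way to assemble those numbers into a table (a sketch, assuming the train_classifier and predict_labels helpers defined in the next cell; results_table is a hypothetical helper, and prediction time is printed by predict_labels rather than returned):

# Sketch: tabulate training time and F1 scores per training-set size
def results_table(clf, sizes, X_train, y_train, X_test, y_test):
    rows = []
    for size in sizes:
        t_train = train_classifier(clf, X_train.iloc[:size], y_train.iloc[:size])
        f1_train = predict_labels(clf, X_train.iloc[:size], y_train.iloc[:size])
        f1_test = predict_labels(clf, X_test, y_test)
        rows.append((size, t_train, f1_train, f1_test))
    return pd.DataFrame(rows, columns=['train size', 'train time (s)', 'F1 (train)', 'F1 (test)'])

# e.g. results_table(DecisionTreeClassifier(random_state=42), [100, 200, 300], X_train, y_train, X_test, y_test)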
In [7]:
# Train a model
import time
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
def train_classifier(clf, X_train, y_train):
    print "\nTraining {}...".format(clf.__class__.__name__)
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    duration = end - start
    print "Training time (secs): {:.4f}".format(duration)
    return duration

def predict_labels(clf, features, target):
    # print "Predicting labels using {}...".format(clf.__class__.__name__)
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print "Prediction time (secs): {:.4f}".format(end - start)
    return f1_score(target.values, y_pred, pos_label='yes')

def train_predict(clf, X_train, y_train, X_test, y_test):
    print "----------"
    print "Training set size: {}".format(len(X_train))
    train_classifier(clf, X_train, y_train)
    print "Training set:"
    train_f1_score = predict_labels(clf, X_train, y_train)
    print "Testing set:"
    test_f1_score = predict_labels(clf, X_test, y_test)
    print "F1 score for training set: {}".format(train_f1_score)
    print "F1 score for test set: {}".format(test_f1_score)
    return train_f1_score, test_f1_score
# TODO: Choose a model, import it and instantiate an object
# TODO: Run the helper function above for desired subsets of training data
clfs = [DecisionTreeClassifier(random_state=42),
        KNeighborsClassifier(),
        LogisticRegression(random_state=42)]

for clf in clfs:
    print "============================================="
    # Fit model to training data
    train_classifier(clf, X_train, y_train)  # note: using entire training set here
    # Predict on training & testing set and compute F1 score
    train_f1_score = predict_labels(clf, X_train, y_train)
    test_f1_score = predict_labels(clf, X_test, y_test)
    print "F1 score for training set: {}".format(train_f1_score)
    print "F1 score for test set: {}".format(test_f1_score)
    for idx, train_size in enumerate([100, 200, 300]):
        X_train_temp = X_train.iloc[:train_size]
        y_train_temp = y_train.iloc[:train_size]
        train_predict(clf, X_train_temp, y_train_temp, X_test, y_test)
print "============================================="
In [8]:
# %%capture
# test the effect of training sample size on F1 score with a finer interval of 20 instead of 100
# the results are visualized in the next cell
# output in this cell is suppressed
train_f1_scores = []
test_f1_scores = []
for clf in clfs:
    print "============================================="
    # Fit model to training data
    # note: using entire training set here
    train_classifier(clf, X_train, y_train)
    # Predict on training & testing set and compute F1 score
    train_f1_score = predict_labels(clf, X_train, y_train)
    test_f1_score = predict_labels(clf, X_test, y_test)
    print "F1 score for training set: {}".format(train_f1_score)
    print "F1 score for test set: {}".format(test_f1_score)
    # Train and predict using different training set sizes
    train_sizes = np.arange(20, X_train.shape[0]+1, 20)
    train_f1_score = np.zeros(train_sizes.shape)
    test_f1_score = np.zeros(train_sizes.shape)
    for idx, train_size in enumerate(train_sizes):
        X_train_temp = X_train.iloc[:train_size]
        y_train_temp = y_train.iloc[:train_size]
        train_f1_score[idx], test_f1_score[idx] = train_predict(clf, X_train_temp, y_train_temp, X_test, y_test)
    # Collect F1 scores for each classifier
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
print "============================================="
In [9]:
# visualize F1 score vs training sample size
# seaborn settings from [http://bebi103.caltech.edu/2015/tutorials/t0b_intro_to_jupyter_notebooks.html]
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
rc = {'lines.linewidth': 2,
      'axes.labelsize': 14,
      'axes.titlesize': 14,
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', font_scale=1.2, rc=rc)
sns.set_style('darkgrid', rc=rc)
plt.figure(1, figsize=(20, 5), dpi=300)
idx_subplot = 1
for idx, clf in enumerate(clfs):
    # each subplot corresponds to a classifier
    plt.subplot(1, len(clfs), idx_subplot)
    plt.plot(train_sizes, train_f1_scores[idx], marker='o', label='F1 score ( train )')
    plt.plot(train_sizes, test_f1_scores[idx], marker='s', label='F1 score ( test )')
    if idx_subplot == 1:
        plt.ylabel('F1 score', fontweight='bold')
    plt.xlabel('Training samples', fontweight='bold')
    plt.title('%s' % clf.__class__.__name__, fontweight='bold')
    plt.xlim(0, X_train.shape[0]+15)
    plt.ylim(0.3, 1.05)
    plt.yticks(np.arange(0.3, 1.05, 0.1))
    plt.legend(loc='lower right')
    idx_subplot += 1
plt.savefig('./F1_vs_training_size.pdf')
In [10]:
%%capture
# Takes around 6 mins to run on a 4 GHz, quad-core machine
# TODO: Fine-tune your model and report the best F1 score
import time
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# time the script
start = time.time()
# calc_scores (f1_score, accuracy_score, recall_score, precision_score)
def calc_scores(y, y_pred):
    return (f1_score(y, y_pred),
            accuracy_score(y, y_pred),
            recall_score(y, y_pred),
            precision_score(y, y_pred))
# import data
student_data = pd.read_csv("student-data.csv")
# extract feature (X) and target (y) columns
feature_cols = list(student_data.columns[:-1])
target_col = student_data.columns[-1]
le_y = LabelEncoder()
X_all = pd.get_dummies(student_data[feature_cols])
y_all = student_data[target_col]
y_all = le_y.fit_transform(y_all)
# initialize classifiers for evaluations of performance
clfs_set = [AdaBoostClassifier(),
            DecisionTreeClassifier(),
            KNeighborsClassifier(),
            LogisticRegression(),
            SVC(),
            SGDClassifier(),
            RandomForestClassifier()]
clfs_best = []
train_scores = []
test_scores = []
# building param_grids for GridSearchCV
ada_grid = {'algorithm': ['SAMME', 'SAMME.R'],
            'n_estimators': np.linspace(1, 6, num=5).astype(int),
            'learning_rate': (0.001, 0.01, 0.1, 1, 10)}
dt_grid = {'criterion': ['gini', 'entropy'],
           'max_features': ['auto', 'sqrt', 'log2'],
           'max_depth': np.linspace(1, 10, num=10),
           'min_samples_split': np.linspace(2, 10, num=9).astype(int),
           'min_samples_leaf': (1, 2, 3, 4, 5)}
knn_grid = {'n_neighbors': (3, 4, 5, 6, 7, 8, 9),
            'algorithm': ['auto', 'ball_tree', 'kd_tree'],
            'p': (1, 2, 3, 4),
            'leaf_size': (10, 20, 30, 40, 50),
            'weights': ['uniform', 'distance']}
lr_grid = {'C': np.linspace(0.01, 0.2, num=200),
           'penalty': ['l1', 'l2']}
svc_grid = {'kernel': ['rbf', 'poly'],
            'gamma': np.linspace(0.01, 1, num=100)}
sgd_grid = {'loss': ['squared_hinge', 'hinge'],
            'penalty': ['l2', 'l1'],
            'alpha': np.linspace(0.001, 0.01, num=100)}
rf_grid = {'n_estimators': (10, 11, 12, 13, 14, 15, 16),
           'max_features': ['auto'],
           'criterion': ['gini', 'entropy'],
           'max_depth': (3, 4, 5, 6),
           'min_samples_split': (2, 3, 4, 5, 6)}
param_grids = [ada_grid, dt_grid, knn_grid, lr_grid, svc_grid, sgd_grid, rf_grid]
# run GridSearchCV for each classifier (maximizing f1-score)
# increase the train size to 80% sample size
num_runs = 25
num_clfs = len(clfs_set)
num_scores = 4
train_size = 0.80
for num_run in np.arange(num_runs):
    # randomize the train/test split for each run
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, train_size=train_size)
    print('===============================================================================')
    print('Run #%d' % (num_run+1))
    for clf, param_grid in zip(clfs_set, param_grids):
        print("%s" % clf.__class__.__name__)
        clf_opt = GridSearchCV(estimator=clf,
                               param_grid=param_grid,
                               scoring='f1',
                               n_jobs=-1)
        clf_opt.fit(X_train, y_train)
        y_train_pred = clf_opt.predict(X_train)
        y_test_pred = clf_opt.predict(X_test)
        # collect the best estimator for each run
        clfs_best.append(clf_opt.best_estimator_)
        # calculate performance scores
        train_scores.append(calc_scores(y_train, y_train_pred))
        test_scores.append(calc_scores(y_test, y_test_pred))
        print('Training set: F1 score %.3f | Accuracy %.3f | Recall %.3f | Precision %.3f'
              % calc_scores(y_train, y_train_pred))
        print('Testing set:  F1 score %.3f | Accuracy %.3f | Recall %.3f | Precision %.3f\n'
              % calc_scores(y_test, y_test_pred))
print('===============================================================================')
train_scores = np.array(train_scores).reshape(num_runs, num_clfs, num_scores)
test_scores = np.array(test_scores).reshape(num_runs, num_clfs, num_scores)
# time the script
end = time.time()
In [11]:
print('\nTime elapsed: %.3f mins' % ((end-start)/60))
In [12]:
# box plots of ['F1 score', 'Accuracy', 'Recall', 'Precision'] for both training and testing set
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
score_labels = ['F1 score', 'Accuracy', 'Recall', 'Precision']
clf_labels = [s.__class__.__name__ for s in clfs_set]
for idx_score, score_label in enumerate(score_labels):
    plt.figure(figsize=[14, 4])
    plt.subplot(1, 2, 1)
    ax = sns.boxplot(data=train_scores[:, :, idx_score], palette="RdBu")
    ax.set_ylim(0.5, 1.05)
    ax.set_xticklabels(())
    ax.set_title(score_label + ' ( train )')
    plt.xticks(np.arange(num_clfs), clf_labels, rotation=45)
    plt.subplot(1, 2, 2)
    ax = sns.boxplot(data=test_scores[:, :, idx_score], palette="RdBu")
    ax.set_ylim(0.5, 1.05)
    ax.set_xticklabels(())
    ax.set_title(score_label + ' ( test )')
    plt.xticks(np.arange(num_clfs), clf_labels, rotation=45)
In [13]:
# print statistics
for idx_score, score_label in enumerate(score_labels):
    print('=====================================================================')
    print(score_label)
    print('')
    print('=== training set ===')
    print(pd.DataFrame(train_scores[:, :, idx_score], columns=clf_labels)
          .describe().T[['count', 'mean', 'std', 'min', 'max']])
    print('')
    print('=== testing set ===')
    print(pd.DataFrame(test_scores[:, :, idx_score], columns=clf_labels)
          .describe().T[['count', 'mean', 'std', 'min', 'max']])
    print('=====================================================================')
In [14]:
print('Best F1 score:\n')
print('=== training set ===')
print(pd.DataFrame(train_scores[:, :, 0], columns=clf_labels).describe().T['max'])
print('')
print('=== testing set ===')
print(pd.DataFrame(test_scores [:, :, 0], columns=clf_labels).describe().T['max'])
In [15]:
# Extract the best logistic regression model from clfs_best
# Since the 25 independent runs produce similar optimal parameters for logistic
# regression, the parameter set from the first run is selected.
# (column index 3 corresponds to LogisticRegression's position in clfs_set)
lr_best = (np.array(clfs_best).reshape(num_runs, num_clfs))[:, 3][0]
# Refit the model on the initial training split preserved earlier;
# le_y is the label encoder that transforms the "yes"/"no" targets to 1/0
lr_best.fit(X_train_initial, le_y.transform(y_train_initial))
print("The final F1 score on the initial test set is %.3f."
      % f1_score(le_y.transform(y_test_initial), lr_best.predict(X_test_initial)))