Answer: This type of problem is classification, because the target variable is discrete: each student either passed or failed.
In [1]:
# Import libraries
import numpy as np
import pandas as pd
In [2]:
# Read student data (CSV assumed to sit next to the notebook)
student_data = pd.read_csv("student-data.csv")
# print() call form works in both Python 2 and 3; the bare statement is Py2-only
print("Student data read successfully!")
# Note: The last column 'passed' is the target/label, all other are feature columns
Now, can you find out the following facts about the dataset?
In [3]:
# Basic facts about the dataset: one row per student, 'passed' is the label
n_students = student_data.shape[0]
n_features = student_data.shape[1]  # includes the 'passed' target column
n_passed = student_data[student_data['passed'] == 'yes'].shape[0]
n_failed = n_students - n_passed
grad_rate = (n_passed * 100.0) / n_students  # percentage of students who passed
print("Total number of students: {}".format(n_students))
print("Number of students who passed: {}".format(n_passed))
print("Number of students who failed: {}".format(n_failed))
print("Number of features: {}".format(n_features - 1))  # exclude the target column
print("Graduation rate of the class: {:.2f}%".format(grad_rate))
In this section, we will prepare the data for modeling, training and testing.
It is often the case that the data you obtain contains non-numeric features. This can be a problem, as most machine learning algorithms expect numeric data to perform computations with.
In [4]:
# Extract feature (X) and target (y) columns
feature_cols = list(student_data.columns[:-1])  # all columns but last are features
target_col = student_data.columns[-1]           # last column is the target/label
print("Feature column(s):-\n{}".format(feature_cols))
print("Target column: {}".format(target_col))

X_all = student_data[feature_cols]  # feature values for all students
y_all = student_data[target_col]    # corresponding targets/labels
print("\nFeature values:-")
print(X_all.head())  # show only the first 5 rows
As you can see, there are several non-numeric columns that need to be converted! Many of them are simply `yes`/`no`, e.g. `internet`. These can be reasonably converted into `1`/`0` (binary) values.

Other columns, like `Mjob` and `Fjob`, have more than two values and are known as categorical variables. The recommended way to handle such a column is to create as many columns as there are possible values (e.g. `Fjob_teacher`, `Fjob_other`, `Fjob_services`, etc.), and assign a `1` to one of them and `0` to all others.

These generated columns are sometimes called dummy variables, and we will use the `pandas.get_dummies()` function to perform this transformation.
In [5]:
# Preprocess feature columns
def preprocess_features(X):
    """Convert non-numeric feature columns into numeric ones.

    Binary yes/no columns are mapped to 1/0; any remaining object-dtype
    column is expanded into one dummy (indicator) column per category
    value via pandas.get_dummies.

    Args:
        X: DataFrame of raw feature columns.

    Returns:
        A new DataFrame containing only numeric columns; the input is
        not modified.
    """
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty

    # Check each column
    for col, col_data in X.items():  # .iteritems() was removed in pandas 2.0
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
        # Note: this changes the dtype of pure yes/no columns to int

        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'

        outX = outX.join(col_data)  # collect column(s) in output dataframe

    return outX
# Replace the raw feature frame with its fully numeric version
X_all = preprocess_features(X_all)
print("Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns)))
In [6]:
# train_test_split lives in sklearn.model_selection;
# sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.model_selection import train_test_split

# First, decide how many training vs test samples you want
num_all = student_data.shape[0]  # same as len(student_data)
num_train = 300                  # about 75% of the data
num_test = num_all - num_train

# Shuffle and split; a fixed random_state makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, train_size=num_train, random_state=42)
print("Training set: {} samples".format(X_train.shape[0]))
print("Test set: {} samples".format(X_test.shape[0]))
# Note: If you need a validation set, extract it from within training data
Choose 3 supervised learning models that are available in scikit-learn, and appropriate for this problem. For each model:
Produce a table showing training time, prediction time, F1 score on training set and F1 score on test set, for each training set size.
Note: You need to produce 3 such tables - one for each model.
In [7]:
# Train a Decision Tree model
import time

def train_classifier(clf, X_train, y_train):
    """Fit `clf` on the training data, timing the fit.

    Returns:
        (clf, seconds): the fitted classifier and the wall-clock time
        the call to fit() took.
    """
    t0 = time.time()
    clf.fit(X_train, y_train)
    elapsed = time.time() - t0
    return clf, elapsed
# Choose a model, import it and instantiate an object
from sklearn import tree
clf = tree.DecisionTreeClassifier()

# Fit model to training data
clf, _ = train_classifier(clf, X_train, y_train)  # note: using entire training set here
print(clf)
In [8]:
# Predict on training set and compute F1 score
from sklearn.metrics import f1_score

def predict_labels(clf, features, target):
    """Predict labels for `features` and score them against `target`.

    Returns:
        (f1, seconds): the F1 score with 'yes' as the positive class,
        and the wall-clock time the prediction took.
    """
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    return f1_score(target.values, y_pred, pos_label='yes'), end - start

train_f1_score, _ = predict_labels(clf, X_train, y_train)
print("F1 score for training set: {}".format(train_f1_score))
In [9]:
# Predict on test data
test_f1_score, _ = predict_labels(clf, X_test, y_test)
print("F1 score for test set: {}".format(test_f1_score))
In [10]:
# Train and predict using different training set sizes
def train_predict(clf, X_train, y_train, X_test, y_test):
    """Fit `clf` on the training split and evaluate it on both splits.

    Returns:
        (train F1, training seconds, test F1, prediction seconds).
    """
    fitted, fit_seconds = train_classifier(clf, X_train, y_train)
    f1_train, _ = predict_labels(fitted, X_train, y_train)
    f1_test, predict_seconds = predict_labels(fitted, X_test, y_test)
    return f1_train, fit_seconds, f1_test, predict_seconds
# Evaluate the Decision Tree for each training-set size. Accumulate rows in a
# plain list and build the frame once at the end: DataFrame.append was
# removed in pandas 2.0 and growing a frame row-by-row is quadratic anyway.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, train_size=num_train, random_state=42)
dt_records = []
for num in range(50, 350, 50):  # xrange does not exist in Python 3
    score_training, time_training, score_testing, time_testing = train_predict(
        tree.DecisionTreeClassifier(), X_train[:num], y_train[:num], X_test, y_test)
    dt_records.append({'Set Size': num,
                       'Training time': time_training,
                       'Prediction time': time_testing,
                       'F1 score for training set': score_training,
                       'F1 score for test set': score_testing})
df_DT = pd.DataFrame(dt_records,
                     columns=['Set Size', 'Training time', 'Prediction time',
                              'F1 score for training set', 'F1 score for test set'])
In [11]:
# Table of Decision Tree results (one row per training-set size)
from IPython.display import display  # HTML was imported here but never used
display(df_DT)
In [12]:
# Train a SVM model
from sklearn import svm
clf, _ = train_classifier(svm.SVC(), X_train, y_train)  # note: using entire training set here
print(clf)
In [13]:
# Predict on training set and compute F1 score
train_f1_score, _ = predict_labels(clf, X_train, y_train)
print("F1 score for training set: {}".format(train_f1_score))
In [14]:
# Predict on test data
test_f1_score, _ = predict_labels(clf, X_test, y_test)
print("F1 score for test set: {}".format(test_f1_score))
In [15]:
# Evaluate the SVM for each training-set size. Collect rows in a list and
# build the frame once (DataFrame.append was removed in pandas 2.0).
svc_records = []
for num in range(50, 350, 50):  # xrange does not exist in Python 3
    score_training, time_training, score_testing, time_testing = train_predict(
        svm.SVC(), X_train[:num], y_train[:num], X_test, y_test)
    svc_records.append({'Set Size': num,
                        'Training time': time_training,
                        'Prediction time': time_testing,
                        'F1 score for training set': score_training,
                        'F1 score for test set': score_testing})
df_SVC = pd.DataFrame(svc_records,
                      columns=['Set Size', 'Training time', 'Prediction time',
                               'F1 score for training set', 'F1 score for test set'])
In [16]:
# Table of SVC results: training/prediction time and F1 scores per training-set size
display(df_SVC)
In [17]:
# Train an AdaBoost model (the default base estimator is a depth-1 decision tree)
from sklearn.ensemble import AdaBoostClassifier
clf, _ = train_classifier(AdaBoostClassifier(), X_train, y_train)  # note: using entire training set here
print(clf)
In [18]:
# Predict on training set and compute F1 score
train_f1_score, _ = predict_labels(clf, X_train, y_train)
print("F1 score for training set: {}".format(train_f1_score))
In [19]:
# Predict on test data
test_f1_score, _ = predict_labels(clf, X_test, y_test)
print("F1 score for test set: {}".format(test_f1_score))
In [20]:
# Evaluate AdaBoost for each training-set size. Collect rows in a list and
# build the frame once (DataFrame.append was removed in pandas 2.0).
ada_records = []
for num in range(50, 350, 50):  # xrange does not exist in Python 3
    score_training, time_training, score_testing, time_testing = train_predict(
        AdaBoostClassifier(), X_train[:num], y_train[:num], X_test, y_test)
    ada_records.append({'Set Size': num,
                        'Training time': time_training,
                        'Prediction time': time_testing,
                        'F1 score for training set': score_training,
                        'F1 score for test set': score_testing})
df_Ada = pd.DataFrame(ada_records,
                      columns=['Set Size', 'Training time', 'Prediction time',
                               'F1 score for training set', 'F1 score for test set'])
In [21]:
# Table of AdaBoost results: training/prediction time and F1 scores per training-set size
display(df_Ada)
I would like to choose AdaBoost for this case, because in the tables above it achieves the strongest F1 score on the test set while keeping training and prediction times acceptable for a dataset of this size.
In [22]:
# Tune AdaBoost with a grid search over n_estimators and learning_rate.
# GridSearchCV lives in sklearn.model_selection; sklearn.grid_search and
# sklearn.cross_validation were removed in scikit-learn 0.20. The original
# StratifiedShuffleSplit import was never used, so it is dropped.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

param_grid = {"n_estimators": np.arange(3, 20, 2),
              "learning_rate": np.arange(0.5, 0.8, 0.02)}
# Rank candidates by F1 with 'yes' as the positive class (matches predict_labels)
scoring_function = make_scorer(f1_score, pos_label='yes', average='binary')

clf = GridSearchCV(AdaBoostClassifier(), param_grid=param_grid, scoring=scoring_function)
clf.fit(X_train, y_train)

training_score, _ = predict_labels(clf.best_estimator_, X_train, y_train)
testing_score, _ = predict_labels(clf.best_estimator_, X_test, y_test)
print("Best parameters: {}".format(clf.best_params_))
print("F1 score for training set: {}".format(training_score))
print("F1 score for test set: {}".format(testing_score))