Note to self: this is the source of the data: https://archive.ics.uci.edu/ml/datasets/Student+Performance.
Identifying students who might need early intervention is a classification problem: we are sorting students into discrete classes (needs intervention, doesn't need intervention) rather than predicting a continuous quantitative value.
In [1]:
# Import libraries
import numpy
import pandas as pd
# my imports
import matplotlib.pyplot as plot
import seaborn
# sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20;
# both utilities now live in sklearn.model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
In [2]:
%matplotlib inline
In [3]:
# Read student data
student_data = pd.read_csv("student-data.csv")
print("Student data read successfully!")
# Note: The last column 'passed' is the target/label, all other are feature columns
Now, can you find out the following facts about the dataset?
Use the code block below to compute these values.
In [4]:
n_students = student_data.shape[0]
n_features = student_data.shape[1] - 1  # the last column ('passed') is the target, not a feature
n_passed = sum(student_data.passed.map({'no': 0, 'yes': 1}))
n_failed = n_students - n_passed
grad_rate = 100 * n_passed / float(n_students)  # as a percentage
print("Total number of students: {}".format(n_students))
print("Number of students who passed: {}".format(n_passed))
print("Number of students who failed: {}".format(n_failed))
print("Number of features: {}".format(n_features))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))
In [5]:
types = student_data.dtypes
categoricals = [column for column in types.index if types.loc[column] == object]
numericals = [column for column in types.index if column not in categoricals]
In [6]:
print("Categorical Variables: {0}".format(len(categoricals)))
print("Numeric Variables: {0}".format(len(numericals)))
In [7]:
for categorical in categoricals:
    print('{0}\t{1}'.format(categorical, ','.join(student_data[categorical].unique())))
In [8]:
categorical_data = student_data[categoricals]
for categorical in categoricals:
    grid = seaborn.FacetGrid(categorical_data, col='passed')
    grid = grid.map(seaborn.countplot, categorical)
    grid.fig.suptitle('passed vs {0}'.format(categorical))
Surprisingly, females were less likely to pass than males. Family size also seems to influence passing, as do parental cohabitation, whether parents worked jobs other than services, health, teacher, or at-home, the reason for taking the course, whether the students took extra paid classes, and whether they had internet access at home.
In [9]:
figure = plot.figure(figsize=(10,8))
axe = figure.gca()
axe.set_title('numeric variables')
lines = seaborn.boxplot(data=student_data[numericals], ax=axe)
In [10]:
numerical_data = student_data[numericals].copy()  # copy so the 'passed' column added later doesn't touch student_data
figure = plot.figure(figsize=(10,8))
axe = figure.gca()
axe = numerical_data.plot(kind='kde', ax=axe)
In [11]:
# pandas.tools.plotting was removed in pandas 0.25; use pandas.plotting instead
from pandas.plotting import parallel_coordinates
numerical_data['passed'] = student_data['passed']
figure = plot.figure(figsize=(10,10))
axe = figure.gca()
subplot = parallel_coordinates(numerical_data, 'passed', ax=axe)
In [12]:
passed = student_data.passed.value_counts()/student_data.shape[0]
print(passed)
In [13]:
grid = seaborn.FacetGrid(student_data, col='passed', height=8)  # 'size' was renamed 'height' in seaborn 0.9
grid = grid.map_dataframe(lambda data, color: seaborn.heatmap(data[numericals].corr(), linewidths=0))
The strongest correlations appear to be between Dalc (workday alcohol consumption) and Walc (weekend alcohol consumption), and between Medu (mother's education) and Fedu (father's education).
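As a quick check of that reading, here is a sketch that lists the largest off-diagonal pairwise correlations among the numeric columns defined above (the pairs named should surface near the top):
In [ ]:
# sketch: list the largest off-diagonal pairwise correlations
corr = student_data[numericals].corr()
upper = corr.where(numpy.triu(numpy.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
print(upper.unstack().dropna().sort_values(ascending=False).head())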
In [14]:
figure = plot.figure(figsize=(10,8))
axe = figure.gca()
axe.set_ylabel('count')
axe.set_title("Count of Passing Students")
grid = seaborn.countplot(x='passed', data=student_data, ax=axe)
In this section, we will prepare the data for modeling, training and testing.
It is often the case that the data you obtain contains non-numeric features. This can be a problem, as most machine learning algorithms expect numeric data to perform computations with.
Let's first separate our data into feature and target columns, and see if any features are non-numeric.
Note: For this dataset, the last column ('passed') is the target or label we are trying to predict.
In [15]:
# Extract feature (X) and target (y) columns
feature_cols = list(student_data.columns[:-1])  # all columns but last are features
target_col = student_data.columns[-1]  # last column is the target/label
print("Feature column(s):-\n{}".format(feature_cols))
print("Target column: {}".format(target_col))

X_all = student_data[feature_cols]  # feature values for all students
y_all = student_data[target_col]  # corresponding targets/labels

print("\nFeature values:-")
print(X_all.head())  # print the first 5 rows
As you can see, there are several non-numeric columns that need to be converted! Many of them are simply yes/no, e.g. internet. These can be reasonably converted into 1/0 (binary) values.
Other columns, like Mjob and Fjob, have more than two values, and are known as categorical variables. The recommended way to handle such a column is to create as many columns as there are possible values (e.g. Fjob_teacher, Fjob_other, Fjob_services, etc.), and assign a 1 to one of them and 0 to all others.
These generated columns are sometimes called dummy variables, and we will use the pandas.get_dummies() function to perform this transformation.
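To see concretely what pandas.get_dummies() produces before applying it to the real data, here is a toy example (the values are illustrative only):
In [ ]:
# toy example: one categorical column becomes one column per distinct value,
# filled with 1/0 (True/False in newer pandas versions)
jobs = pd.Series(['teacher', 'other', 'services'], name='Fjob')
print(pd.get_dummies(jobs, prefix='Fjob'))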
In [16]:
# Preprocess feature columns
def preprocess_features(X):
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty

    # Check each column
    for col, col_data in X.items():  # iteritems() was removed in pandas 2.0
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
        # Note: This should change the data type for yes/no columns to int

        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'

        outX = outX.join(col_data)  # collect column(s) in output dataframe
    return outX

X_all = preprocess_features(X_all)
print("Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns)))
In [17]:
# First, decide how many training vs test samples you want
num_all = student_data.shape[0]  # same as len(student_data)
num_train = 300  # about 75% of the data
num_test = num_all - num_train

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                    test_size=num_test,
                                                    train_size=num_train,
                                                    random_state=0)  # fixed seed for reproducibility
print("Training set: {} samples".format(X_train.shape[0]))
print("Test set: {} samples".format(X_test.shape[0]))
# Note: If you need a validation set, extract it from within training data
# Note: If you need a validation set, extract it from within training data
Choose 3 supervised learning models that are available in scikit-learn, and appropriate for this problem. For each model:
Produce a table showing training time, prediction time, F1 score on training set and F1 score on test set, for each training set size.
Note: You need to produce 3 such tables - one for each model.
The first supervised learning model that I've chosen is `Logistic Regression <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression>`_. Logistic Regression uses numeric data to predict binary categorical values, matching our inputs (after transformation) and outputs here. It is a linear classification model and so does best when the data is linearly separable, although it can be made to work as long as the features are pairwise-separable (Alpaydin, 2010). Logistic Regression has the advantages of being computationally cheap, reasonably easy to implement, and interpretable, but has the disadvantage that it is prone to underfitting (Harrington, 2012).
Logistic Regression uses the log-likelihood of the model to decide how good it is and tries to improve it by choosing weights that maximize the log-likelihood (Witten & Frank, 2005). It calculates the probability that the target is 1 using the logistic (sigmoid) function (Alpaydin, 2010).
In [18]:
%%latex
\begin{align}
P(y=1|x) &= \mathrm{sigmoid}(W^T x + w_0) \\
         &= \frac{1}{1 + e^{-(W^T x + w_0)}}
\end{align}
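To make the "maximize the log-likelihood" step above concrete, the quantity being maximized is the standard binary log-likelihood (this is the textbook form, not anything specific to sklearn):
In [ ]:
%%latex
\ell(W, w_0) = \sum_{i} \left[ y_i \log P(y_i=1|x_i) + (1 - y_i) \log\left(1 - P(y_i=1|x_i)\right) \right]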
The sklearn implementation also supports regularization (L1 or L2); an L1 penalty drives some coefficients to exactly zero, so it can double as a form of feature selection.
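Here is a minimal sketch of that idea, assuming the preprocessed X_all and y_all from above (C=0.1 is an arbitrary illustrative value, not a tuned one):
In [ ]:
# a minimal sketch of L1-based feature selection, not part of the project pipeline
from sklearn.linear_model import LogisticRegression

sketch = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)  # liblinear supports l1
sketch.fit(X_all, y_all)
kept = [column for column, weight in zip(X_all.columns, sketch.coef_[0])
        if abs(weight) > 0]
print("features kept by the L1 penalty: {0} of {1}".format(len(kept), len(X_all.columns)))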
The second learning model that I will use is `Random Forests <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier>`_. This is an ensemble learner that combines predictions from multiple decision trees, each trained on a different re-sampling of the data.
Decision Trees have several advantages, including the fact that they are easily interpretable, can sometimes fit complex data more easily than linear models, and don't require dummy variables. They are, however, generally not as accurate (James et al., 2013).
The idea behind using ensemble learners is that any particular model has a bias built into it based on its assumptions; when the assumptions are wrong it will perform poorly. You can improve performance by combining base-learners, each of which has a different bias, so that (ideally) no instance of the data will cause a majority of the learners to perform poorly, even if each performs poorly on some instances. For combining models to work, there has to be enough diversity that they don't all fail on the same data (Alpaydin, 2010).
The first way to introduce diversity is through bagging (bootstrap aggregation), where each tree (base-learner) is given a data set constructed by re-sampling (with replacement) from the training data.
The next way that diversity is introduced is by using a random sample of features whenever a split is made, rather than choosing the best split from all the features (the number of features used is typically near the square root of the total number of features). Keeping the number of features small reduces the likelihood that the more influential features will dominate the splitting early on, which would cause the trees to be too similar (James et al., 2013). This use of subsets of features in splitting is what makes it a random forest (rather than just bagged trees).
Predictions are made by having each tree predict a class and taking the majority vote (sklearn averages the trees' predicted probabilities) as the forest's prediction. These methods improve performance over an individual tree, but the ensemble is no longer interpretable the way a single tree would be.
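A minimal sketch showing how the ideas above map onto the sklearn parameters, using the X_train/X_test split from earlier (the parameter values are illustrative, not tuned):
In [ ]:
# illustrative only: bagging plus random feature subsets per split
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100,     # number of trees (base-learners)
                                bootstrap=True,       # each tree gets a bootstrap re-sample
                                max_features='sqrt')  # ~sqrt(n_features) candidates per split
forest.fit(X_train, y_train)
print("test F1: {0:.2f}".format(f1_score(y_test, forest.predict(X_test), pos_label='yes')))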
My final predictor will use K-Nearest Neighbors (KNN) classification. It is a fairly straightforward method that doesn't build a model in the sense that the other two methods do. Instead, KNN stores the training data and, when asked to make a prediction, finds the k points in the training data that are 'closest' to the input and calculates the probability of a classification (say y=1) as the fraction of the k neighbors that are of that class. Thus if k=4 and three of the chosen neighbors are classified as 1, the predicted class will be 1, because the majority of the neighbors were 1.
Unlike Logistic Regression, KNN doesn't require linear separability, and unlike some other methods it makes no assumption about the distribution of the data (it is non-parametric). This makes it better in some cases, but its accuracy depends on the choice of k. If k is too small it will tend to overfit the training data, and if k is too large it will become too rigid and underfit. Besides the difficulty of choosing k, because KNN is non-parametric it's not possible to inspect the model to decide which features are important, and it needs more data to be accurate.
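A minimal sketch of the majority-vote mechanics, again assuming the X_train/X_test split from earlier (k=4 mirrors the example above; it is not a tuned value):
In [ ]:
# illustrative only: each prediction polls the k nearest training points
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
print(knn.predict_proba(X_test[:3]))  # vote fractions for ['no', 'yes']
print(knn.predict(X_test[:3]))        # majority class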
In [19]:
# Train a model
import time

def train_classifier(clf, X_train, y_train):
    print("Training {}...".format(clf.__class__.__name__))
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print("Done!\nTraining time (secs): {:.3f}".format(end - start))

# TODO: Choose a model, import it and instantiate an object
from sklearn import tree
clf = tree.DecisionTreeClassifier()

# Fit model to training data
train_classifier(clf, X_train, y_train)  # note: using entire training set here
print(clf)  # you can inspect the learned model by printing it
In [20]:
# Predict on training set and compute F1 score
from sklearn.metrics import f1_score

def predict_labels(clf, features, target):
    print("Predicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print("Done!\nPrediction time (secs): {:.3f}".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')

train_f1_score = predict_labels(clf, X_train, y_train)
print("F1 score for training set: {}".format(train_f1_score))
In [21]:
# Predict on test data
print("F1 score for test set: {}".format(predict_labels(clf, X_test, y_test)))
In [22]:
# Train and predict using different training set sizes
def train_predict(clf, X_train, y_train, X_test, y_test):
    print("------------------------------------------")
    print("Training set size: {}".format(len(X_train)))
    train_classifier(clf, X_train, y_train)
    print("F1 score for training set: {}".format(predict_labels(clf, X_train, y_train)))
    print("F1 score for test set: {}".format(predict_labels(clf, X_test, y_test)))
In [23]:
class Classifier(object):
    """
    Trains, predicts, evaluates classifier using f1 score
    """
    def __init__(self, classifier, x_train, y_train, x_test, y_test, delim='\t'):
        """
        :param:
         - `classifier`: sklearn classifier object
         - `x_train`: feature training data
         - `y_train`: target training data
         - `x_test`: feature test data
         - `y_test`: target test data
         - `delim`: separator for the table row
        """
        self.clf = classifier
        self._classifier = None
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        self._f1_train = None
        self._f1_test = None
        self.delim = delim
        self._table_row = None
        self._training_time = None
        self._prediction_time = None
        return

    @property
    def f1_train(self):
        """
        :return: F1 score using training data
        """
        if self._f1_train is None:
            predictions, time_ = self.predict(self.x_train)
            self._f1_train = self.f1_score(predictions, self.y_train)
        return self._f1_train

    @property
    def f1_test(self):
        """
        :return: f1 score for test-set predictions
        :postcondition: self._prediction_time set
        """
        if self._f1_test is None:
            predictions, self._prediction_time = self.predict(self.x_test)
            self._f1_test = self.f1_score(predictions, self.y_test)
        return self._f1_test

    @property
    def prediction_time(self):
        """
        :return: prediction time for test data
        """
        if self._prediction_time is None:
            predictions, self._prediction_time = self.predict(self.x_test)
            self._f1_test = self.f1_score(predictions, self.y_test)
        return self._prediction_time

    @property
    def training_time(self):
        """
        :return: training time in seconds
        """
        if self._training_time is None:
            start = time.time()
            self._classifier = self.clf.fit(self.x_train, self.y_train)
            self._training_time = time.time() - start
        return self._training_time

    @property
    def classifier(self):
        """
        :return: trained classifier
        """
        if self._classifier is None:
            start = time.time()
            self._classifier = self.clf.fit(self.x_train, self.y_train)
            self._training_time = time.time() - start
        return self._classifier

    def f1_score(self, predictions, target):
        """
        :param:
         - `predictions`: predicted values for model
         - `target`: actual outcomes from data
        :return: f1 score for predictions
        """
        return f1_score(target.values, predictions, pos_label='yes')

    def predict(self, features):
        """
        :param:
         - `features`: array of feature data
        :return: predicted values, time to execute
        """
        start = time.time()
        predictions = self.classifier.predict(features)
        elapsed = time.time() - start
        return predictions, elapsed

    def train_and_predict(self):
        """
        :return: time, f1 score for training and testing data
        """
        train_predictions, train_predictions_time = self.predict(self.x_train)
        train_f1_score = self.f1_score(train_predictions, self.y_train)
        test_predictions, test_predictions_time = self.predict(self.x_test)
        test_f1_score = self.f1_score(test_predictions, self.y_test)
        return (train_predictions_time, train_f1_score,
                test_predictions_time, test_f1_score)

    @property
    def table_row(self):
        """
        :return: string of training size, training time, prediction time, f1 train, f1 test
        """
        if self._table_row is None:
            self._table_row = self.delim.join(
                [str(len(self.x_train))] +
                ["{0:.4f}".format(item) for item in (self.training_time,
                                                     self.prediction_time,
                                                     self.f1_train,
                                                     self.f1_test)])
        return self._table_row
In [24]:
def train_and_predict(clf):
    scores = []
    for size in range(100, 400, 100):
        x_train_subset, y_train_subset = X_train[:size], y_train[:size]
        classifier = Classifier(clf, x_train_subset, y_train_subset,
                                X_test, y_test, delim='\t\t')
        print(classifier.table_row)
        scores.append((classifier.f1_test, size))
    return max(scores)
In [25]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from tabulate import tabulate

classifiers = [LogisticRegression(), tree.DecisionTreeClassifier(), RandomForestClassifier(),
               svm.SVC(), GaussianNB(), SGDClassifier(), AdaBoostClassifier(),
               GradientBoostingClassifier(), KNeighborsClassifier()]

best_scores = []
line_width = 80
for classifier in classifiers:
    print('')
    print(classifier.__class__.__name__)
    print("=" * line_width)
    print("Size\t\tTime(t)\t\tTime(p)\t\tTrain F1\tTest F1")
    print('-' * line_width)
    best_score, best_size = train_and_predict(classifier)
    print("-" * line_width)
    print("best score: {0:.2f}, best_size: {1}".format(best_score, best_size))
    best_scores.append((best_score, classifier.__class__.__name__, best_size))
    print("=" * line_width)

print('')
print("Ranked by Score")
print('~' * line_width)
table = [[score[1], score[0], score[-1]] for score in sorted(best_scores, reverse=True)]
print(tabulate(table, headers='Classifier score training-size'.split()))
In [26]:
y_train_numeric = y_train.replace('yes no'.split(), [1, 0])
y_test_numeric = y_test.replace('yes no'.split(), [1, 0])
In [27]:
class LRClassifier(object):
    """
    holds the LogisticRegression classifier
    """
    def __init__(self, c_range, score_function=f1_score, n_jobs=-1, folds=10,
                 training_features=X_train, training_targets=y_train_numeric,
                 test_features=X_test, test_targets=y_test_numeric):
        """
        :param:
         - `c_range`: range of 'C' values for grid search
         - `score_function`: function to maximize
         - `n_jobs`: number of parallel jobs for the grid search
         - `folds`: number of cross validation folds to use
         - `training_features`: array of training feature-data
         - `training_targets`: array of training target-values
         - `test_features`: array of testing feature-data
         - `test_targets`: array of testing target-data
        """
        self.c_range = c_range
        self.n_jobs = n_jobs
        self.folds = folds
        self.score_function = score_function
        self.training_features = training_features
        self.training_targets = training_targets
        self.test_features = test_features
        self.test_targets = test_targets
        self._scorer = None
        self._model = None
        self._grid = None
        self._parameters = None
        return

    @property
    def parameters(self):
        """
        :return: dict of grid search parameters
        """
        if self._parameters is None:
            self._parameters = {'penalty': ('l1', 'l2'),
                                'C': self.c_range}
        return self._parameters

    @property
    def scorer(self):
        """
        :return: scorer for the grid search
        """
        if self._scorer is None:
            self._scorer = make_scorer(self.score_function)
        return self._scorer

    @property
    def model(self):
        """
        :return: LogisticRegression object
        """
        if self._model is None:
            # liblinear supports both the l1 and l2 penalties searched over;
            # the modern default solver (lbfgs) does not handle l1
            self._model = LogisticRegression(solver='liblinear')
        return self._model

    @property
    def grid(self):
        """
        :return: GridSearchCV object with best model
        """
        if self._grid is None:
            self._grid = GridSearchCV(self.model,
                                      param_grid=self.parameters,
                                      scoring=self.scorer,
                                      cv=self.folds,
                                      n_jobs=self.n_jobs)
            self._grid.fit(self.training_features, self.training_targets)
        return self._grid

    def print_columns(self):
        """
        prints non-zero coefficients in descending order
        """
        coefficients = self.grid.best_estimator_.coef_[0]
        sorted_coefficients = sorted(coefficients, reverse=True)
        for coefficient in sorted_coefficients:
            if abs(coefficient) > 0:
                index = numpy.where(coefficients == coefficient)[0][0]
                print(X_test.columns[index], coefficient)
        return

    def print_best(self):
        print('Parameters')
        print(self.grid.best_params_)
        print('\nF1 Score')
        print(self.grid.score(self.test_features, self.test_targets))
        print('\ncoefficients')
        self.print_columns()
In [ ]:
grid_01 = LRClassifier(numpy.arange(.01, 1.1, .01))
grid_01.print_best()