In [93]:
# python standard library
import warnings
from collections import namedtuple
import pickle
# third-party
import numpy
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; both names now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, make_scorer
import pandas
from tabulate import tabulate
import seaborn
# this code
#from student_intervention.common import feature_map
In [66]:
# Human-readable descriptions of the columns in student-data.csv,
# keyed by column name; used to label coefficients in print_columns.
feature_map = {"school": "student's school",
               "sex": "student's sex",
               "age": "student's age",
               "address": "student's home address type",
               "famsize": "family size",
               "Pstatus": "parent's cohabitation status",
               "Medu": "mother's education",
               "Fedu": "father's education",
               "Mjob": "mother's job",
               "Fjob": "father's job",
               "reason": "reason to choose this school",
               "guardian": "student's guardian",
               "traveltime": "home to school travel time",
               "studytime": "weekly study time",
               "failures": "number of past class failures",
               "schoolsup": "extra educational support",
               "famsup": "family educational support",
               "paid": "extra paid classes within the course subject (Math or Portuguese)",
               "activities": "extra-curricular activities",
               "nursery": "attended nursery school",
               "higher": "wants to take higher education",
               "internet": "Internet access at home",
               "romantic": "within a romantic relationship",
               "famrel": "quality of family relationships",
               "freetime": "free time after school",
               "goout": "going out with friends",
               "Dalc": "workday alcohol consumption",
               "Walc": "weekend alcohol consumption",
               "health": "current health status",
               "absences": "number of school absences",
               "passed": "did the student pass the final exam"}
In [67]:
# Load the raw student records; 'passed' (yes/no) is the target column.
student_data = pandas.read_csv('student-data.csv')
In [68]:
def to_numeric(frame):
    """Convert a data frame's categorical columns to numeric columns.

    'yes'/'no' columns are mapped to 1/0; any remaining object (string)
    columns are one-hot encoded with the column name as prefix; numeric
    columns pass through unchanged.

    :param:

     - `frame`: data frame to transform
    :return: data frame with categorical values changed to numeric
    """
    new_features = pandas.DataFrame(index=frame.index)
    # .items() -- DataFrame.iteritems() was removed in pandas 2.0
    for column, column_data in frame.items():
        if column_data.dtype == object:
            # binary yes/no columns map directly to 1/0
            column_data = column_data.replace(['yes', 'no'], [1, 0])
        if column_data.dtype == object:
            # still non-numeric: one-hot encode the categories
            column_data = pandas.get_dummies(column_data, prefix=column)
        new_features = new_features.join(column_data)
    return new_features
In [69]:
# All-numeric copy of the data: yes/no -> 1/0, strings one-hot encoded.
numeric_data = to_numeric(student_data)
In [70]:
# Fraction of students who passed; reused below as a class weight.
# Series.mean() replaces the manual sum/len computation (same value).
passing_ratio = numeric_data.passed.mean()
In [71]:
# Split off the target -- assumes 'passed' is the last column of the
# numeric frame (it is the last entry in the original CSV's columns).
features = numeric_data[numeric_data.columns[:-1]]
target = numeric_data['passed']
In [92]:
# Hold out TRAIN_SIZE students for training, the rest for testing.
# (Constant replaces the magic number 300 that appeared twice.)
TRAIN_SIZE = 300
x_train, x_test, y_train, y_test = train_test_split(
    features, target,
    train_size=TRAIN_SIZE,
    test_size=numeric_data.shape[0] - TRAIN_SIZE,
    random_state=0)
# Bundle and pickle the split so other code can reuse the exact same data.
TrainTestDataOne = namedtuple('TrainTestDataOne',
                              'X_train X_test y_train y_test'.split())
save_data = TrainTestDataOne(x_train, x_test, y_train, y_test)
with open('saved_data.pkl', 'wb') as pickler:
    pickle.dump(save_data, pickler)
In [73]:
# Baseline logistic regression, plus an L1-penalized variant that
# chooses its own regularization strength via 10-fold cross-validation.
model = LogisticRegression()
cv_model = LogisticRegressionCV(cv=10, n_jobs=-1, penalty='l1', solver='liblinear')
In [74]:
# Train the baseline logistic regression on the 300-student split.
model.fit(x_train, y_train)
Out[74]:
In [75]:
# Fit the cross-validated L1 logistic regression on the same split.
cv_model.fit(x_train, y_train)
Out[75]:
In [76]:
# Reuse the stored predictions instead of calling model.predict twice.
predictions = model.predict(x_test)
f1_score(y_test.values, predictions, pos_label=1)
Out[76]:
In [77]:
# Held-out F1 for the cross-validated model, for comparison.
f1_score(y_test.values, cv_model.predict(x_test), pos_label=1)
Out[77]:
In [78]:
# Wrap f1_score so GridSearchCV can use it as its scoring function.
scorer = make_scorer(f1_score)
In [79]:
def fit_grid(c_range, penalty=('l1', 'l2')):
    """Grid-search the logistic regression over C, penalty and class weight.

    Uses the module-level `model`, `scorer`, `passing_ratio`, `x_train`
    and `y_train`.

    :param:

     - `c_range`: iterable of C (inverse regularization) values to try
     - `penalty`: penalties to include in the search
    :return: fitted GridSearchCV instance
    """
    search_space = {
        'penalty': penalty,
        'C': c_range,
        'class_weight': [None, 'balanced',
                         {1: passing_ratio, 0: 1 - passing_ratio}],
    }
    searcher = GridSearchCV(model, param_grid=search_space,
                            scoring=scorer, cv=10, n_jobs=-1)
    # silence convergence chatter during the (parallel) search
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        return searcher.fit(x_train, y_train)
In [80]:
# Coarse sweep over C from 0.01 up to 1.06 in steps of 0.05.
grid_01 = fit_grid(numpy.arange(.01, 1.1, .05))
In [81]:
def print_columns(grid):
    """Print a table of the non-zero model coefficients, largest first.

    Each row shows the column name, its description from `feature_map`
    (or a reconstruction from the dummy-column name), the coefficient,
    and the corresponding odds ratio (exp of the coefficient).

    :param:

     - `grid`: fitted GridSearchCV whose best estimator is a linear model
    """
    coefficients = grid.best_estimator_.coef_[0]
    odds = numpy.exp(coefficients)
    # Iterate over argsort indices rather than sorted values: the old
    # numpy.where(coefficients == value) lookup sent every tied value to
    # the first matching column, mislabeling duplicates.
    descending = numpy.argsort(coefficients)[::-1]
    rows = []
    for index in descending:
        coefficient = coefficients[index]
        if abs(coefficient) > 0:
            column_name = x_train.columns[index]
            description = (feature_map[column_name]
                           if column_name in feature_map
                           else ' = '.join(column_name.split('_')))
            rows.append([column_name, description,
                         "{0:.2f}".format(coefficient),
                         '{0:.2f}'.format(odds[index])])
    print(tabulate(rows, headers='Variable Description Coefficient Odds'.split()))
In [82]:
def print_best(grid):
    """Report a fitted grid search: best parameters, test F1, coefficients.

    :param:

     - `grid`: fitted GridSearchCV instance
    """
    print("parameters")
    print("==========\n")
    best = grid.best_params_.items()
    print(tabulate(best, headers='Parameter Value'.split()))
    print('\nF1 score')
    print('========\n')
    test_f1 = grid.score(x_test, y_test)
    print("{0:.2f}".format(test_f1))
    print('\nCoefficients')
    print('============\n')
    print_columns(grid)
In [83]:
# Results of the coarse C sweep.
print_best(grid_01)
In [84]:
# Same C sweep restricted to the L1 penalty only.
grid_l1 = fit_grid(numpy.arange(.01, 1.1, .05), penalty=['l1'])
print_best(grid_l1)
The previous model seems to be overfitted. (Note: after changing the train-test split to include more training data, these results all changed.)
In [85]:
# Repeat the sweep with C starting at 0.05 instead of 0.01.
grid_05 = fit_grid(numpy.arange(.05, 1.1, .05))
print_best(grid_05)
In [86]:
# Coarser sweep: C from 0.1 to 1.0 in steps of 0.1.
grid_1 = fit_grid(numpy.arange(.1, 1.1, .1))
print_best(grid_1)
In [87]:
# Fine-grained sweep around C in [0.3, 0.45].
grid_4 = fit_grid(numpy.arange(.3, .5, .05))
print_best(grid_4)
In [88]:
from sklearn.ensemble import RandomForestClassifier
In [89]:
# Baseline random forest with default hyper-parameters.
# NOTE(review): no random_state is set, so this score varies between
# runs -- consider RandomForestClassifier(random_state=0).
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
# y_true first, matching the earlier f1_score calls (F1 is symmetric in
# its two arguments, so the value itself is unchanged).
f1_score(y_test.values, forest.predict(x_test))
Out[89]:
In [90]:
# Grid-search the forest over tree count and feature-subset size.
forest_parameters = {'n_estimators': range(5, 20), 'max_features': range(5, len(x_train.columns))}
g = GridSearchCV(forest, param_grid=forest_parameters, scoring=scorer, cv=10, n_jobs=-1)
g.fit(x_train, y_train)
Out[90]:
In [91]:
# Test-set F1 and the winning forest configuration.
print(g.score(x_test, y_test))
print(g.best_estimator_)
In [96]:
%matplotlib inline
# Pass counts by mother's education level (Medu).
seaborn.countplot(x='Medu', hue='passed', data=student_data)
Out[96]:
In [97]:
# Pass counts by quality of family relationships (famrel).
seaborn.countplot(x='famrel', hue='passed', data=student_data)
Out[97]:
In [98]:
# Pass counts by father's education level (Fedu).
seaborn.countplot(x='Fedu', hue='passed', data=student_data)
Out[98]:
In [101]:
# Age distribution within each outcome (axes flipped vs the plots above).
seaborn.countplot(x='passed', hue='age', data=student_data)
Out[101]:
In [116]:
# KDE rather than a countplot: absences take many distinct values, so
# overlaid densities are easier to compare than bars.
axe = seaborn.kdeplot(student_data[student_data.passed=='yes'].absences, label='passed')
axe = seaborn.kdeplot(student_data[student_data.passed=='no'].absences, ax=axe, label="didn't pass")
In [117]:
# Pass counts by how often the student goes out with friends (goout).
seaborn.countplot(x='goout', hue='passed', data=student_data)
Out[117]:
In [118]:
# Pass counts by number of past class failures.
seaborn.countplot(x='failures', hue='passed', data=student_data)
Out[118]:
In [122]:
# Numeric pass/fail column so the overall pass rate can be plotted.
pass_fail = {'yes': 1, 'no': 0}
student_data['passed_numeric'] = student_data.passed.map(pass_fail)
# Keyword argument: seaborn >= 0.12 interprets the first positional
# argument as `data`, not `x`.
seaborn.barplot(x=student_data.passed_numeric)
Out[122]:
In [124]:
# Raw counts of passing vs failing students.
passed_counts = student_data.passed.value_counts()
print(passed_counts)
In [133]:
# Normalize the counts to proportions of the whole class.
passed_proportions = passed_counts/len(student_data.passed)
Out[133]:
In [135]:
# DataFrame.from_dict raises "If using all scalar values, you must pass
# an index" when the dict values are scalars -- wrap them in lists to
# build a one-row frame instead.
proportions = pandas.DataFrame.from_dict({"yes": [passed_proportions.loc['yes']],
                                          "no": [passed_proportions.loc['no']]})