experiments tuning Logistic Regression with GridSearch


In [93]:
# python standard library
import warnings
from collections import namedtuple
import pickle

# third-party
import numpy
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, make_scorer
import pandas
from tabulate import tabulate
import seaborn
# this code
#from student_intervention.common import feature_map

In [66]:
feature_map = {"school": "student's school",
               "sex": "student's sex",
               "age": "student's age",
               "address": "student's home address type",
               "famsize": "family size",
               "Pstatus": "parent's cohabitation status",
               "Medu": "mother's education",
               "Fedu": "father's education",
               "Mjob": "mother's job",
               "Fjob": "father's job",
               "reason": "reason to choose this school",
               "guardian": "student's guardian",
               "traveltime": "home to school travel time",
               "studytime": "weekly study time",
               "failures": "number of past class failures",
               "schoolsup": "extra educational support",
               "famsup": "family educational support",
               "paid": "extra paid classes within the course subject (Math or Portuguese)",
               "activities": "extra-curricular activities",
               "nursery": "attended nursery school",
               "higher": "wants to take higher education",
               "internet": "Internet access at home",
               "romantic": "within a romantic relationship",
               "famrel": "quality of family relationships",
               "freetime": "free time after school",
               "goout": "going out with friends",
               "Dalc": "workday alcohol consumption",
               "Walc": "weekend alcohol consumption",
               "health": "current health status",
               "absences": "number of school absences",
               "passed": "did the student pass the final exam"}

In [67]:
student_data = pandas.read_csv('student-data.csv')

In [68]:
def to_numeric(frame):
    """
    :param:
     - `frame`: data frame to transform
    :return: data frame with categorical values changed to numeric
    """
    new_features = pandas.DataFrame(index = frame.index)
    for column, column_data in frame.iteritems():
        if column_data.dtype == object:
            column_data = column_data.replace(['yes', 'no'], [ 1, 0])
        if column_data.dtype == object:
            column_data = pandas.get_dummies(column_data, prefix=column)
        new_features = new_features.join(column_data)
    return new_features

In [69]:
numeric_data = to_numeric(student_data)

In [70]:
passing_ratio = sum(numeric_data.passed)/float(len(numeric_data.passed))

In [71]:
features = numeric_data[numeric_data.columns[:-1]]
target = numeric_data['passed']

In [92]:
x_train, x_test, y_train, y_test = train_test_split(features, target,
                                                    train_size=300,
                                                    test_size=numeric_data.shape[0] - 300,
                                                    random_state=0)
TrainTestDataOne = namedtuple('TrainTestDataOne', 'X_train X_test y_train y_test'.split())
save_data = TrainTestDataOne(x_train, x_test, y_train, y_test)
with open('saved_data.pkl', 'wb') as pickler:
    pickle.dump(save_data, pickler)

In [73]:
model = LogisticRegression()
cv_model = LogisticRegressionCV(cv=10, n_jobs=-1, penalty='l1', solver='liblinear')

In [74]:
model.fit(x_train, y_train)


Out[74]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [75]:
cv_model.fit(x_train, y_train)


Out[75]:
LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l1', random_state=None,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [76]:
predictions = model.predict(x_test)
f1_score(y_test.values, model.predict(x_test), pos_label=1)


Out[76]:
0.79104477611940294

In [77]:
f1_score(y_test.values, cv_model.predict(x_test), pos_label=1)


Out[77]:
0.79999999999999982

In [78]:
scorer = make_scorer(f1_score)

In [79]:
def fit_grid(c_range, penalty=('l1', 'l2')):
    parameters = {'penalty': penalty,
    'C': c_range,
    'class_weight': [None, 'balanced', {1:passing_ratio, 0: 1-passing_ratio}]}
    grid = GridSearchCV(model, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        return grid.fit(x_train, y_train)

In [80]:
grid_01 = fit_grid(numpy.arange(.01, 1.1, .05))

In [81]:
def print_columns(grid):
    coefficients = grid.best_estimator_.coef_[0]
    odds = numpy.exp(coefficients)
    sorted_coefficients = sorted((column for column in coefficients), reverse=True)
    rows = []
    for coefficient in sorted_coefficients:
        if abs(coefficient) > 0:
            index = numpy.where(coefficients == coefficient)[0][0]
            column_name = x_train.columns[index]
            description = feature_map[column_name] if column_name in feature_map else ' = '.join(column_name.split('_'))
            rows.append([column_name, description, "{0:.2f}".format(coefficient), '{0:.2f}'.format(odds[index])])
    print(tabulate(rows, headers='Variable Description Coefficient Odds'.split()))

In [82]:
def print_best(grid):
    print("parameters")
    print("==========\n")
    print(tabulate(grid.best_params_.items(), headers='Parameter Value'.split()))
    print('\nF1 score')
    print('========\n')
    print("{0:.2f}".format(grid.score(x_test, y_test)))
    print('\nCoefficients')
    print('============\n')
    print_columns(grid)

In [83]:
print_best(grid_01)


parameters
==========

Parameter     Value
------------  ------------------------------------------------
penalty       l1
C             0.61
class_weight  {0: 0.32911392405063289, 1: 0.67088607594936711}

F1 score
========

0.79

Coefficients
============

Variable       Description                        Coefficient    Odds
-------------  -------------------------------  -------------  ------
studytime      weekly study time                         0.25    1.28
Walc           weekend alcohol consumption               0.22    1.24
address_U      address = U                               0.18    1.2
Mjob_services  Mjob = services                           0.12    1.13
Medu           mother's education                        0.1     1.1
Fedu           father's education                        0.07    1.08
age            student's age                             0.07    1.07
famrel         quality of family relationships           0.06    1.06
internet       Internet access at home                   0.03    1.03
sex_M          sex = M                                   0.01    1.01
health         current health status                    -0.01    0.99
absences       number of school absences                -0.03    0.97
famsup         family educational support               -0.06    0.94
goout          going out with friends                   -0.33    0.72
failures       number of past class failures            -0.73    0.48

In [84]:
grid_l1 = fit_grid(numpy.arange(.01, 1.1, .05), penalty=['l1'])
print_best(grid_l1)


parameters
==========

Parameter     Value
------------  ------------------------------------------------
penalty       l1
C             0.61
class_weight  {0: 0.32911392405063289, 1: 0.67088607594936711}

F1 score
========

0.79

Coefficients
============

Variable       Description                        Coefficient    Odds
-------------  -------------------------------  -------------  ------
studytime      weekly study time                         0.25    1.28
Walc           weekend alcohol consumption               0.22    1.24
address_U      address = U                               0.18    1.2
Mjob_services  Mjob = services                           0.12    1.13
Medu           mother's education                        0.1     1.1
Fedu           father's education                        0.07    1.08
age            student's age                             0.07    1.07
famrel         quality of family relationships           0.06    1.06
internet       Internet access at home                   0.03    1.03
sex_M          sex = M                                   0.01    1.01
health         current health status                    -0.01    0.99
absences       number of school absences                -0.03    0.97
famsup         family educational support               -0.06    0.94
goout          going out with friends                   -0.33    0.72
failures       number of past class failures            -0.73    0.48

Better

The previous model seems to be overfitted. (actually I changed the train-test split to have more training data and this all changed).


In [85]:
grid_05 = fit_grid(numpy.arange(.05, 1.1, .05))
print_best(grid_05)


parameters
==========

Parameter     Value
------------  ------------------------------------------------
penalty       l1
C             0.6
class_weight  {0: 0.32911392405063289, 1: 0.67088607594936711}

F1 score
========

0.79

Coefficients
============

Variable       Description                        Coefficient    Odds
-------------  -------------------------------  -------------  ------
studytime      weekly study time                         0.24    1.27
Walc           weekend alcohol consumption               0.22    1.24
address_U      address = U                               0.18    1.19
Mjob_services  Mjob = services                           0.12    1.12
Medu           mother's education                        0.1     1.1
Fedu           father's education                        0.07    1.07
age            student's age                             0.07    1.07
famrel         quality of family relationships           0.06    1.06
internet       Internet access at home                   0.02    1.02
sex_M          sex = M                                   0.01    1.01
health         current health status                    -0.01    0.99
absences       number of school absences                -0.03    0.97
famsup         family educational support               -0.05    0.95
goout          going out with friends                   -0.33    0.72
failures       number of past class failures            -0.73    0.48

In [86]:
grid_1 = fit_grid(numpy.arange(.1, 1.1, .1))
print_best(grid_1)


parameters
==========

Parameter     Value
------------  ------------------------------------------------
penalty       l1
C             0.6
class_weight  {0: 0.32911392405063289, 1: 0.67088607594936711}

F1 score
========

0.79

Coefficients
============

Variable       Description                        Coefficient    Odds
-------------  -------------------------------  -------------  ------
studytime      weekly study time                         0.24    1.27
Walc           weekend alcohol consumption               0.22    1.24
address_U      address = U                               0.18    1.19
Mjob_services  Mjob = services                           0.12    1.12
Medu           mother's education                        0.1     1.1
Fedu           father's education                        0.07    1.07
age            student's age                             0.07    1.07
famrel         quality of family relationships           0.06    1.06
internet       Internet access at home                   0.02    1.02
sex_M          sex = M                                   0.01    1.01
health         current health status                    -0.01    0.99
absences       number of school absences                -0.03    0.97
famsup         family educational support               -0.05    0.95
goout          going out with friends                   -0.33    0.72
failures       number of past class failures            -0.73    0.48

In [87]:
grid_4 = fit_grid(numpy.arange(.3, .5, .05))
print_best(grid_4)


parameters
==========

Parameter     Value
------------  -------
penalty       l1
C             0.3
class_weight

F1 score
========

0.79

Coefficients
============

Variable       Description                        Coefficient    Odds
-------------  -------------------------------  -------------  ------
studytime      weekly study time                         0.26    1.29
Mjob_services  Mjob = services                           0.23    1.26
Walc           weekend alcohol consumption               0.2     1.23
address_U      address = U                               0.18    1.2
internet       Internet access at home                   0.12    1.12
sex_M          sex = M                                   0.11    1.12
Fedu           father's education                        0.09    1.1
Medu           mother's education                        0.09    1.09
famrel         quality of family relationships           0.05    1.05
Mjob_health    Mjob = health                             0.04    1.04
age            student's age                             0.04    1.04
absences       number of school absences                -0.03    0.97
health         current health status                    -0.03    0.97
Mjob_teacher   Mjob = teacher                           -0.06    0.94
famsup         family educational support               -0.15    0.86
goout          going out with friends                   -0.36    0.7
failures       number of past class failures            -0.76    0.47

Random Forests


In [88]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
f1_score(forest.predict(x_test), y_test)


Out[89]:
0.78195488721804507

In [90]:
forest_parameters = {'n_estimators': range(5, 20), 'max_features': range(5, len(x_train.columns))}
g = GridSearchCV(forest, param_grid=forest_parameters, scoring=scorer, cv=10, n_jobs=-1)
g.fit(x_train, y_train)


Out[90]:
GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]},
       pre_dispatch='2*n_jobs', refit=True, scoring=make_scorer(f1_score),
       verbose=0)

In [91]:
print(g.score(x_test, y_test))
print(g.best_estimator_)


0.788321167883
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=22, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=19, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [96]:
%matplotlib inline
seaborn.countplot(x='Medu', hue='passed', data=student_data)


<matplotlib.figure.Figure at 0x7f78151b1a50>
Out[96]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f78151a7d90>

In [97]:
seaborn.countplot(x='famrel', hue='passed', data=student_data)


<matplotlib.figure.Figure at 0x7f78151132d0>
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f78150e7c90>

In [98]:
seaborn.countplot(x='Fedu', hue='passed', data=student_data)


<matplotlib.figure.Figure at 0x7f78148a2c50>
Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f781489ac90>

In [101]:
seaborn.countplot(x='passed', hue='age', data=student_data)


<matplotlib.figure.Figure at 0x7f78146a8b90>
Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f78147574d0>

In [116]:
#axe = seaborn.countplot(x='absences', hue='passed', data=student_data)
axe = seaborn.kdeplot(student_data[student_data.passed=='yes'].absences, label='passed')
axe = seaborn.kdeplot(student_data[student_data.passed=='no'].absences, ax=axe, label="didn't pass")


<matplotlib.figure.Figure at 0x7f780fbc7510>

In [117]:
seaborn.countplot(x='goout', hue='passed', data=student_data)


<matplotlib.figure.Figure at 0x7f780f07d490>
Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f780f057fd0>

In [118]:
seaborn.countplot(x='failures', hue='passed', data=student_data)


<matplotlib.figure.Figure at 0x7f780f034410>
Out[118]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f780f13d350>

In [122]:
pass_fail  = {'yes': 1, 'no':0}
student_data['passed_numeric'] = student_data.passed.map(pass_fail)
seaborn.barplot(student_data.passed_numeric)


<matplotlib.figure.Figure at 0x7f78240b63d0>
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f780f1e2590>

In [124]:
passed_counts = student_data.passed.value_counts()
print(passed_counts)


yes    265
no     130
Name: passed, dtype: int64

In [133]:
passed_proportions = passed_counts/len(student_data.passed)


Out[133]:
0.67088607594936711

In [135]:
proportions = pandas.DataFrame.from_dict({"yes": passed_proportions.loc['yes'],
                                          "no":passed_proportions.loc['no']})


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-135-57463c4add72> in <module>()
      1 proportions = pandas.DataFrame.from_dict({"yes": passed_proportions.loc['yes'],
----> 2 "no":passed_proportions.loc['no']})

/home/charon/.virtualenvs/machinelearning/local/lib/python2.7/site-packages/pandas/core/frame.pyc in from_dict(cls, data, orient, dtype)
    802             raise ValueError('only recognize index or columns for orient')
    803 
--> 804         return cls(data, index=index, columns=columns, dtype=dtype)
    805 
    806     @deprecate_kwarg(old_arg_name='outtype', new_arg_name='orient')

/home/charon/.virtualenvs/machinelearning/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    224                                  dtype=dtype, copy=copy)
    225         elif isinstance(data, dict):
--> 226             mgr = self._init_dict(data, index, columns, dtype=dtype)
    227         elif isinstance(data, ma.MaskedArray):
    228             import numpy.ma.mrecords as mrecords

/home/charon/.virtualenvs/machinelearning/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    361 
    362         return _arrays_to_mgr(arrays, data_names, index, columns,
--> 363                               dtype=dtype)
    364 
    365     def _init_ndarray(self, values, index, columns, dtype=None,

/home/charon/.virtualenvs/machinelearning/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5156     # figure out the index, if necessary
   5157     if index is None:
-> 5158         index = extract_index(arrays)
   5159     else:
   5160         index = _ensure_index(index)

/home/charon/.virtualenvs/machinelearning/local/lib/python2.7/site-packages/pandas/core/frame.pyc in extract_index(data)
   5195 
   5196         if not indexes and not raw_lengths:
-> 5197             raise ValueError('If using all scalar values, you must pass'
   5198                              ' an index')
   5199 

ValueError: If using all scalar values, you must pass an index