In [14]:
import matplotlib.pyplot as plot
import numpy
import pandas
import pickle
import plotly
import plotly.tools as tls

import seaborn
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, make_scorer
from common import (TrainTestData, train_test_path, feature_map,
                    student_data, print_image_directive)

In [2]:
# Load the raw student records from disk.
# NOTE(review): this re-binds `student_data`, shadowing the `student_data`
# imported from `common` in the first cell -- confirm which one is intended.
student_data = pandas.read_csv('student-data.csv')

In [3]:
# Separate the feature matrix from the target labels.
feature_columns = list(student_data.columns[:-1])  # every column except the last
target_column = student_data.columns[-1]  # the final column holds the pass/fail label
X_all = student_data[feature_columns]  # feature values for all students
# Encode the 'yes'/'no' labels as 1/0 in a single step.
y_all = student_data[target_column].replace(['yes', 'no'], [1, 0])

In [4]:
def preprocess_features(X):
    """
    Converts categorical data to numeric.

    Binary 'yes'/'no' columns are mapped to 1/0; any column that is still
    non-numeric afterwards is expanded into one dummy (indicator) column
    per category, prefixed with the original column name
    (e.g. 'school' => 'school_GP', 'school_MS').

    :param:
     - `X`: dataframe of data
    :return: dataframe with the same index, yes/no changed to 1/0,
             other categoricals changed to dummies
    """
    outX = pandas.DataFrame(index=X.index)

    # Check each column.  Note: `.iteritems()` was removed in pandas 2.0;
    # `.items()` is the supported spelling.
    for col, col_data in X.items():
        # If data type is non-numeric, replace all yes/no values with 1/0.
        # `.infer_objects()` downcasts the resulting object column to int --
        # newer pandas no longer downcasts inside `.replace()` automatically.
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0]).infer_objects()

        # If still non-numeric, convert to one or more dummy variables.
        if col_data.dtype == object:
            col_data = pandas.get_dummies(col_data, prefix=col)

        outX = outX.join(col_data)  # join handles both Series and DataFrame
    return outX
# Replace the raw features with their fully numeric encoding.
X_all = preprocess_features(X_all)

In [5]:
# Hold out 95 of the 395 students as a test set (roughly 75/25 split).
num_all = student_data.shape[0]  # total number of student records
assert num_all == 395, "Expected: 395 Actual: {0}".format(num_all)
num_train = 300
num_test = num_all - num_train
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    train_size=num_train,
    test_size=num_test,
    random_state=500)

In [6]:
# Fit an L1-penalized logistic regression, choosing C by 10-fold CV.
model_cv = LogisticRegressionCV(cv=10, penalty='l1', solver='liblinear')
model_cv.fit(X_train, y_train)
predictions = model_cv.predict(X_test)
print(f1_score(predictions, y_test))


0.768115942029

In [7]:
# Regularization strength selected by the cross-validation above.
model_cv.C_


Out[7]:
array([ 0.35938137])

In [8]:
# Re-fit a plain LogisticRegression using the CV-selected C value.
model = LogisticRegression(C=model_cv.C_[0], penalty='l1')
model.fit(X_train, y_train)


Out[8]:
LogisticRegression(C=0.35938136638046259, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [9]:
# NOTE(review): arguments are passed as (y_pred, y_true), but f1_score's
# signature is (y_true, y_pred).  Swapping them exchanges precision and
# recall, which leaves binary F1 unchanged, but the order should be fixed.
f1_score(model.predict(X_test), y_test)


Out[9]:
0.76811594202898559

In [10]:
# Exploratory: log of a candidate range of C values for the grid search.
numpy.log(numpy.arange(.05, .4, .05))


Out[10]:
array([-2.99573227, -2.30258509, -1.89711998, -1.60943791, -1.38629436,
       -1.2039728 , -1.04982212])

In [11]:
# Grid-search both penalty types and a fine range of C values,
# scoring each candidate by F1 with 10-fold cross-validation.
scorer = make_scorer(f1_score)
model = LogisticRegression()
parameters = {'C': numpy.arange(.01, 1, .01),
              'penalty': ['l1', 'l2']}

grid = GridSearchCV(model,
                    param_grid=parameters,
                    scoring=scorer,
                    cv=10,
                    n_jobs=-1)
grid.fit(X_train.values, y_train)


Out[11]:
GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([ 0.01,  0.02, ...,  0.98,  0.99])},
       pre_dispatch='2*n_jobs', refit=True, scoring=make_scorer(f1_score),
       verbose=0)

In [12]:
# Best hyper-parameters found by the grid search.
grid.best_params_


Out[12]:
{'C': 0.040000000000000001, 'penalty': 'l2'}

In [13]:
# F1 score of the refit best estimator on the held-out test set.
grid.score(X_test, y_test)


/home/cronos/.virtualenvs/student_intervention/local/lib/python2.7/site-packages/sklearn/grid_search.py:418: ChangedBehaviorWarning: The long-standing behavior to use the estimator's score function in GridSearchCV.score has changed. The scoring parameter is now used.
  ChangedBehaviorWarning)
Out[13]:
0.80555555555555558

## Plotly


In [3]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline

In [5]:
# `plotly` is already imported in the first cell, so the redundant
# in-cell import has been dropped.  The print statement is rewritten as
# a function call for consistency with the print() calls used elsewhere
# in this notebook (and forward compatibility with Python 3).
print(plotly.__version__)           # version 1.9.4 or later required
plotly.offline.init_notebook_mode() # run at the start of every notebook
plotly.offline.iplot({
    "data": [{
        "x": [1, 2, 3],
        "y": [4, 2, 5]
    }],
    "layout": {
        "title": "hello world"
    }
})


1.9.6

In [9]:
# Fraction of students in each outcome class (pass vs. fail).
total_students = student_data.passed.count()
passing_rates = student_data.passed.value_counts() / total_students
print(passing_rates)


yes    0.670886
no     0.329114
Name: passed, dtype: float64

In [17]:
# Plot the pass/fail proportions as a bar chart, then hand the
# matplotlib figure to plotly for interactive inline rendering.
seaborn.set_style('whitegrid')
fig = plot.figure()
bar_axes = seaborn.barplot(x=passing_rates.index,
                           y=passing_rates.values,
                           ax=fig.gca())
bar_axes.set_title("Proportion of Passing Students")
converted_figure = tls.mpl_to_plotly(fig)
plotly.offline.iplot(converted_figure)



In [ ]: