In [14]:
import matplotlib.pyplot as plt
import numpy
import pandas
import pickle
import plotly
import plotly.tools as tls
import seaborn
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, make_scorer
from common import (TrainTestData, train_test_path, feature_map,
                    student_data, print_image_directive)
In [2]:
student_data = pandas.read_csv('student-data.csv')
In [3]:
feature_columns = list(student_data.columns[:-1]) # all columns but last are features
target_column = student_data.columns[-1] # last column is the target/label
X_all = student_data[feature_columns] # feature values for all students
y_all = student_data[target_column] # corresponding targets/labels
y_all = y_all.replace(['yes', 'no'], [1, 0])
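As a quick sanity check (not part of the original run), the encoded labels can be inspected to confirm the yes/no mapping and see the class balance:
In [ ]:
# added sketch: class balance of the encoded target (1 = passed, 0 = did not)
print(y_all.value_counts())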
In [4]:
def preprocess_features(X):
    """
    Converts categorical data to numeric

    :param:
     - `X`: dataframe of data
    :return: data with yes/no changed to 1/0, others changed to dummies
    """
    outX = pandas.DataFrame(index=X.index)
    # Check each column
    for col, col_data in X.items():
        # If data type is non-numeric, try to replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
            # Note: This should change the data type for yes/no columns to int
        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pandas.get_dummies(col_data, prefix=col)  # e.g. 'school' => 'school_GP', 'school_MS'
        outX = outX.join(col_data)  # collect column(s) in output dataframe
    return outX
X_all = preprocess_features(X_all)
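Because get_dummies expands each remaining categorical column into one indicator column per level, the processed frame has more columns than the raw one. A small added check makes the expansion visible:
In [ ]:
# added sketch: compare raw vs. dummy-encoded feature counts
print("raw feature columns: {0}".format(len(feature_columns)))
print("processed feature columns: {0}".format(X_all.shape[1]))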
In [5]:
num_all = student_data.shape[0] # same as len(student_data)
assert num_all == 395, "Expected: 395 Actual: {0}".format(num_all)
num_train = 300 # about 75% of the data
num_test = num_all - num_train
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                    test_size=num_test,
                                                    train_size=num_train,
                                                    random_state=500)
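This split is random rather than stratified, so the pass/fail ratio can drift between the two sets. If that matters, train_test_split accepts a stratify argument; a variant sketch (not the split used below):
In [ ]:
# added sketch (assumes stratification is wanted): keep the class ratio
# identical in the train and test sets
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all,
                                          test_size=num_test,
                                          train_size=num_train,
                                          random_state=500,
                                          stratify=y_all)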
In [6]:
model_cv = LogisticRegressionCV(penalty='l1', cv=10, solver='liblinear')
model_cv.fit(X_train, y_train)
print(f1_score(y_test, model_cv.predict(X_test)))
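LogisticRegressionCV also records the per-fold validation scores along its regularization path (accuracy by default), which shows how sensitive the fit is to C. A brief added inspection:
In [ ]:
# added sketch: scores_ maps each class to a (n_folds, n_Cs) array;
# for this binary problem the positive class 1 is the key
fold_scores = model_cv.scores_[1]
print("mean CV accuracy per C value:")
print(fold_scores.mean(axis=0))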
In [7]:
model_cv.C_
Out[7]:
In [8]:
model = LogisticRegression(C=model_cv.C_[0], penalty='l1', solver='liblinear')
model.fit(X_train, y_train)
Out[8]:
In [9]:
f1_score(y_test, model.predict(X_test))
Out[9]:
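One benefit of the L1 penalty is that it drives uninformative coefficients exactly to zero, so the surviving features can be read straight off the fitted model (an added inspection, not original output):
In [ ]:
# added sketch: list the features the L1 penalty kept (nonzero weights)
coefficients = pandas.Series(model.coef_[0], index=X_all.columns)
print(coefficients[coefficients != 0].sort_values())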
In [10]:
numpy.log(numpy.arange(.05, .4, .05))
Out[10]:
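If a logarithmically spaced grid for C is preferred over the linear arange used below, numpy.logspace is the usual tool (an illustrative alternative only):
In [ ]:
# added sketch: 10 C values evenly spaced on a log scale from 0.01 to 1
print(numpy.logspace(-2, 0, 10))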
In [11]:
scorer = make_scorer(f1_score)
model = LogisticRegression(solver='liblinear')
parameters = {'penalty': ['l1', 'l2'],
              'C': numpy.arange(.01, 1, .01)}
grid = GridSearchCV(model, param_grid=parameters, scoring=scorer, cv=10, n_jobs=-1)
grid.fit(X_train.values, y_train)
Out[11]:
In [12]:
grid.best_params_
Out[12]:
In [13]:
grid.score(X_test, y_test)
Out[13]:
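Besides best_params_ and score, the fitted grid exposes the winning cross-validation score and the refit best estimator, which can be evaluated directly (an added sketch, assuming the model_selection API):
In [ ]:
# added sketch: best CV score and test-set F1 of the refit winner
print(grid.best_score_)
best_model = grid.best_estimator_
print(f1_score(y_test, best_model.predict(X_test)))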
In [3]:
%matplotlib inline
In [5]:
import plotly
print(plotly.__version__)  # version 1.9.4 required
plotly.offline.init_notebook_mode() # run at the start of every notebook
plotly.offline.iplot({
    "data": [{
        "x": [1, 2, 3],
        "y": [4, 2, 5]
    }],
    "layout": {
        "title": "hello world"
    }
})
In [9]:
passing_rates = student_data.passed.value_counts()/student_data.passed.count()
print(passing_rates)
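For context (an added calculation, not in the original output), a majority-class baseline that predicts every student passes sets the F1 score a real model has to beat:
In [ ]:
# added sketch: F1 of always predicting 'passed' (the majority class)
baseline = numpy.ones(len(y_test), dtype=int)
print(f1_score(y_test, baseline))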
In [17]:
seaborn.set_style('whitegrid')
figure = plt.figure()
axe = figure.gca()
axe = seaborn.barplot(x=passing_rates.index, y=passing_rates.values, ax=axe)
title = axe.set_title("Proportion of Passing Students")
plotly_fig = tls.mpl_to_plotly(figure)
plotly.offline.iplot(plotly_fig)  # renders inline; offline iplot returns no URL