In [1]:
%matplotlib inline
with open( 'SnowDays.csv' ) as f:
data = [ line[:-1].split(',') for line in f.readlines() ]
X_text = [ row[:-1] for row in data ]
Y_text = [ row[-1] for row in data ]
snowkey = { 'Light': 0, 'Medium': 1, 'Heavy': 2 }
tfkey = { 'TRUE': True, 'FALSE': False }
X = [ [ snowkey[el] for el in row ] for row in X_text ]
Y = [ tfkey[el] for el in Y_text ]
featureNames = [ 'Previous morning', 'Previous day', 'Previous night', 'Early morning' ]
In [2]:
# Emit the raw dataset as rows of an HTML table (handy for pasting into
# slides or course notes).
for row in data:
    cells = ''.join('<td>' + el + '</td>' for el in row)
    print('<tr>' + cells + '</tr>')
In [3]:
from sklearn import tree

# Fit a depth-1 "decision stump" on the encoded data, splitting by
# information gain (entropy).
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=1).fit(X, Y)
In [8]:
# Inspect the fitted tree's internal structure: for each node, the index of
# its left child (in sklearn, leaves are conventionally marked with -1 —
# verify against the sklearn tree docs).  Bare expression so the array
# renders as the cell's output.
clf.tree_.children_left
Out[8]:
In [5]:
# Render the fitted decision tree as an inline PNG via graphviz/pydot.
# FIX: sklearn.externals.six was deprecated and removed from scikit-learn;
# the standard library's io.StringIO is the direct replacement.
from io import StringIO
import pydot
from IPython.display import Image

dot_data = StringIO()
# FIX: class_names must be ordered to match clf.classes_, which for boolean
# labels is [False, True]; the original passed tfkey.keys() (arbitrary /
# insertion order, 'TRUE' first), which could swap the leaf labels.
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=featureNames,
                     class_names=['FALSE', 'TRUE'],
                     filled=True, rounded=True,
                     special_characters=True)
# FIX: pydot.graph_from_dot_data returns a *list* of graphs (pydot >= 1.2);
# unpack the single graph before rendering.
(graph,) = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
In [6]:
# Predictions of the stump on the training data itself (resubstitution —
# not a measure of generalization).  Bare expression so the array renders
# as the cell's output.
clf.predict(X)
Out[6]:
In [7]:
# Convert the True/False labels to 1/0 so they can feed a linear regression.
Y_num = [int(el) for el in Y]
# FIX: the original cell ended with two bare expressions (`Y_num` then `X`);
# only the LAST expression of a cell is rendered, so Y_num was silently
# discarded.  Print it explicitly and let X render as the cell output.
print(Y_num)
X
Out[7]:
In [8]:
# Linear regression fit AND scored on the full dataset (no train/test
# split), so these numbers measure resubstitution fit, not generalization —
# compare with the held-out version in the next regression cell.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model

# Create and train the linear regression model.
regr = linear_model.LinearRegression()
regr.fit(X, Y_num)

# The learned coefficients (one per feature).
print('Coefficients: \n', regr.coef_)
# FIX: the quantity below is the *mean* squared error (np.mean of squared
# residuals); the original label called it "Residual sum of squares",
# which is the SUM, not the mean.
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X) - Y_num) ** 2))
# R^2 score: 1 is perfect prediction.
print('Variance score: %.2f' % regr.score(X, Y_num))
# (The original's commented-out scatter/line plot is omitted: X has four
# features, so a 2-D x-vs-y plot is not applicable here.)
In [9]:
# Continuous regression outputs for every row (NOT thresholded to 0/1);
# bare expression so the array renders as the cell's output.
Y_predict = regr.predict(X)
Y_predict
Out[9]:
In [10]:
# Linear regression evaluated properly: train on half the data, score on
# the held-out half.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
# FIX: train_test_split was used here but only imported in a LATER cell,
# so this cell raised NameError on a fresh Restart-&-Run-All.  Import it
# where it is used.  (This file's sklearn vintage exposes it in
# sklearn.cross_validation; on modern scikit-learn import it from
# sklearn.model_selection instead.)
from sklearn.cross_validation import train_test_split

regr = linear_model.LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, Y_num, test_size=.5,
                                                    random_state=0)
# Train on the training half only.
regr.fit(X_train, y_train)

print('Coefficients: \n', regr.coef_)
# FIX: the original scored on the FULL dataset (X, Y_num), which includes
# the training rows and therefore overstates performance; evaluate on the
# held-out test half.  Also relabel: np.mean of squared residuals is the
# mean squared error, not the residual sum of squares.
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - np.asarray(y_test)) ** 2))
# R^2 on held-out data: 1 is perfect prediction.
print('Variance score: %.2f' % regr.score(X_test, y_test))

# Test-half predictions, consumed by the ROC cell below.
y_predict_test = regr.predict(X_test)
In [ ]:
# ROC curve for the held-out regression predictions: the continuous scores
# in y_predict_test are swept over thresholds against the binary y_test.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# NOTE(review): the original cell also imported svm, datasets,
# label_binarize, OneVsRestClassifier, scipy.interp, and the long-deprecated
# sklearn.cross_validation.train_test_split — none of which are used in
# this cell; they have been removed.

fpr, tpr, _ = roc_curve(y_test, y_predict_test)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')   # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
In [ ]:
# The raw continuous scores fed to the ROC above; rendered as cell output.
y_predict_test
In [ ]:
# The held-out true 0/1 labels, for eyeballing against y_predict_test.
y_test
In [ ]:
# Build a toy label list (50 zeros + 50 ones), shuffle it, and chop it into
# equal-size folds to see how balanced randomly assigned folds are.
# FIX: the original comment claimed "1000 classes ... 10-folds", but the
# code builds 100 labels and 25-element chunks, i.e. 4 folds.
import random
random.seed(0)  # make the shuffle (and the pie charts below) reproducible

cv_classes = [0 for k in range(50)] + [1 for k in range(50)]
random.shuffle(cv_classes)

def chunks(l, n):
    """Yield successive n-sized chunks from l (last chunk may be shorter)."""
    # FIX: xrange is Python-2-only; range behaves identically here and
    # works on both Python 2 and 3.
    for i in range(0, len(l), n):
        yield l[i:i + n]

folds = list(chunks(cv_classes, 25))
# Fraction of positive labels in each fold (a perfectly balanced fold
# would be 0.5); rendered as the cell output.
fold_proportions = [float(sum(fold)) / len(fold) for fold in folds]
fold_proportions
In [ ]:
# Draw one small pie per fold on a 2x2 grid, showing each fold's
# positive/negative balance.
fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
import numpy as np
for k in range(len(fold_proportions)):
    fp = fold_proportions[k]
    # FIX: the original used center=(k/2, k%2); k/2 is integer division
    # only under Python 2.  Under Python 3 it yields 0.0, 0.5, 1.0, 1.5
    # and the grid layout collapses — use floor division so the centers
    # are (0,0), (0,1), (1,0), (1,1) on both versions.
    ax.pie([fp, (1 - fp)],
           autopct='%1.1f%%', startangle=90,
           radius=0.25, center=(k // 2, k % 2), frame=True)
ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
ax.set_xticklabels(["", ""])
ax.set_yticklabels(["", ""])
ax.set_xlim((-0.5, 1.5))
ax.set_ylim((-0.5, 1.5))
# Set aspect ratio to be equal so that each pie is drawn as a circle.
ax.set_aspect('equal')
plt.show()
In [ ]: