In [1]:
%matplotlib inline

with open( 'SnowDays.csv' ) as f:
    # Strip each line's trailing newline and split on commas
    data = [ line.strip().split(',') for line in f.readlines() ]

    # The last column is the label; the first four are features
    X_text = [ row[:-1] for row in data ]
    Y_text = [ row[-1] for row in data ]

    # Encode the categorical snowfall levels and TRUE/FALSE labels as numbers
    snowkey = { 'Light': 0, 'Medium': 1, 'Heavy': 2 }
    tfkey = { 'TRUE': True, 'FALSE': False }
    X = [ [ snowkey[el] for el in row ] for row in X_text ]
    Y = [ tfkey[el] for el in Y_text ]
    
    featureNames = [ 'Previous morning', 'Previous day', 'Previous night', 'Early morning' ]
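
As a quick sanity check (a minimal sketch, not something the rest of the notebook depends on), the encoded rows can be printed next to their labels:

# Print each encoded feature row alongside its label
for features, label in zip(X, Y):
    print(features, label)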

In [2]:
for row in data:
    print( '<tr>' + ''.join([ '<td>' + el + '</td>' for el in row ]) + '</tr>' )


<tr><td>Light</td><td>Light</td><td>Light</td><td>Heavy</td><td>TRUE</td></tr>
<tr><td>Light</td><td>Light</td><td>Heavy</td><td>Light</td><td>TRUE</td></tr>
<tr><td>Heavy</td><td>Heavy</td><td>Light</td><td>Light</td><td>FALSE</td></tr>
<tr><td>Heavy</td><td>Medium</td><td>Medium</td><td>Light</td><td>FALSE</td></tr>
<tr><td>Medium</td><td>Medium</td><td>Medium</td><td>Medium</td><td>TRUE</td></tr>
<tr><td>Light</td><td>Light</td><td>Heavy</td><td>Heavy</td><td>TRUE</td></tr>
<tr><td>Light</td><td>Heavy</td><td>Heavy</td><td>Medium</td><td>TRUE</td></tr>
<tr><td>Heavy</td><td>Medium</td><td>Medium</td><td>Light</td><td>FALSE</td></tr>
<tr><td>Medium</td><td>Medium</td><td>Light</td><td>Light</td><td>FALSE</td></tr>
<tr><td>Light</td><td>Light</td><td>Light</td><td>Light</td><td>FALSE</td></tr>
<tr><td>Light</td><td>Light</td><td>Medium</td><td>Light</td><td>FALSE</td></tr>
<tr><td>Light</td><td>Light</td><td>Light</td><td>Medium</td><td>TRUE</td></tr>

In [3]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(criterion='entropy',max_depth=1)
clf = clf.fit(X, Y)

In [4]:
clf


Out[4]:
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
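
With max_depth=1 the fitted model is a single decision stump. A sketch for reading off the chosen split without rendering the graph (tree_.feature and tree_.threshold are sklearn's fitted-tree internals; node 0 is the root):

# Which feature does the root node split on, and at what threshold?
root = 0
print(featureNames[clf.tree_.feature[root]], clf.tree_.threshold[root])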

In [33]:
from sklearn.externals.six import StringIO  
import pydot 
from IPython.display import Image  
dot_data = StringIO()  
tree.export_graphviz(clf, out_file=dot_data,  
                         feature_names=featureNames,  
                         class_names=['FALSE', 'TRUE'],  # order must match clf.classes_ ([False, True])
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())


Out[33]:
[decision tree rendered as a PNG image]
In [34]:
clf.predict(X)


Out[34]:
array([ True, False, False, False,  True,  True,  True, False, False,
       False, False,  True], dtype=bool)
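
Comparing these predictions against the true labels gives the training accuracy. A minimal sketch (clf.score(X, Y) computes the same number directly):

# Fraction of training rows the stump classifies correctly
correct = sum( 1 for p, y in zip(clf.predict(X), Y) if p == y )
print(float(correct) / len(Y))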

In [43]:
# Convert the list of True/False labels to 1/0 for regression
Y_num = [ 1 if el else 0 for el in Y ]
# Only the last expression in a cell is echoed, so Out shows X
X


Out[43]:
[[0, 0, 0, 2],
 [0, 0, 2, 0],
 [2, 2, 0, 0],
 [2, 1, 1, 0],
 [1, 1, 1, 1],
 [0, 0, 2, 2],
 [0, 2, 2, 1],
 [2, 1, 1, 0],
 [1, 1, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 1, 0],
 [0, 0, 0, 1]]

In [44]:
# This regression runs on our full dataset

import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X, Y_num)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X) - Y_num) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X, Y_num))



('Coefficients: \n', array([-0.18649558,  0.07225614,  0.15659649,  0.39815622]))
Mean squared error: 0.07
Variance score: 0.73
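
The fitted model is just a weighted sum of the four encoded features plus an intercept. A sketch that reproduces regr.predict for a single row by hand:

# Manual prediction for the first row: dot product with the coefficients,
# plus the intercept -- should match regr.predict
row = X[0]
print(np.dot(row, regr.coef_) + regr.intercept_, regr.predict([row]))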

In [46]:
Y_predict = regr.predict(X)
Y_predict


Out[46]:
array([ 1.0097172 ,  0.52659773, -0.01507412,  0.06926623,  0.65391803,
        1.32291018,  1.06926623,  0.06926623,  0.09916532,  0.21340476,
        0.37000125,  0.61156098])
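
These are continuous scores rather than class labels. One simple way to recover True/False predictions (0.5 here is just the midpoint of the 0/1 coding, an illustrative choice rather than anything the model learned):

# Threshold the regression scores at 0.5 to get class predictions
Y_class = [ score >= 0.5 for score in Y_predict ]
print(Y_class)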

In [54]:
# This regression trains on half the data and evaluates on the full dataset

import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.cross_validation import train_test_split

# Create linear regression object
regr = linear_model.LinearRegression()

X_train, X_test, y_train, y_test = train_test_split(X, Y_num, test_size=.5,
                                                    random_state=0)

# Train the model using the training sets
regr.fit(X_train, y_train)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error, computed over the full dataset (train + test)
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X) - Y_num) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X, Y_num))


y_predict_test = regr.predict(X_test)


('Coefficients: \n', array([-0.2 , -0.1 ,  0.25,  0.25]))
Mean squared error: 0.10
Variance score: 0.59
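
Both numbers above are computed over the full dataset, half of which the model was trained on. A sketch of the same MSE restricted to the held-out rows only:

# Mean squared error on the held-out half only
print("Held-out MSE: %.2f"
      % np.mean((regr.predict(X_test) - np.array(y_test)) ** 2))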

In [55]:
# ROC curve for our regression

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_test, y_predict_test)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

[ROC curve plot: true positive rate vs. false positive rate, with the chance diagonal]
In [56]:
y_predict_test


Out[56]:
array([ 0.8 ,  0.5 ,  0.45,  0.5 , -0.35, -0.05])

In [57]:
y_test


Out[57]:
[1, 1, 1, 0, 0, 0]
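
Each point on the ROC curve comes from sweeping a threshold over these scores. A sketch computing a single point by hand (at a threshold of 0.5 these particular values give TPR = 2/3 and FPR = 1/3):

# One ROC point: true and false positive rates at threshold 0.5
pred = [ 1 if s >= 0.5 else 0 for s in y_predict_test ]
tp = sum( 1 for p, y in zip(pred, y_test) if p == 1 and y == 1 )
fp = sum( 1 for p, y in zip(pred, y_test) if p == 1 and y == 0 )
fn = sum( 1 for p, y in zip(pred, y_test) if p == 0 and y == 1 )
tn = sum( 1 for p, y in zip(pred, y_test) if p == 0 and y == 0 )
print(float(tp) / (tp + fn), float(fp) / (fp + tn))   # TPR, FPR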

In [76]:
# Let's make a list of 100 class labels (50 of each) and chop it into 4 folds
cv_classes = [ 0 for k in range(50) ] + [ 1 for k in range(50) ]
import random
random.shuffle( cv_classes )

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i+n]
        
folds = list( chunks( cv_classes, 25 ) )

fold_proportions = [ float(sum(fold))/len(fold) for fold in folds ]

fold_proportions


Out[76]:
[0.56, 0.48, 0.36, 0.6]
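
The proportions drift because the shuffle is unstratified. Stratified folds keep each fold's class balance near 50/50. A sketch using the same cross_validation module this notebook already imports from (in later sklearn versions StratifiedKFold moved to sklearn.model_selection with a different call signature):

from sklearn.cross_validation import StratifiedKFold

# Each fold's test indices are chosen to preserve the overall class balance
for train_idx, test_idx in StratifiedKFold(cv_classes, n_folds=4):
    fold = [ cv_classes[i] for i in test_idx ]
    print(float(sum(fold)) / len(fold))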

In [85]:
fig = plt.figure(figsize=(8,8))
ax = fig.gca()

# Draw one pie per fold, arranged on a 2x2 grid (integer division places
# folds 0-3 at centers (0,0), (0,1), (1,0), (1,1))
for k in range(len(fold_proportions)):
    fp = fold_proportions[k]
    ax.pie([fp, (1-fp)],
           autopct='%1.1f%%', startangle=90,
           radius=0.25, center=(k/2, k%2), frame=True)
    
ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
ax.set_xticklabels(["", ""])
ax.set_yticklabels(["", ""])
ax.set_xlim((-0.5, 1.5))
ax.set_ylim((-0.5, 1.5))

# Set aspect ratio to be equal so that pie is drawn as a circle.
ax.set_aspect('equal')

plt.show()

[2x2 grid of pie charts, one per fold, showing each fold's class proportions]