In [3]:
import pandas as pd
%matplotlib inline
import numpy as np
In [32]:
from sklearn import tree
from sklearn import datasets
from sklearn import cross_validation
from sklearn import metrics
In [7]:
# Load the iris data set that ships with scikit-learn.
iris = datasets.load_iris()
# Features: only the columns from index 2 onward (see iris.feature_names for
# what they measure). Labels: the target vector.
x, y = iris.data[:, 2:], iris.target
In [9]:
# 50/50 hold-out split. random_state pins the shuffle so the scores printed by
# the evaluation cells can be reproduced on a fresh kernel.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x, y, test_size=0.5, random_state=0
)
In [11]:
# Decision tree with default hyper-parameters. random_state fixes the
# estimator's internal randomness so repeated fits give the same tree.
dt = tree.DecisionTreeClassifier(random_state=0)
In [12]:
# Train on the training half. fit() returns the estimator itself, so `dt`
# keeps pointing at the same (now fitted) object; `;` hides the cell echo.
dt.fit(x_train, y_train);
In [29]:
#from Learning scikit-learn: Machine Learning in Python
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print evaluation metrics for a fitted classifier.

    Parameters
    ----------
    X : samples handed to ``clf.predict``.
    y : true labels for the rows of ``X``.
    clf : fitted estimator exposing ``predict``.
    show_accuracy : bool
        Print the accuracy score formatted to three decimals.
    show_classification_report : bool
        Print the output of ``metrics.classification_report``.
    show_confussion_matrix : bool
        Print the output of ``metrics.confusion_matrix``. The misspelt
        parameter name is kept so existing keyword callers keep working.

    Returns
    -------
    None. All results are written to standard output.
    """
    y_pred = clf.predict(X)

    # Every print below is a single-argument call, which behaves the same
    # under the Python 2 print statement and the Python 3 print function.
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)) + "\n")

    if show_classification_report:
        print("Classification report")
        print(str(metrics.classification_report(y, y_pred)) + "\n")

    if show_confussion_matrix:
        print("Confusion matrix")
        print(str(metrics.confusion_matrix(y, y_pred)) + "\n")
In [16]:
# Score the 50/50 model on the data it was trained on.
measure_performance(X=x_train, y=y_train, clf=dt)
In [17]:
# Score the 50/50 model on the held-out half.
measure_performance(X=x_test, y=y_test, clf=dt)
In [ ]:
#Pretty good results on the held-out half: 96% accuracy, with high precision and recall.
In [18]:
#visualize the model
from sklearn.externals.six import StringIO
import pydotplus #pip install pydotplus
In [19]:
# Write the fitted tree to a Graphviz .dot file.
with open("iris_50.dot", 'w') as f:
    # export_graphviz writes straight into the open handle. Its return value
    # is not needed, so `f` is left bound to the file object.
    tree.export_graphviz(dt, out_file=f)
In [20]:
import os

# Clean up: delete the .dot file written by the export cell.
os.remove('iris_50.dot')
In [21]:
# Render the tree to a PDF without going through a .dot file on disk:
# export the Graphviz source into an in-memory text buffer, parse it with
# pydotplus, and write the resulting graph out as iris_50.pdf.
dot_data = StringIO()
tree.export_graphviz(dt, out_file=dot_data) # needs Graphviz installed (e.g. brew install graphviz)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Last expression of the cell, so its return value is shown as the cell output.
graph.write_pdf("iris_50.pdf")
Out[21]:
In [23]:
# Display the PDF produced above inline in the notebook via wand.
from wand.image import Image as WImage
img = WImage(filename='iris_50.pdf')
# Bare expression on the last line so the notebook renders the image.
img
Out[23]:
In [28]:
# 75/25 hold-out split. random_state pins the shuffle so the comparison with
# the 50/50 split can be reproduced on a fresh kernel.
x_train_75, x_test_25, y_train_75, y_test_25 = cross_validation.train_test_split(
    x, y, train_size=0.75, random_state=0
)
In [29]:
# Separate tree for the 75/25 split. random_state fixes the estimator's
# internal randomness so repeated fits give the same tree.
dt_75_25 = tree.DecisionTreeClassifier(random_state=0)
In [30]:
# Train on the 75% portion. fit() returns the estimator itself, so the name
# keeps pointing at the same (now fitted) object; `;` hides the cell echo.
dt_75_25.fit(x_train_75, y_train_75);
In [31]:
# Score the model that was trained on this 75% split (dt_75_25). Passing `dt`
# here would evaluate the earlier 50/50 model instead.
measure_performance(x_train_75, y_train_75, dt_75_25)
In [32]:
# Score the 75/25 model (dt_75_25) on its held-out 25%. Passing `dt` here
# would evaluate the earlier 50/50 model instead.
measure_performance(x_test_25, y_test_25, dt_75_25)
In [34]:
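#Interestingly, accuracy on the training set dropped when it contained more examples (100% vs 98.2%),
#while accuracy on the test set rose (97% vs 96%), likely due to more examples.
#NOTE: these figures depend on the random split and on which fitted model is passed to measure_performance; re-check them after re-running.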
In [1]:
from sklearn import cross_validation
In [5]:
# Reload the iris data set so x and y hold the full, unsplit data.
iris = datasets.load_iris()
# Features: only the columns from index 2 onward (see iris.feature_names for
# what they measure). Labels: the target vector.
x, y = iris.data[:, 2:], iris.target
In [6]:
# Fresh tree for the cross-validation experiment. random_state fixes the
# estimator's internal randomness so the cross-validated scores are repeatable.
dt = tree.DecisionTreeClassifier(random_state=0)
In [7]:
# Fit on the full data set. Note that cross_val_score refits a clone of the
# estimator on each fold, so this fit does not influence the cross-validated
# scores; it only leaves `dt` as a model trained on all of the data.
dt.fit(x, y);
In [12]:
# Explicit 10-fold splitter over all samples, shuffled with a fixed seed.
cv = cross_validation.KFold(x.shape[0], 10, shuffle=True, random_state=0)
The cell above is a more explicit way of creating the cross-validation folds. `cross_val_score`
builds folds on its own when given an integer `cv`; here we construct them ourselves to make the splitting visible.
In [14]:
# One accuracy score per fold, using the explicit KFold splitter.
scores = cross_validation.cross_val_score(estimator=dt, X=x, y=y, cv=cv)
In [15]:
# Average accuracy across the folds.
np.mean(scores)
Out[15]:
Based on this result, our model is likely to achieve about 94% accuracy on unseen data, rather than the 97% suggested by the hold-out method.
For context on the data, see the documentation here: https://archive.ics.uci.edu/ml/datasets/seeds
In [41]:
# Read the seeds data; the file has no header row, so columns are numbered 0..N.
# NOTE(review): read_csv defaults to a comma separator - confirm the local copy
# of seeds_dataset.txt is comma-delimited.
df = pd.read_csv(
    "data/seeds_dataset.txt",
    header=None,
)
In [42]:
# Preview the first rows only; dumping the whole frame buries the narrative.
df.head()
Out[42]:
In [43]:
# Per-column summary statistics of the seeds data.
df.describe()
Out[43]:
In [44]:
from pandas.tools.plotting import scatter_matrix
In [45]:
# Pairwise scatter plots of every column, with kernel-density estimates on the
# diagonal. The trailing `;` keeps the returned axes array from being echoed
# as cell output; the figure itself still renders.
scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal='kde');
Out[45]:
In [46]:
# Pairwise correlation between all columns of the seeds data.
df.corr()
Out[46]:
Based on the various characteristics of the wheat kernel, we're predicting the variety: Kama, Rosa, or Canadian. Some features seem highly correlated and potentially useful for splitting (area and perimeter), while others don't appear correlated and are unlikely to help with splitting (asymmetry coefficient).
In [65]:
# Columns 0-6 are used as the feature matrix and column 7 as the label vector.
# Note: this rebinds x and y, which previously held the iris data.
x = np.asarray(df[list(range(7))])
y = np.asarray(df[7])
In [48]:
#50-50 split
# random_state pins the shuffle so the reported scores can be reproduced.
x_train_50, x_test_50, y_train_50, y_test_50 = cross_validation.train_test_split(
    x, y, train_size=0.5, random_state=0
)
In [49]:
# Tree for the 50/50 seeds split. random_state fixes the estimator's internal
# randomness so repeated fits give the same tree.
dt_seeds_50 = tree.DecisionTreeClassifier(random_state=0)
In [50]:
# Train on the 50% training portion. fit() returns the estimator itself, so
# the name keeps pointing at the same (now fitted) object.
dt_seeds_50.fit(x_train_50, y_train_50);
In [51]:
# Score the 50/50 seeds model on its own training data.
measure_performance(X=x_train_50, y=y_train_50, clf=dt_seeds_50)
In [52]:
# Score the 50/50 seeds model on the held-out half.
measure_performance(X=x_test_50, y=y_test_50, clf=dt_seeds_50)
Classes 1 and 3 are hard for the classifier to distinguish, which makes sense given the scatter matrix, where the two classes overlap.
In [53]:
#75-25 split
# random_state pins the shuffle so the reported scores can be reproduced.
x_train_75, x_test_25, y_train_75, y_test_25 = cross_validation.train_test_split(
    x, y, train_size=0.75, random_state=0
)
In [54]:
# Tree for the 75/25 seeds split. random_state fixes the estimator's internal
# randomness so repeated fits give the same tree.
dt_seeds_75 = tree.DecisionTreeClassifier(random_state=0)
In [55]:
# Train on the 75% training portion. fit() returns the estimator itself, so
# the name keeps pointing at the same (now fitted) object.
dt_seeds_75.fit(x_train_75, y_train_75);
In [56]:
# Score the 75/25 seeds model on its own training data.
measure_performance(X=x_train_75, y=y_train_75, clf=dt_seeds_75)
In [57]:
# Score the 75/25 seeds model on the held-out 25%.
measure_performance(X=x_test_25, y=y_test_25, clf=dt_seeds_75)
Although the precision and recall for class 3 have improved, the model still has trouble distinguishing between classes 1 and 3.
In [58]:
# 10 fold cross validation
# random_state fixes the estimator's internal randomness so the
# cross-validated scores and feature importances are repeatable.
dt_seeds_cv = tree.DecisionTreeClassifier(random_state=0)
# Fit on all of the data so the fitted tree's attributes (such as
# feature_importances_) can be inspected. cross_val_score refits its own
# clones per fold, so this fit does not affect the cross-validated scores.
dt_seeds_cv = dt_seeds_cv.fit(x, y)
In [60]:
# One accuracy score per fold; an integer cv lets scikit-learn build the 10 folds.
scores = cross_validation.cross_val_score(estimator=dt_seeds_cv, X=x, y=y, cv=10)
In [61]:
# Average accuracy across the 10 folds.
np.mean(scores)
Out[61]:
In [66]:
# Per-feature importances of the tree that was fitted on all of the seeds data.
dt_seeds_cv.feature_importances_
Out[66]:
The decision tree model is likely fairly accurate on this data: approximately 93% under 10-fold cross-validation.
In [ ]: