Notes taken from Google's "Visualizing a Decision Tree" — Machine Learning Recipes #2 (video tutorial).
In [1]:
import numpy as np
from sklearn import tree
from sklearn.datasets import load_iris
In [2]:
# Load the bundled iris dataset (a sklearn Bunch: dict-like with data/target arrays)
iris = load_iris()
In [3]:
# Inspect the available keys of the Bunch
iris.keys()
Out[3]:
In [4]:
# Names of the four feature columns (one per column of iris.data)
iris.feature_names
Out[4]:
In [5]:
# Names of the target classes (indexed by the integer labels in iris.target)
iris.target_names
Out[5]:
In [6]:
# Withhold 3 for testing
# One row per class (the expected test_target below is [0, 1, 2]);
# np.delete returns copies with those rows removed, leaving the training set.
test_idx = [0, 50, 100]
train_data = np.delete(iris.data, test_idx, axis=0)  # axis=0: drop whole rows
train_target = np.delete(iris.target, test_idx)  # 1-D labels: no axis needed
In [7]:
test_target = iris.target[test_idx] # array([0, 1, 2])
In [8]:
test_data = iris.data[test_idx] # array([[ 5.1, 3.5, 1.4, 0.2], [ 7. , 3.2, 4.7, 1.4], ...])
In [9]:
# Fit a decision tree on the training rows (everything except the 3 held out)
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)
Out[9]:
In [10]:
# make prediction
# Predicted class labels for the 3 held-out rows
clf.predict(test_data)
Out[10]:
In [11]:
# matches input labels?
# Element-wise comparison against the true labels (expects all True)
clf.predict(test_data) == test_target
Out[11]:
In [12]:
# FIX: sklearn.externals.six was deprecated in scikit-learn 0.21 and removed
# in 0.23 — the stdlib io.StringIO is the drop-in replacement (Python 3).
from io import StringIO
import pydotplus # note installed pydotplus for Py3 compatibility
In [13]:
# Render the fitted tree as Graphviz DOT text into an in-memory buffer
dot_data = StringIO()
tree.export_graphviz(clf,
out_file=dot_data,
feature_names=iris.feature_names,  # label split nodes with column names
class_names=iris.target_names,  # label leaves with class names
filled=True,  # color nodes by majority class
rounded=True,
impurity=False)  # omit impurity values to keep nodes compact
In [14]:
# Parse the DOT text into a pydotplus graph object
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
In [15]:
# graphviz installed on mac with `brew install graphviz`
graph.write_pdf('iris.pdf')
# open -a preview ~/ipython/tensorflow/iris.pdf
Out[15]:
In [17]:
# http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
import matplotlib.pyplot as plt
from sklearn import linear_model
In [19]:
# Use fit_intercept=False for Multivariate Linear Regression Model
# http://stackoverflow.com/a/24394996
# NOTE(review): this fits ordinary least squares to the *integer class
# labels* (0/1/2), treating species as a continuous response — an
# experiment only, not a proper classifier.
clf = linear_model.LinearRegression(fit_intercept=False)
clf.fit(train_data, train_target)
Out[19]:
In [36]:
# Predictions are continuous values, not class labels
clf.predict(test_data)
Out[36]:
In [37]:
# The true integer labels, for eyeballing against the predictions above
test_target
Out[37]:
In [34]:
# from Wiki: RSS "is a measure of the discrepancy between the data and an estimation model"
# FIX: np.mean of squared residuals is the *mean* squared error, not the
# residual *sum* of squares (that would be np.sum) — label corrected.
# (sklearn's own plot_ols example later fixed this same mislabel.)
print("Mean squared error: %.2f" % np.mean((clf.predict(test_data) - test_target) ** 2))
In [35]:
# .score() returns the coefficient of determination R^2 of the prediction.
# 1 is perfect prediction
print('Variance score: %.2f' % clf.score(test_data, test_target))
In [22]:
# One fitted coefficient per input column (no intercept: fit_intercept=False)
print('Coefficients:', clf.coef_)
In [40]:
#? Is this right?
# Yes — clf.coef_ holds one coefficient per feature, in the same order as
# the columns of the training data, so zipping with iris.feature_names
# pairs each coefficient with its feature name.
list(zip(clf.coef_, iris.feature_names))
Out[40]:
In [57]:
# Logistic regression: a proper classifier for the 3-class integer labels
clf = linear_model.LogisticRegression()
In [58]:
clf.fit(train_data, train_target)
Out[58]:
In [44]:
# Predicted class labels for the 3 held-out rows
clf.predict(test_data)
Out[44]:
In [45]:
# Element-wise comparison against the true labels
clf.predict(test_data) == test_target
Out[45]:
In [47]:
# Mean accuracy on the held-out rows
clf.score(test_data, test_target)
Out[47]:
In [64]:
from sklearn.ensemble import RandomForestClassifier
In [65]:
# Ensemble of decision trees with default hyperparameters
clf = RandomForestClassifier()
In [66]:
clf.fit(train_data, train_target)
Out[66]:
In [69]:
# Mean accuracy on the 3 held-out rows
clf.score(test_data, test_target)
Out[69]:
In [71]:
# Element-wise comparison of predicted vs. true labels
clf.predict(test_data) == test_target
Out[71]:
In [9]:
from sklearn.ensemble import RandomForestRegressor
In [10]:
# NOTE(review): a *regressor* fitted to integer class labels predicts
# continuous values; .score() below is R^2, not classification accuracy.
clf = RandomForestRegressor()
In [11]:
clf.fit(train_data, train_target)
Out[11]:
In [12]:
clf.score(test_data, test_target)
Out[12]:
In [13]:
# NOTE(review): exact equality between float predictions and integer
# targets is almost always False — round or compare with a tolerance.
clf.predict(test_data) == test_target
Out[13]:
In [ ]: