In [58]:
from sklearn import datasets
iris = datasets.load_iris()
In [59]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
print("Number of mislabeled points out of a total %d points : %d"
% (iris.data.shape[0],(iris.target != y_pred).sum()))
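Note that the error above is counted on the same data the model was fitted to, so it is optimistic. A minimal sketch of a fairer check, assuming a held-out split via train_test_split (the 40% test size is illustrative):

from sklearn.model_selection import train_test_split

# Hold out part of the iris data for testing (split size illustrative).
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)

gnb = GaussianNB()
y_pred_test = gnb.fit(X_train, y_train).predict(X_test)
print("Mislabeled points on held-out data: %d / %d"
      % ((y_test != y_pred_test).sum(), X_test.shape[0]))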
In [60]:
iris.data.shape[0]
In [61]:
iris.target
In [62]:
y_pred
In [63]:
(iris.target != y_pred)
In [64]:
(iris.target != y_pred).sum()
http://archive.ics.uci.edu/ml/datasets/Pen-Based+Recognition+of+Handwritten+Digits — per the UCI description, the digit database was created by collecting 250 samples from 44 writers.
In [65]:
from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()
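Each digits sample is an 8x8 grayscale image flattened into 64 features; the bunch keeps the 2-D form in digits.images. A quick sketch to look at one sample (matplotlib assumed to be available):

import matplotlib.pyplot as plt

# Show the first image alongside its label.
plt.imshow(digits.images[0], cmap=plt.cm.gray_r)
plt.title("label: %d" % digits.target[0])
plt.show()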
In [66]:
from sklearn import svm
clf = svm.SVC(gamma=0.001, C=100.)
In [67]:
digits.data
In [68]:
digits.target
In [69]:
iris.data
In [70]:
iris.target
In [71]:
clf.fit(digits.data[:-1], digits.target[:-1])
In [72]:
clf.predict(digits.data[-1:])
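Predicting only the last image says little about overall accuracy. A sketch of a broader check with cross_val_score, assuming 5 folds (the fold count is illustrative):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy for the same SVC settings.
scores = cross_val_score(svm.SVC(gamma=0.001, C=100.),
                         digits.data, digits.target, cv=5)
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))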
In [73]:
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)
In [74]:
import pickle
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[0:1])
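For persisting models to disk, the scikit-learn docs suggest joblib rather than raw pickle strings; a minimal sketch, assuming joblib is installed (the filename is illustrative):

from joblib import dump, load

# Save the fitted classifier and reload it from disk (filename illustrative).
dump(clf, 'svc_iris.joblib')
clf3 = load('svc_iris.joblib')
clf3.predict(X[0:1])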
In [75]:
y[0]
In [76]:
y
In [77]:
clf2.predict(X)
In [78]:
(clf2.predict(X) != y).sum()
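The same check expressed as a fraction rather than a count, using sklearn.metrics.accuracy_score (still measured on the training data itself):

from sklearn.metrics import accuracy_score

# Fraction of correctly labeled points.
accuracy_score(y, clf2.predict(X))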
In [79]:
from sklearn import tree
from sklearn.datasets import load_iris
In [80]:
iris.target
In [81]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)
In [82]:
clf
In [83]:
clf.predict(X)
In [84]:
iris.target
In [85]:
(clf.predict(X)!=iris.target).sum()
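To inspect the learned splits without any extra dependencies, one sketch using tree.export_text (available in newer scikit-learn versions):

# Print the decision rules of the fitted tree as plain text.
print(tree.export_text(clf, feature_names=list(iris.feature_names)))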
In [86]:
! pip install pydotplus
In [87]:
import pydotplus as pydot
from IPython.display import Image
from io import StringIO  # sklearn.externals.six was removed in recent scikit-learn releases
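These imports set up Graphviz rendering of the fitted tree, but the notebook never completes the step; a minimal sketch, assuming the Graphviz binaries are installed on the system:

# Write the tree in DOT format, then render it inline via pydotplus.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())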
In [88]:
import matplotlib.pyplot as plt
%matplotlib inline
from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
In [92]:
# Author: Kian Ho <hui.kian.ho@gmail.com>
#         Gilles Louppe <g.louppe@gmail.com>
#         Andreas Mueller <amueller@ais.uni-bonn.de>
#
# License: BSD 3 Clause
print(__doc__)
RANDOM_STATE = 123
# Generate a binary classification dataset.
X, y = make_classification(n_samples=500, n_features=25,
                           n_clusters_per_class=1, n_informative=15,
                           random_state=RANDOM_STATE)
# NOTE: Setting the `warm_start` construction parameter to `True` disables
# support for parallelized ensembles but is necessary for tracking the OOB
# error trajectory during training.
ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
     RandomForestClassifier(warm_start=True, oob_score=True,
                            max_features="sqrt",
                            random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features='log2'",
     RandomForestClassifier(warm_start=True, max_features='log2',
                            oob_score=True,
                            random_state=RANDOM_STATE)),
    ("RandomForestClassifier, max_features=None",
     RandomForestClassifier(warm_start=True, max_features=None,
                            oob_score=True,
                            random_state=RANDOM_STATE))
]
# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
# Range of `n_estimators` values to explore.
min_estimators = 15
max_estimators = 175
for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators + 1):
        clf.set_params(n_estimators=i)
        clf.fit(X, y)
        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = 1 - clf.oob_score_
        error_rate[label].append((i, oob_error))
# Generate the "OOB error rate" vs. "n_estimators" plot; `plt.plot` must
# sit inside the loop so that each classifier gets its own curve.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)
plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()
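ExtraTreesClassifier is imported above but never used; a sketch of how it could join the same comparison (it would have to be appended to ensemble_clfs before error_rate is built and the training loop runs, and bootstrap=True is required for OOB scoring):

# Hypothetical extra entry for the comparison above.
ensemble_clfs.append(
    ("ExtraTreesClassifier, max_features='sqrt'",
     ExtraTreesClassifier(warm_start=True, bootstrap=True, oob_score=True,
                          max_features="sqrt",
                          random_state=RANDOM_STATE)))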