In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 20, 8
In [2]:
from sklearn import datasets
from sklearn import metrics
digits = datasets.load_digits()
fig, axes = plt.subplots(5, 10, figsize=(8, 5))
fig.subplots_adjust(hspace=0.1, wspace=0.1)
for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary')
    ax.text(0.05, 0.05, str(digits.target[i]), transform=ax.transAxes, color='green')
    ax.set_xticks([])
    ax.set_yticks([])
In [3]:
print(digits.images.shape)
print(digits.images[0])
In [4]:
plt.rcParams['figure.figsize'] = 4, 4
plt.imshow(digits.images[0]);
plt.rcParams['figure.figsize'] = 20, 8
In [5]:
%%time
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=1000)  # raise max_iter so the solver converges on the unscaled digits data
clf.fit(digits.data, digits.target)
pred = clf.predict(digits.data)
In [6]:
sns.heatmap(metrics.confusion_matrix(digits.target, pred), annot=True, fmt='d')
plt.ylabel('True label')
plt.xlabel('Predicted label');
In [7]:
from sklearn.decomposition import PCA
pca_digits = PCA(n_components=2)
reduced_data_pca_digits = pca_digits.fit_transform(digits.data)
In [8]:
colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray']
for i in range(len(colors)):
    x = reduced_data_pca_digits[:, 0][digits.target == i]
    y = reduced_data_pca_digits[:, 1][digits.target == i]
    plt.scatter(x, y, c=colors[i])
plt.legend(digits.target_names, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()
In [9]:
from sklearn.datasets import load_iris
iris = load_iris()
data = {iris.feature_names[it]: iris.data.transpose()[it] for it in range(4)}
data.update({'species': [iris.target_names[it] for it in iris.target]})
pd.DataFrame(data).head(4)
Out[9]:
Students: use PCA to reduce the dimensionality of X = iris.data and plot the data in the reduced coordinates.
In [10]:
pca_iris = PCA(n_components=2)
reduced_data_pca_iris = pca_iris.fit_transform(iris.data)
colors = ['black', 'blue', 'red']
for i in range(len(colors)):
    x = reduced_data_pca_iris[iris.target == i, 0]
    y = reduced_data_pca_iris[iris.target == i, 1]
    plt.scatter(x, y, c=colors[i])
plt.legend(iris.target_names, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()
In [11]:
from sklearn.datasets import fetch_olivetti_faces
from numpy.random import RandomState
dataset = fetch_olivetti_faces(shuffle=True, random_state=RandomState(0), download_if_missing=True, data_home='.')
faces = dataset.data
n_samples, n_features = faces.shape
image_shape = (64, 64)
print(n_samples, n_features)
In [12]:
plt.rcParams['figure.figsize'] = 6, 6
plt.imshow(faces[0].reshape(image_shape));
plt.rcParams['figure.figsize'] = 20, 8
In [13]:
def plot_gallery(images, n_col, n_row):
    plt.figure(figsize=(2. * n_col, 2.26 * n_row))
    for i, comp in enumerate(images):
        plt.subplot(n_row, n_col, i + 1)
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape), cmap=plt.cm.gray,
                   interpolation='nearest',
                   vmin=-vmax, vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
plot_gallery(faces[:10], 5, 2)
In [14]:
from sklearn.decomposition import PCA
estimator_faces = PCA(n_components=10)
estimator_faces.fit(faces);
In [15]:
plt.imshow(estimator_faces.components_[0].reshape(image_shape));
In [16]:
plot_gallery(estimator_faces.components_[:10], 5, 2);
In [17]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)
In [18]:
from IPython.display import Image
import pydotplus
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png(), width=600)
Out[18]:
Accuracy: Overall, how often is the classifier correct?
(TP+TN)/total = (100+50)/165 = 0.91
Misclassification/Error Rate: Overall, how often is it wrong?
(FP+FN)/total = (10+5)/165 = 0.09
True Positive Rate/Recall: When it's actually yes, how often does it predict yes?
TP/actual yes = 100/105 = 0.95
False Positive Rate: When it's actually no, how often does it predict yes?
FP/actual no = 10/60 = 0.17
Specificity: When it's actually no, how often does it predict no?
TN/actual no = 50/60 = 0.83
Precision: When it predicts yes, how often is it correct?
TP/predicted yes = 100/110 = 0.91
Prevalence: How often does the yes condition actually occur in our sample?
actual yes/total = 105/165 = 0.64
F1-score: the harmonic mean of precision and recall; the factor of 2 scales the score to 1 when both precision and recall are 1:
F1 = 2 * (Precision * Recall) / (Precision + Recall) = 2*TP / (2*TP + FP + FN) = 200/215 = 0.93
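As a quick cross-check, a minimal sketch (reusing pred and digits.target from the logistic-regression cells above) that computes accuracy, precision, recall and F1 with sklearn.metrics:
In [ ]:
from sklearn import metrics
# overall accuracy: (TP + TN) / total, generalised to the multi-class case
print(metrics.accuracy_score(digits.target, pred))
# per-class precision, recall and F1 in a single report
print(metrics.classification_report(digits.target, pred))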
In [19]:
from sklearn.model_selection import train_test_split
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
print('Original data set:', iris.data.shape, iris.target.shape)
print('Training part:', X_train.shape, y_train.shape)
print('Test part:', X_test.shape, y_test.shape)
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print('Accuracy: {:0.2f}'.format(clf.score(X_test, y_test)))
In [20]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, scoring='f1_macro', cv=5)
print(scores)
print("Accuracy: {:0.2f} (+/- {:0.2f})".format(scores.mean(), scores.std() * 2))
In [21]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# To apply a classifier to this data, we need to flatten each image,
# turning the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
In [22]:
%%time
clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring='f1_macro')
clf.fit(X_train, y_train);
In [23]:
clf.best_params_
Out[23]:
In [24]:
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("{:0.3f} (+/-{:0.03f}) for {}".format(mean, std*2, params))
Students: find optimal values of the hyperparameters criterion and max_depth for DecisionTreeClassifier when classifying iris.data.
In [25]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# iris.data is already a (samples, features) matrix, so no flattening is needed
X = iris.data
y = iris.target
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
# Set the parameters by cross-validation
tuned_parameters = {'criterion':['gini','entropy'], 'max_depth':range(3,20)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), tuned_parameters, cv=5, scoring='f1_macro')
clf.fit(X_train, y_train);
print('Best:', clf.best_params_)
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("{:0.3f} (+/-{:0.03f}) for {}".format(mean, std*2, params))
Preprocessing utilities worth knowing:
sklearn.preprocessing.Normalizer
sklearn.preprocessing.LabelEncoder
sklearn.preprocessing.OneHotEncoder
In [26]:
from sklearn.preprocessing import LabelEncoder
labels = ['one', 'two', 'three']
encoded = LabelEncoder().fit_transform(labels)
encoded
Out[26]:
In [27]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
e = encoder.fit_transform(np.array([0, 1, 2, 3]).reshape(-1, 1))
e.todense()
Out[27]:
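The list above also mentions sklearn.preprocessing.Normalizer; a minimal sketch (on a small made-up array) showing that it rescales each sample, i.e. each row, to unit L2 norm:
In [ ]:
from sklearn.preprocessing import Normalizer
X = np.array([[3., 4.], [1., 0.]])
# each row is divided by its Euclidean length, so every sample ends up with norm 1
Normalizer(norm='l2').fit_transform(X)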
In [ ]: