In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = 20, 8

Jiří Polcar <polcar@physics.muni.cz>

  • Úvod
  • Motivace
  • Workflow
  • Základní dělení metod strojového učení
  • Redukce dimenzi & feature importance
  • Vyhodnocení modelů
  • Cross validation & Grid search
  • Závěr

Motivace


In [2]:
from sklearn import datasets
from sklearn import metrics

digits = datasets.load_digits()
fig, axes = plt.subplots(5, 10, figsize=(8, 5))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

for i, ax in enumerate(axes.flat):
    ax.imshow(digits.images[i], cmap='binary')
    ax.text(0.05, 0.05, str(digits.target[i]), transform=ax.transAxes, color='green')
    ax.set_xticks([])
    ax.set_yticks([])



In [3]:
print(digits.images.shape)
print(digits.images[0])


(1797, 8, 8)
[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]

In [4]:
plt.rcParams['figure.figsize'] = 4, 4
plt.imshow(digits.images[0]);
plt.rcParams['figure.figsize'] = 20, 8



In [5]:
%%time
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(digits.data, digits.target)
pred = clf.predict(digits.data)


CPU times: user 312 ms, sys: 12 ms, total: 324 ms
Wall time: 322 ms

In [6]:
sns.heatmap(metrics.confusion_matrix(digits.target, pred), annot=True, fmt='d')
plt.ylabel('True label')
plt.xlabel('Predicted label');


Základní dělení metod strojového učení

  • Classification (snažíme se vstupnímu vektoru přiřadit jednu z kategorií)
    • textu přiřadit rubriku
    • záznamu z akcelerometru typ pohybu (chůze, běh, jízda tramvají, jízda autem, ...)
    • sentimen komentářů na sociální síti (positivni/negativní)
    • spam
  • Clustering (snažíme se vstupní vektory rozdělit do skupin)
    • "blízké" vektory tvoří skupipy
    • klíčová slova pro dané téma
  • Regression (snažime se vstupnímu vektoru přiřadit (spojitou) hodnotu)
    • předpoveď měnových kurzů
    • předpoveď návstěvnosti
  • Dimension reduction (snažime se zredukovat velikost vstupního vektoru)
    • máme moc dat, neupočítáme je
    • chceme vědět, které vstupní hodnoty jsou podstatne
    • snadnější vizualizace

Wrokflow

Hyperparametry modeů


In [7]:
from sklearn.decomposition import PCA

pca_digits = PCA(n_components=2)
reduced_data_pca_digits = pca_digits.fit_transform(digits.data)

In [8]:
colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray']
for i in range(len(colors)):
    x = reduced_data_pca_digits[:, 0][digits.target == i]
    y = reduced_data_pca_digits[:, 1][digits.target == i]
    plt.scatter(x, y, c=colors[i])
plt.legend(digits.target_names, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()


Kosatce (Iris) Data


In [9]:
from sklearn.datasets import load_iris
iris = load_iris()
data = {iris.feature_names[it]: iris.data.transpose()[it] for it in range(4)}
data.update({'species': [iris.target_names[it] for it in iris.target]})
pd.DataFrame(data).head(4)


Out[9]:
petal length (cm) petal width (cm) sepal length (cm) sepal width (cm) species
0 1.4 0.2 5.1 3.5 setosa
1 1.4 0.2 4.9 3.0 setosa
2 1.3 0.2 4.7 3.2 setosa
3 1.5 0.2 4.6 3.1 setosa

Studenti: pomocí PCA zredukujte dimenzi u X = iris.data a vykreslete data v redukovaných souřadnicích.


In [10]:
pca_iris = PCA(n_components=2)

reduced_data_pca_iris = pca_iris.fit_transform(iris.data)
colors = ['black', 'blue', 'red']
for i in range(len(colors)):
    x = reduced_data_pca_iris[iris.target == i, 0]
    y = reduced_data_pca_iris[iris.target == i, 1]
    plt.scatter(x, y, c=colors[i])
plt.legend(iris.target_names, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()



In [11]:
from sklearn.datasets import fetch_olivetti_faces
from numpy.random import RandomState

dataset = fetch_olivetti_faces(shuffle=True, random_state=RandomState(0), download_if_missing=True, data_home='.')
faces = dataset.data
n_samples, n_features = faces.shape
image_shape = (64, 64)

print n_samples, n_features


downloading Olivetti faces from http://cs.nyu.edu/~roweis/data/olivettifaces.mat to .
400 4096

In [12]:
plt.rcParams['figure.figsize'] = 6, 6
plt.imshow(faces[0].reshape(image_shape));
plt.rcParams['figure.figsize'] = 20, 8



In [13]:
def plot_gallery(images, n_col, n_row):
    plt.figure(figsize=(2. * n_col, 2.26 * n_row))
    for i, comp in enumerate(images):
        plt.subplot(n_row, n_col, i + 1)
        vmax = max(comp.max(), -comp.min())
        plt.imshow(comp.reshape(image_shape), cmap=plt.cm.gray,
                   interpolation='nearest',
                   vmin=-vmax, vmax=vmax)
        plt.xticks(())
        plt.yticks(())
    plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)
plot_gallery(faces[:10], 5, 2)



In [14]:
from sklearn.decomposition import PCA

estimator_faces = PCA(n_components=10)
estimator_faces.fit(faces);

In [15]:
plt.imshow(estimator_faces.components_[0].reshape(image_shape));



In [16]:
plot_gallery(estimator_faces.components_[:10], 5, 2);



In [17]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

In [18]:
from IPython.display import Image
import pydotplus
dot_data = tree.export_graphviz(clf, out_file=None, 
                                feature_names=iris.feature_names,  
                                class_names=iris.target_names,  
                                filled=True, rounded=True,
                                special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png(), width=600)


Out[18]:

Vyhodnocení modeů

Accuracy: Overall, how often is the classifier correct?
(TP+TN)/total = (100+50)/165 = 0.91

Misclassification/Error Rate: Overall, how often is it wrong?
(FP+FN)/total = (10+5)/165 = 0.09

True Positive Rate/Recall: When it's actually yes, how often does it predict yes?
TP/actual yes = 100/105 = 0.95

False Positive Rate: When it's actually no, how often does it predict yes?
FP/actual no = 10/60 = 0.17

Specificity: When it's actually no, how often does it predict no?
TN/actual no = 50/60 = 0.83

Precision: When it predicts yes, how often is it correct?
TP/predicted yes = 100/110 = 0.91

Prevalence: How often does the yes condition actually occur in our sample?
actual yes/total = 105/165 = 0.64

F1-score: Harmonic mean of precision and recall — multiplying the constant of 2 scales the score to 1 when both recall and precision are 1:

Přeučení/Overfitting


In [19]:
from sklearn.model_selection import train_test_split
from sklearn import svm

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
print 'Original data set:', iris.data.shape, iris.target.shape
print 'Training part:', X_train.shape, y_train.shape
print 'Test part:', X_test.shape, y_test.shape

clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
print 'Accuracy: {:0.2f}'.format(clf.score(X_test, y_test))


Original data set: (150, 4) (150,)
Training part: (90, 4) (90,)
Test part: (60, 4) (60,)
Accuracy: 0.97

In [20]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, scoring='f1_macro', cv=5)
print scores
print "Accuracy: {:0.2f} (+/- {:0.2f})".format(scores.mean(), scores.std() * 2)


[ 0.96658312  1.          0.96658312  0.96658312  1.        ]
Accuracy: 0.98 (+/- 0.03)


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

In [22]:
%%time
clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring='f1_macro')
clf.fit(X_train, y_train);


CPU times: user 4.14 s, sys: 0 ns, total: 4.14 s
Wall time: 4.16 s

In [23]:
clf.best_params_


Out[23]:
{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

In [24]:
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print "{:0.3f} (+/-{:0.03f}) for {}".format(mean, std*2, params)


0.986 (+/-0.019) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.957 (+/-0.029) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.987 (+/-0.019) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.981 (+/-0.028) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.987 (+/-0.019) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.981 (+/-0.026) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.987 (+/-0.019) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.981 (+/-0.026) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.973 (+/-0.013) for {'kernel': 'linear', 'C': 1}
0.973 (+/-0.013) for {'kernel': 'linear', 'C': 10}
0.973 (+/-0.013) for {'kernel': 'linear', 'C': 100}
0.973 (+/-0.013) for {'kernel': 'linear', 'C': 1000}

Studenti: nalezněte optimálni hodnoty hyperparametrů criterion a max_depth pro DecisionTreeClassifier, pro klasifikaci iris.data.


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(iris.data)
X = iris.data.reshape((n_samples, -1))
y = iris.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {'criterion':['gini','entropy'], 'max_depth':range(3,20)}

clf = GridSearchCV(tree.DecisionTreeClassifier(), tuned_parameters, cv=5, scoring='f1_macro')
clf.fit(X_train, y_train);

print 'Best:', clf.best_params_
print

means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print "{:0.3f} (+/-{:0.03f}) for {}".format(mean, std*2, params)


Best: {'criterion': 'gini', 'max_depth': 5}

0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 3}
0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 4}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 5}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 6}
0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 7}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 8}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 9}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 10}
0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 11}
0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 12}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 13}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 14}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 15}
0.941 (+/-0.113) for {'criterion': 'gini', 'max_depth': 16}
0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 17}
0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 18}
0.926 (+/-0.096) for {'criterion': 'gini', 'max_depth': 19}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 3}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 4}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 5}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 6}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 7}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 8}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 9}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 10}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 11}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 12}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 13}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 14}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 15}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 16}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 17}
0.941 (+/-0.113) for {'criterion': 'entropy', 'max_depth': 18}
0.926 (+/-0.096) for {'criterion': 'entropy', 'max_depth': 19}

In [ ]:

Co dále?

  • Datová špína
  • Normalizace (sklearn.preprocessing.Normalizer)
  • Použité metriky (Distance Metric Learning)
  • sklearn.preprocessing.LabelEncoder / sklearn.preprocessing.OneHotEncoder
  • Big Data

In [26]:
from sklearn.preprocessing import LabelEncoder

labels = ['one', 'two', 'three']
encoder = LabelEncoder().fit_transform(labels)
encoder


Out[26]:
array([0, 2, 1])

In [27]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
e = encoder.fit_transform(np.array([0, 1, 2, 3]).reshape(-1, 1))
e.todense()


Out[27]:
matrix([[ 1.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.]])

?


In [ ]: