In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
%matplotlib inline
%pylab inline
In [3]:
import matplotlib.pylab as plt
import numpy as np
In [4]:
# distutils is deprecated (PEP 632) and no longer part of the standard library
# from Python 3.12 on, so fall back to a small stand-in with the same name.
try:
    from distutils.version import StrictVersion
except ImportError:
    import functools
    import re

    @functools.total_ordering
    class StrictVersion:
        """Minimal replacement that compares the leading numeric release components.

        Only the leading ``major[.minor[.patch]]`` digits are used; any suffix
        (for example ``rc1`` or ``.dev0``) is ignored.
        """

        def __init__(self, vstring):
            match = re.match(r'\d+(?:\.\d+)*', vstring)
            if match is None:
                raise ValueError("invalid version number '%s'" % vstring)
            parts = [int(part) for part in match.group(0).split('.')[:3]]
            # Pad to three components so that '0.20' compares equal to '0.20.0'
            self.version = tuple(parts + [0] * (3 - len(parts)))

        def __eq__(self, other):
            return self.version == other.version

        def __lt__(self, other):
            return self.version < other.version
In [5]:
import sklearn
print(sklearn.__version__)
# Fail early, with an actionable message, when the installed scikit-learn is
# older than the minimum version this notebook checks for.
assert StrictVersion(sklearn.__version__) >= StrictVersion('0.18.1'), \
    "scikit-learn >= 0.18.1 is required, found " + sklearn.__version__
In [6]:
# Azure may only ship 0.19, but we need 0.20 for the plotting; if so, run the install below and restart the notebook
# !conda update pandas -y
In [7]:
import pandas as pd
print(pd.__version__)
# pandas 0.20 is the minimum because the notebook plots with pd.plotting;
# fail early with an actionable message instead of a bare AssertionError.
assert StrictVersion(pd.__version__) >= StrictVersion('0.20.0'), \
    "pandas >= 0.20.0 is required for pd.plotting, found " + pd.__version__
In [8]:
from sklearn.datasets import load_iris
iris = load_iris()
In [9]:
print(iris.DESCR)
In [10]:
X = iris.data
y = iris.target
In [11]:
X.shape, y.shape
Out[11]:
In [12]:
X[0]
Out[12]:
In [13]:
y[0]
Out[13]:
In [14]:
X_sepal_length = X[:, 0]
X_sepal_width = X[:, 1]
X_petal_length = X[:, 2]
X_petal_width = X[:, 3]
In [15]:
X_petal_width.shape
Out[15]:
In [16]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Three-colour map (red, green, blue) used to colour the points by iris.target
CMAP = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Wrap the raw feature matrix in a DataFrame so the columns carry their names
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Pairwise scatter plots of all features against each other
pd.plotting.scatter_matrix(
    iris_df,
    c=iris.target,
    cmap=CMAP,
    edgecolor='black',
    figsize=(15, 15),
)
plt.show()
http://scikit-learn.org/stable/modules/cross_validation.html
In [17]:
from sklearn.model_selection import train_test_split
In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
In [19]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[19]:
http://scikit-learn.org/stable/modules/neighbors.html#classification
In [20]:
from sklearn import neighbors
In [21]:
# ignore this, it is just technical code
# should come from a lib, consider it to appear magically
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
font_size=25
def meshGrid(x_data, y_data, h=.02, padding=1):
    """Build a regular 2-D grid that covers both data ranges plus a margin.

    Parameters
    ----------
    x_data, y_data : array-like
        Values whose minimum and maximum define the grid extent on each axis.
    h : float, optional
        Step size of the grid on both axes (default .02).
    padding : float, optional
        Margin added below the minimum and above the maximum (default 1).

    Returns
    -------
    (xx, yy) : tuple of 2-D arrays as produced by ``np.meshgrid``.

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive.
    """
    if h <= 0:
        raise ValueError("step size h must be positive, got %r" % (h,))
    # np.asarray lets callers pass plain lists as well as ndarrays
    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)
    x_min, x_max = x_data.min() - padding, x_data.max() + padding
    y_min, y_max = y_data.min() - padding, y_data.max() + padding
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return (xx, yy)
def plotPrediction(clf, x_data, y_data, x_label, y_label, colors, title="", mesh=True):
    """Scatter-plot two features, optionally on top of the classifier's decision regions.

    Parameters
    ----------
    clf : object with a ``predict`` method
        Called on an (n, 2) array of grid points; only used when ``mesh`` is true.
    x_data, y_data : array-like
        Feature values drawn on the horizontal and vertical axis.
    x_label, y_label : str
        Axis label texts.
    colors : array-like
        Per-point values mapped through ``cmap_bold``.
        NOTE(review): assumed to be integer class indices 0 .. cmap_bold.N - 1,
        as with the label arrays passed in this notebook - confirm for other callers.
    title : str, optional
        Figure title.
    mesh : bool, optional
        When true, colour the background with the class predicted at each grid point.
    """
    xx, yy = meshGrid(x_data, y_data)
    plt.figure(figsize=(20, 10))
    if mesh:
        # Only run the classifier over the grid when the result is actually drawn
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        # Fixed vmin/vmax keep class i on colour i even if a class is never predicted
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light, vmin=0, vmax=cmap_light.N - 1)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    # Same fixed normalisation for the points, so they match the background colours
    plt.scatter(x_data, y_data, c=colors, cmap=cmap_bold, s=80, marker='o',
                vmin=0, vmax=cmap_bold.N - 1)
    plt.xlabel(x_label, fontsize=font_size)
    plt.ylabel(y_label, fontsize=font_size)
    plt.title(title, fontsize=font_size)
In [22]:
X_train_sepal_only = X_train[:, :2]
X_test_sepal_only = X_test[:, :2]
In [23]:
X_train_sepal_only[0]
Out[23]:
In [24]:
X_train[0]
Out[24]:
In [25]:
clf_sepal = neighbors.KNeighborsClassifier(1)
%time clf_sepal.fit(X_train_sepal_only, y_train)
Out[25]:
In [26]:
plotPrediction(clf_sepal, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1],
'Sepal length', 'Sepal width', y_train, mesh=False,
title="Train Data for Sepal Features")
In [27]:
# 8 is a hard case because it lies right between 1 and 2
sample_id = 8
# sample_id = 50
sample_feature = X_test_sepal_only[sample_id]
sample_label = y_test[sample_id]
In [28]:
sample_feature
Out[28]:
In [29]:
sample_label
Out[29]:
In [30]:
clf_sepal.predict([sample_feature])
Out[30]:
In [31]:
clf_sepal.predict([[6.0, 4.5]]) # slightly different from above, still gives 0
Out[31]:
In [32]:
# clf_sepal.score?
In [33]:
clf_sepal.score(X_train_sepal_only, y_train)
Out[33]:
In [34]:
clf_sepal.score(X_test_sepal_only, y_test)
Out[34]:
In [35]:
plotPrediction(clf_sepal, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1],
'Sepal length', 'Sepal width', y_train,
title="Highly Fragmented Decision Boundaries for Train Data")
In [36]:
plotPrediction(clf_sepal, X_test_sepal_only[:, 0], X_test_sepal_only[:, 1],
'Sepal length', 'Sepal width', y_test,
title="Same Decision Boundaries don't work well for Test Data")
In [37]:
# neighbors.KNeighborsClassifier?
In [38]:
clf_sepal_10 = neighbors.KNeighborsClassifier(10)
clf_sepal_10.fit(X_train_sepal_only, y_train)
Out[38]:
In [39]:
clf_sepal_10.score(X_train_sepal_only, y_train)
Out[39]:
In [40]:
clf_sepal_10.score(X_test_sepal_only, y_test)
Out[40]:
In [41]:
plotPrediction(clf_sepal_10, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1],
'Sepal length', 'Sepal width', y_train,
title="Model too simple even for Train Data")
In [60]:
# Decision regions of the 10-neighbour sepal model, overlaid with the held-out test points
plotPrediction(clf_sepal_10, X_test_sepal_only[:, 0], X_test_sepal_only[:, 1],
               'Sepal length', 'Sepal width', y_test,
               title="Model also too simple for Test Data")
In [43]:
X_train_petal_only = X_train[:, 2:]
X_test_petal_only = X_test[:, 2:]
In [44]:
X_train_petal_only[0]
Out[44]:
In [45]:
X_train[0]
Out[45]:
In [46]:
clf_petal_10 = neighbors.KNeighborsClassifier(10)
clf_petal_10.fit(X_train_petal_only, y_train)
Out[46]:
In [47]:
plotPrediction(clf_petal_10, X_train_petal_only[:, 0], X_train_petal_only[:, 1],
'Petal length', 'Petal width', y_train,
title="Simple model looks good for Train Data")
In [48]:
plotPrediction(clf_petal_10, X_test_petal_only[:, 0], X_test_petal_only[:, 1],
'Petal length', 'Petal width', y_test,
title="Simple model looks good even for Test Data")
In [49]:
clf_petal_10.score(X_train_petal_only, y_train)
Out[49]:
In [50]:
clf_petal_10.score(X_test_petal_only, y_test)
Out[50]:
In [51]:
clf = neighbors.KNeighborsClassifier(1)
clf.fit(X_train, y_train)
Out[51]:
In [52]:
clf.score(X_train, y_train)
Out[52]:
In [53]:
clf.score(X_test, y_test)
Out[53]:
In [57]:
clf = neighbors.KNeighborsClassifier(13)
clf.fit(X_train, y_train)
Out[57]:
In [58]:
clf.score(X_train, y_train)
Out[58]:
In [59]:
clf.score(X_test, y_test)
Out[59]: