In [12]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [54]:
np.random.seed(0)
# dataset 1: noisy linear relationship
x1 = np.random.randn(30)
y1 = 4*x1 + 10 + np.random.randn(len(x1))
# dataset 2: noisy quadratic relationship
x2 = x1*10
y2 = (x2-5)**2 + np.random.randn(len(x2))*10
# dataset 3: three Gaussian blobs, concatenated into one unlabeled set
x3_1 = np.random.randn(10)
x3_2 = np.random.randn(len(x3_1)) + 2
x3_3 = np.random.randn(len(x3_1)) + 4
y3_1 = np.random.randn(len(x3_1))
y3_2 = np.random.randn(len(x3_1)) + 4
y3_3 = np.random.randn(len(x3_1)) + 2
x3 = np.concatenate([x3_1, x3_2, x3_3+2])
y3 = np.concatenate([y3_1, y3_2+2, y3_3])
In [55]:
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plt.plot(x1,y1,'k.')
plt.subplot(1,3,2)
plt.plot(x2,y2,'k.')
plt.subplot(1,3,3)
plt.plot(x3,y3,'k.')
Out[55]:
In [47]:
plt.figure(figsize=(6,6))
plt.plot(x3_1,y3_1,'g.',ms=12)
plt.plot(x3_2,y3_2,'r.',ms=12)
plt.plot(x3_3,y3_3,'b.',ms=12)
Out[47]:
In [51]:
plt.figure(figsize=(6,6))
plt.plot(x3_1,y3_1,'g.',ms=12)
plt.plot(x3_2,y3_2,'r.',ms=12)
plt.plot(x3_3,y3_3,'b.',ms=12)
plt.plot(3,3.7,'k*',ms=12)
Out[51]:
In [60]:
plt.figure(figsize=(6,6))
plt.plot(x3_1,y3_1,'g.',ms=12)
plt.plot(x3_2,y3_2,'r.',ms=12)
plt.plot(x3_3,y3_3,'b.',ms=12)
plt.plot(3,3.7,'k*',ms=12)
plt.plot([3],[4],'ko',ms=12, mfc='none')
Out[60]:
In [ ]:
In [61]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
n_neighbors = 15
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
h = .02 # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
plt.show()
In [36]:
plt.figure(figsize=(6,6))
plt.plot(x3,y3,'k.',ms=12)
Out[36]:
Clustering is the process of partitioning a set of data (or objects) into a set of meaningful sub-classes, called clusters.
• Helps users understand the natural grouping or structure in a data set.
• Clustering is unsupervised classification: there are no predefined classes.
• Used either as a stand-alone tool to get insight into the data distribution or as a preprocessing step for other algorithms.
• Other uses: data compression, outlier detection, and understanding human concept formation.
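As a quick illustration of unsupervised grouping, here is a minimal sketch that clusters the unlabeled x3/y3 points generated above with scikit-learn's KMeans. The choice of k=3 is an assumption we supply, not something the algorithm discovers, and the colors in the plot come from the discovered cluster indices, not from any predefined class.
In [ ]:
# Hypothetical sketch: cluster the unlabeled x3/y3 points from above with k-means.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

X3 = np.column_stack([x3, y3])                       # pack coordinates into an (n, 2) array
km = KMeans(n_clusters=3, n_init=10, random_state=0)  # k=3 is assumed, not learned
labels = km.fit_predict(X3)                          # cluster indices, found without labels

plt.figure(figsize=(6, 6))
plt.scatter(X3[:, 0], X3[:, 1], c=labels, s=80)
plt.plot(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 'k*', ms=15)
plt.title('k-means assignments (unsupervised)')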
In [4]:
from IPython.display import Image
Image('/gh/img/clustering/unsup.png')
Out[4]:
In [ ]:
# TODO: Example scatterplot of three clusters
# note: we don't understand anything more about the system using regression here
Advantages (of k-means)
• Simple and understandable
• Items are automatically assigned to clusters

Disadvantages
• Must pick the number of clusters beforehand (a sketch of this follows below)
• Often terminates at a local optimum
• All items are forced into a cluster
• Too sensitive to outliers
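To make the first disadvantage concrete, here is a minimal sketch (assuming the x3/y3 data from the cells above and scikit-learn's KMeans) of the usual manual workaround: run k-means for several values of k and compare the within-cluster sum of squares (inertia_). Nothing in the algorithm itself picks k for you.
In [ ]:
# Hypothetical sketch: k must be chosen by the user; sweep a few values and compare.
import numpy as np
from sklearn.cluster import KMeans

X3 = np.column_stack([x3, y3])
for k in range(1, 7):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X3)
    # inertia_ is the sum of squared distances to the nearest centroid; it always
    # decreases as k grows, so the "elbow" has to be judged by eye
    print(k, round(km.inertia_, 1))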
In [ ]:
TODO: give them a link to the clustering slides; they are basically a textbook
TODO: k-medians screenshot

The k-means algorithm is sensitive to outliers!
• An object with an extremely large value can substantially distort the distribution of the data.
• There are other limitations as well, in particular the cost of calculating distances to centroids.
• K-Medoids: instead of taking the mean value of the objects in a cluster as the reference point, use a medoid, the most centrally located object in the cluster (a sketch of this idea follows below).
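A minimal, hypothetical sketch of the medoid idea in plain NumPy (the medoid helper below is ours, not a library function): with a fixed set of cluster members, the medoid is the member with the smallest total distance to the others, so a single extreme outlier cannot drag it the way it drags the mean.
In [ ]:
# Hypothetical sketch: medoid vs. mean of one cluster, in plain NumPy.
import numpy as np

def medoid(points):
    """Return the member of `points` with the smallest total distance to all others."""
    # pairwise Euclidean distances between all members of the cluster
    d = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
    return points[d.sum(axis=1).argmin()]

cluster = np.column_stack([x3_1, y3_1])                # one of the blobs from above
with_outlier = np.vstack([cluster, [[100.0, 100.0]]])  # add one extreme value

print("mean:  ", with_outlier.mean(axis=0))   # dragged toward the outlier
print("medoid:", medoid(with_outlier))        # stays with the bulk of the points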
Different initial seed --> different result
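A minimal sketch of that sensitivity, assuming the x3/y3 data from above and scikit-learn's KMeans with n_init=1 so each run keeps its single random initialization: different seeds can converge to different local optima with different inertia.
In [ ]:
# Hypothetical sketch: same data, different initial seeds, possibly different solutions.
import numpy as np
from sklearn.cluster import KMeans

X3 = np.column_stack([x3, y3])
for seed in range(5):
    km = KMeans(n_clusters=3, n_init=1, random_state=seed).fit(X3)
    # with one initialization per run, the final inertia can differ from seed to seed
    print("seed", seed, "inertia", round(km.inertia_, 2))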
Measured against manually labeled data
• We manually assign tuples to clusters according to their properties (e.g., professors in different research areas).
• Accuracy of clustering: the percentage of pairs of tuples in the same cluster that share a common label (a sketch of this measure follows below).
• This measure favors many small clusters.
• To compensate, we let each approach generate the same number of clusters.
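Here is a minimal sketch of that pair-based accuracy as defined above (the function name and the toy labels are assumptions for illustration): for every pair of items placed in the same cluster, count the fraction whose manual labels also match.
In [ ]:
# Hypothetical sketch: pair-based accuracy of a clustering against manual labels.
import numpy as np
from itertools import combinations

def pairwise_accuracy(cluster_ids, true_labels):
    """Fraction of same-cluster pairs whose manual labels also match."""
    same_cluster_pairs = [(i, j) for i, j in combinations(range(len(cluster_ids)), 2)
                          if cluster_ids[i] == cluster_ids[j]]
    if not same_cluster_pairs:
        return 0.0
    agree = sum(true_labels[i] == true_labels[j] for i, j in same_cluster_pairs)
    return agree / len(same_cluster_pairs)

# toy example: 6 items, 3 predicted clusters, manual labels A/B/C
pred = np.array([0, 0, 1, 1, 2, 2])
true = np.array(['A', 'A', 'B', 'C', 'C', 'C'])
print(pairwise_accuracy(pred, true))  # 2 of 3 same-cluster pairs agree -> ~0.667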