In [1]:
%pylab
%matplotlib inline
import pandas as pd
import sklearn
from sklearn import datasets
# Load the iris data set bundled with scikit-learn.
# NOTE(review): `data` is not referenced by any of the cells visible here -- confirm it is still needed.
data = datasets.load_iris()
In [2]:
# Automobile data set: a tab-separated text file read into a DataFrame.
autos = pd.read_csv('donnees/autos.txt', sep='\t')

# Column subset that feeds the clustering cells.
numericAutos = autos[[
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-size',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
    'price',
]]
In [3]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

# Feature matrix in original units, plus a standardised copy used for clustering.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0.
X = numericAutos.values
scaledX = scale(X)

K = 3
# random_state pins the centroid initialisation so a fresh "Restart & Run All"
# yields the same groups; n_init=10 spells out the long-standing number of
# restarts instead of relying on a default that differs between scikit-learn releases.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=0).fit(scaledX)

# Mean of the unscaled features for each group, collected in label order.
centroids = []
for i in range(K):
    members = kmeans.labels_ == i  # boolean mask of the rows assigned to group i
    c = X[members].mean(0)
    centroids.append(c)
    print('centre du groupe {} ({} observations): {}'.format(i, members.sum(), c))
In [4]:
def plotClusters(K, labels, data, xname, yname, ax=None):
    """Scatter-plot two columns of ``data``, one series per group.

    Parameters
    ----------
    K : int
        Number of groups; the labels ``0 .. K-1`` are plotted.
    labels : array
        Group label of each row of ``data``; ``labels == i`` is used as a
        boolean row mask, so it must support element-wise comparison.
    data : pandas.DataFrame
        Table containing the columns ``xname`` and ``yname``.
    xname, yname : str
        Columns drawn on the horizontal / vertical axis; also used as the
        axis labels.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure with a single Axes is
        created.

    Returns
    -------
    None
    """
    if ax is None:
        # Imported locally so the function does not rely on the names that
        # `%pylab` star-imports into the notebook namespace.
        import matplotlib.pyplot as plt
        _, ax = plt.subplots()

    for i in range(K):
        members = labels == i  # rows belonging to group i
        ax.scatter(data.loc[members, xname], data.loc[members, yname],
                   label='groupe {}'.format(i))
    ax.legend()
    ax.set_xlabel(xname)
    ax.set_ylabel(yname)
# Show the K-means groups in the curb-weight / city-mpg plane.
plotClusters(K, kmeans.labels_, autos, 'curb-weight', 'city-mpg')
In [5]:
from sklearn.cluster import AgglomerativeClustering

K = 3
# Complete-linkage agglomerative clustering on the scaled feature matrix.
hac = AgglomerativeClustering(n_clusters=K, linkage="complete").fit(scaledX)

# Mean of the unscaled features for each group, collected in label order.
centroids = []
for i in range(K):  # follow K rather than a hard-coded group count
    members = hac.labels_ == i  # boolean mask of the rows assigned to group i
    c = X[members].mean(0)
    centroids.append(c)
    print('centre du groupe {} ({} observations): {}'.format(i, members.sum(), c))

# Show the agglomerative groups in the curb-weight / city-mpg plane.
plotClusters(K, hac.labels_, autos, 'curb-weight', 'city-mpg')