Workshop 5

Reading the household power consumption data


In [38]:
%matplotlib inline


import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated; model_selection is its replacement
import seaborn as sns

In [10]:
data = pd.read_csv('/Users/camilogarcia/Downloads/household_power_consumption.txt', delimiter=';')

In [23]:
data.head(5)


Out[23]:
Date Time Global_active_power Global_reactive_power Voltage Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
0 16/12/2006 17:24:00 4.216 0.418 234.840 18.400 0.000 1.000 17.0
1 16/12/2006 17:25:00 5.360 0.436 233.630 23.000 0.000 1.000 16.0
2 16/12/2006 17:26:00 5.374 0.498 233.290 23.000 0.000 2.000 17.0
3 16/12/2006 17:27:00 5.388 0.502 233.740 23.000 0.000 1.000 17.0
4 16/12/2006 17:28:00 3.666 0.528 235.680 15.800 0.000 1.000 17.0

In [21]:
data.shape


Out[21]:
(2075259, 9)

In [15]:
data.describe()


Out[15]:
Sub_metering_3
count 2.049280e+06
mean 6.458447e+00
std 8.437154e+00
min 0.000000e+00
25% 0.000000e+00
50% 1.000000e+00
75% 1.700000e+01
max 3.100000e+01

In [17]:
data.dtypes


Out[17]:
Date                      object
Time                      object
Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object
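
Every column except Sub_metering_3 came in as object (strings), which is also why describe() above reported statistics for Sub_metering_3 only. In the UCI version of this file, missing readings are marked with '?', and pandas leaves any column containing them as strings. A minimal alternative load (a sketch; data_numeric is an illustrative name, and it assumes '?' really is the missing-value marker):

In [ ]:
# Parse '?' as NaN at load time so the numeric columns arrive as float64
data_numeric = pd.read_csv('/Users/camilogarcia/Downloads/household_power_consumption.txt',
                           delimiter=';', na_values='?', low_memory=False)
data_numeric.dtypes  # everything except Date and Time should now be float64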

In [18]:
# Columns 2 through 8 (Global_active_power .. Sub_metering_3); rows with missing readings have NaN in Sub_metering_3, so dropna() discards them
power_consumption = data.iloc[:, 2:9].dropna()

In [20]:
power_consumption.head(5)


Out[20]:
Global_active_power Global_reactive_power Voltage Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
0 4.216 0.418 234.840 18.400 0.000 1.000 17.0
1 5.360 0.436 233.630 23.000 0.000 1.000 16.0
2 5.374 0.498 233.290 23.000 0.000 2.000 17.0
3 5.388 0.502 233.740 23.000 0.000 1.000 17.0
4 3.666 0.528 235.680 15.800 0.000 1.000 17.0
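
Since the missing readings appear as '?' strings in the object columns, it is worth confirming that dropna(), which only keys off the NaN values in Sub_metering_3, removed them all. A quick check (sketch):

In [ ]:
(power_consumption == '?').any()  # should be False for every column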

In [41]:
pc_toarray = power_consumption.values.astype(float)  # readings are still strings; convert them to floats explicitly

# Keep a random 1% sample so PCA and KMeans plotting stay fast; the 99% remainder is unused here
hpc_fit, hpc_fit1 = train_test_split(pc_toarray, train_size=.01)
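
Here train_test_split is used purely as a convenient random subsampler. A quick look at what it produced (sketch):

In [ ]:
print(hpc_fit.shape, hpc_fit1.shape)  # roughly 1% and 99% of the rows, respectively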

In [43]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA  # used to reduce the data to two dimensions for plotting

hpc = PCA(n_components=2).fit_transform(hpc_fit)  # project the sample onto its first two principal components
k_means = KMeans()  # library defaults: n_clusters=8, k-means++ initialization
k_means.fit(hpc)

x_min, x_max = hpc[:, 0].min() - 5, hpc[:, 0].max() - 1
y_min, y_max = hpc[:, 1].min(), hpc[:, 1].max() + 5
# Evaluate the fitted model over a fine grid so the cluster regions can be drawn as an image
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=plt.cm.Paired,
          aspect='auto', origin='lower')

plt.plot(hpc[:, 0], hpc[:, 1], 'k.', markersize=4)
# Mark the centroids with a white X
centroids = k_means.cluster_centers_
inert = k_means.inertia_  # within-cluster sum of squares, kept for reference
plt.scatter(centroids[:, 0], centroids[:, 1],
           marker='x', s=169, linewidths=3,
           color='w', zorder=8)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
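
Because KMeans() above ran with library defaults, it helps to confirm what was actually fitted. A quick inspection (sketch; uses the k_means and hpc objects from the cell above):

In [ ]:
print(k_means.n_clusters)            # 8, the sklearn default
print(k_means.inertia_)              # total within-cluster sum of squares
print(np.bincount(k_means.labels_))  # points assigned to each cluster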



In [44]:
import numpy as np
from scipy.spatial.distance import cdist, pdist
from matplotlib import pyplot as plt

# Determine your k range
k_range = range(1,14)

# Fit the kmeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(hpc) for k in k_range]

# Pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]

# Calculate the Euclidean distance from 
# each point to each cluster center
k_euclid = [cdist(hpc, cent, 'euclidean') for cent in centroids]
dist = [np.min(ke,axis=1) for ke in k_euclid]

# Total within-cluster sum of squares
wcss = [sum(d**2) for d in dist]

# The total sum of squares
tss = sum(pdist(hpc)**2)/hpc.shape[0]

# The between-cluster sum of squares
bss = tss - wcss

# elbow curve
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(k_range, bss/tss*100, 'b*-')
ax.set_ylim((0,100))
plt.grid(True)
plt.xlabel('n_clusters')
plt.ylabel('Percentage of variance explained')
plt.title('Variance Explained vs. k')


Out[44]:
<matplotlib.text.Text at 0x12eacd750>
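
The within-cluster sums of squares computed by hand above are also available directly from the fitted models, since KMeans stores the same quantity as inertia_. A one-line cross-check (sketch, reusing k_means_var from the cell above):

In [ ]:
wcss_from_inertia = [m.inertia_ for m in k_means_var]  # should match wcss up to floating-point noise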

In [45]:
# A silhouette comparison was run for k=8 with random init and k=7 with k-means++; fit k=7 here (k-means++ is the default init)
k_means = KMeans(n_clusters=7)
k_means.fit(hpc)

x_min, x_max = hpc[:, 0].min() - 5, hpc[:, 0].max() - 1
y_min, y_max = hpc[:, 1].min() + 1, hpc[:, 1].max() + 5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=plt.cm.Paired,
          aspect='auto', origin='lower')

plt.plot(hpc[:, 0], hpc[:, 1], 'k.', markersize=4)
# Plot the centroids as a white X
centroids = k_means.cluster_centers_
inert = k_means.inertia_
plt.scatter(centroids[:, 0], centroids[:, 1],
           marker='x', s=169, linewidths=3,
           color='w', zorder=8)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()



In [ ]:
from sklearn.metrics import silhouette_score

labels = k_means.labels_
silhouette_score(hpc, labels, metric='euclidean')
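
To back the 7-versus-8 choice noted above, the score can also be computed across a range of k values. A sketch (sample_size keeps the pairwise-distance computation manageable on a ~20,000-point sample):

In [ ]:
for k in range(2, 10):
    labels_k = KMeans(n_clusters=k).fit_predict(hpc)
    print(k, silhouette_score(hpc, labels_k, metric='euclidean', sample_size=5000))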
