Workshop 5

Reading the household power consumption data


In [38]:
%matplotlib inline


import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated; model_selection is its replacement
import seaborn as sns

In [10]:
data = pd.read_csv('/Users/camilogarcia/Downloads/household_power_consumption.txt', delimiter=';')

In [23]:
data.head(5)


Out[23]:
Date Time Global_active_power Global_reactive_power Voltage Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
0 16/12/2006 17:24:00 4.216 0.418 234.840 18.400 0.000 1.000 17.0
1 16/12/2006 17:25:00 5.360 0.436 233.630 23.000 0.000 1.000 16.0
2 16/12/2006 17:26:00 5.374 0.498 233.290 23.000 0.000 2.000 17.0
3 16/12/2006 17:27:00 5.388 0.502 233.740 23.000 0.000 1.000 17.0
4 16/12/2006 17:28:00 3.666 0.528 235.680 15.800 0.000 1.000 17.0

In [21]:
data.shape


Out[21]:
(2075259, 9)

In [15]:
data.describe()


Out[15]:
Sub_metering_3
count 2.049280e+06
mean 6.458447e+00
std 8.437154e+00
min 0.000000e+00
25% 0.000000e+00
50% 1.000000e+00
75% 1.700000e+01
max 3.100000e+01

In [17]:
data.dtypes


Out[17]:
Date                      object
Time                      object
Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object
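
Every column except Sub_metering_3 came in as object (strings), which is also why describe() above reported statistics for Sub_metering_3 only. In the UCI version of this file, missing readings are marked with '?', and pandas leaves any column containing them as strings. A minimal alternative load (a sketch; data_numeric is an illustrative name, and it assumes '?' really is the missing-value marker):

In [ ]:
# Parse '?' as NaN at load time so the numeric columns arrive as float64
data_numeric = pd.read_csv('/Users/camilogarcia/Downloads/household_power_consumption.txt',
                           delimiter=';', na_values='?', low_memory=False)
data_numeric.dtypes  # everything except Date and Time should now be float64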

In [18]:
# Columns 2 through 8 (Global_active_power .. Sub_metering_3); rows with missing readings have NaN in Sub_metering_3, so dropna() discards them
power_consumption = data.iloc[:, 2:9].dropna()

In [20]:
power_consumption.head(5)


Out[20]:
Global_active_power Global_reactive_power Voltage Global_intensity Sub_metering_1 Sub_metering_2 Sub_metering_3
0 4.216 0.418 234.840 18.400 0.000 1.000 17.0
1 5.360 0.436 233.630 23.000 0.000 1.000 16.0
2 5.374 0.498 233.290 23.000 0.000 2.000 17.0
3 5.388 0.502 233.740 23.000 0.000 1.000 17.0
4 3.666 0.528 235.680 15.800 0.000 1.000 17.0
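
Since the missing readings appear as '?' strings in the object columns, it is worth confirming that dropna(), which only keys off the NaN values in Sub_metering_3, removed them all. A quick check (sketch):

In [ ]:
(power_consumption == '?').any()  # should be False for every column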

In [41]:
pc_toarray = power_consumption.values.astype(float)  # readings are still strings; convert them to floats explicitly

# Keep a random 1% sample so PCA and KMeans plotting stay fast; the 99% remainder is unused here
hpc_fit, hpc_fit1 = train_test_split(pc_toarray, train_size=.01)
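
Here train_test_split is used purely as a convenient random subsampler. A quick look at what it produced (sketch):

In [ ]:
print(hpc_fit.shape, hpc_fit1.shape)  # roughly 1% and 99% of the rows, respectively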

In [43]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA  # used to reduce the data to two dimensions for plotting

hpc = PCA(n_components=2).fit_transform(hpc_fit)  # project the sample onto its first two principal components
k_means = KMeans()  # library defaults: n_clusters=8, k-means++ initialization
k_means.fit(hpc)

x_min, x_max = hpc[:, 0].min() - 5, hpc[:, 0].max() - 1
y_min, y_max = hpc[:, 1].min(), hpc[:, 1].max() + 5
# Evaluate the fitted model over a fine grid so the cluster regions can be drawn as an image
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=plt.cm.Paired,
          aspect='auto', origin='lower')

plt.plot(hpc[:, 0], hpc[:, 1], 'k.', markersize=4)
# Mark the centroids with a white X
centroids = k_means.cluster_centers_
inert = k_means.inertia_  # within-cluster sum of squares, kept for reference
plt.scatter(centroids[:, 0], centroids[:, 1],
           marker='x', s=169, linewidths=3,
           color='w', zorder=8)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
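
Because KMeans() above ran with library defaults, it helps to confirm what was actually fitted. A quick inspection (sketch; uses the k_means and hpc objects from the cell above):

In [ ]:
print(k_means.n_clusters)            # 8, the sklearn default
print(k_means.inertia_)              # total within-cluster sum of squares
print(np.bincount(k_means.labels_))  # points assigned to each cluster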



In [44]:
import numpy as np
from scipy.spatial.distance import cdist, pdist
from matplotlib import pyplot as plt

# Determine your k range
k_range = range(1,14)

# Fit the kmeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(hpc) for k in k_range]

# Pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]

# Calculate the Euclidean distance from 
# each point to each cluster center
k_euclid = [cdist(hpc, cent, 'euclidean') for cent in centroids]
dist = [np.min(ke,axis=1) for ke in k_euclid]

# Total within-cluster sum of squares
wcss = [sum(d**2) for d in dist]

# The total sum of squares
tss = sum(pdist(hpc)**2)/hpc.shape[0]

# The between-cluster sum of squares
bss = tss - wcss

# elbow curve
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(k_range, bss/tss*100, 'b*-')
ax.set_ylim((0,100))
plt.grid(True)
plt.xlabel('n_clusters')
plt.ylabel('Percentage of variance explained')
plt.title('Variance Explained vs. k')


Out[44]:
<matplotlib.text.Text at 0x12eacd750>
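
The within-cluster sums of squares computed by hand above are also available directly from the fitted models, since KMeans stores the same quantity as inertia_. A one-line cross-check (sketch, reusing k_means_var from the cell above):

In [ ]:
wcss_from_inertia = [m.inertia_ for m in k_means_var]  # should match wcss up to floating-point noise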

In [45]:
# A silhouette comparison was run for k=8 with random init and k=7 with k-means++; fit k=7 here (k-means++ is the default init)
k_means = KMeans(n_clusters=7)
k_means.fit(hpc)

x_min, x_max = hpc[:, 0].min() - 5, hpc[:, 0].max() - 1
y_min, y_max = hpc[:, 1].min() + 1, hpc[:, 1].max() + 5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))
Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=plt.cm.Paired,
          aspect='auto', origin='lower')

plt.plot(hpc[:, 0], hpc[:, 1], 'k.', markersize=4)
# Plot the centroids as a white X
centroids = k_means.cluster_centers_
inert = k_means.inertia_
plt.scatter(centroids[:, 0], centroids[:, 1],
           marker='x', s=169, linewidths=3,
           color='w', zorder=8)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()



In [ ]:
from sklearn.metrics import silhouette_score

labels = k_means.labels_
silhouette_score(hpc, labels, metric='euclidean')
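
To back the 7-versus-8 choice noted above, the score can also be computed across a range of k values. A sketch (sample_size keeps the pairwise-distance computation manageable on a ~20,000-point sample):

In [ ]:
for k in range(2, 10):
    labels_k = KMeans(n_clusters=k).fit_predict(hpc)
    print(k, silhouette_score(hpc, labels_k, metric='euclidean', sample_size=5000))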
