In [119]:
%load_ext sql
%pylab inline

# Explicit imports are placed after %pylab so that they take precedence over
# any name injected by its star-import; `np` is imported explicitly instead of
# relying on %pylab to provide it.
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
In [26]:
%sql postgresql://sharknado:sharknado@sharknado-dse.ceg3hdkdq8l0.us-east-1.rds.amazonaws.com/sharknado
Out[26]:
In [121]:
# Fetch every column of the panel_demos_11 table through the %sql magic and
# keep the returned result set in `result`.
result = %sql select * from panel_demos_11
In [122]:
# Convert the SQL result set held in `result` into a DataFrame.
panelist_df = result.DataFrame()
In [ ]:
In [ ]:
In [ ]:
In [123]:
# Build the feature matrix from the raw query result instead of mutating
# `panelist_df` in place, so this cell can be re-run safely. Previously the
# `panelist_id` column was consumed in place and `panelist_df` was then
# rebound to a NumPy array, so executing the cell a second time failed.
panelist_features = result.DataFrame().set_index(['panelist_id'])

# Impute missing values with the mean of their column
# (assumes every remaining column is numeric — TODO confirm against the table).
panelist_features = panelist_features.fillna(panelist_features.mean())

# Standardise each column. `scale` returns a plain ndarray, so the
# panelist_id index and the column names are not carried over; the
# `panelist_df` name is kept because the cells below read it.
panelist_df = scale(panelist_features)
In [ ]:
In [ ]:
In [126]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component (n_components defaults to None) so the
# complete explained-variance spectrum is available to the cells below.
pca = PCA()
pca.fit(panelist_df)
Out[126]:
In [127]:
# Running total of the variance share captured by the first k components.
cumulative_explained = np.cumsum(pca.explained_variance_ratio_)

# Plot against a 1-based component count: with the implicit 0-based x axis the
# value for "k components" was drawn at x = k - 1, which makes the curve read
# one component too low.
component_counts = np.arange(1, len(cumulative_explained) + 1)
plt.plot(component_counts, cumulative_explained)
plt.xlabel('n_components')
plt.ylabel('cumulative explained_variance_ratio_')
plt.grid()
In [ ]:
In [128]:
# Scree plot: share of total variance explained by each individual component.
plt.figure(1, figsize=(8, 6))
plt.clf()
plt.axes([.2, .2, .7, .7])

explained_ratio = pca.explained_variance_ratio_
# 1-based x values so that the tick under a point is the component number,
# matching the 'n_components' axis label.
plt.plot(np.arange(1, len(explained_ratio) + 1), explained_ratio, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
# The plotted quantity is the ratio, not the raw explained_variance_ attribute.
plt.ylabel('explained_variance_ratio_')
plt.grid()
In [129]:
# Plot the loadings of the leading principal components, one line each.
# Slicing with [:3] (instead of indexing rows 0, 1 and 2 directly) avoids an
# IndexError when the fitted PCA has fewer than three components.
for rank, loadings in enumerate(pca.components_[:3], start=1):
    plt.plot(loadings, label='PC%d' % rank)
plt.xlabel('feature index')
plt.ylabel('loading')
# Trailing semicolon suppresses the Legend repr in the cell output.
plt.legend();
Out[129]:
In [154]:
# Project the standardised panel data onto its first three principal
# components, then display the shape of the projected array.
pca_3d = PCA(n_components=3)
reduced_data = pca_3d.fit_transform(panelist_df)
reduced_data.shape
Out[154]:
In [160]:
# Cluster the 3-component projection with k-means++ initialisation and a
# fixed seed for repeatable results.
# NOTE(review): n_clusters=1 assigns every point to the same cluster —
# confirm this is the intended value of k.
kmeans_model = KMeans(
    init='k-means++',
    n_clusters=1,
    random_state=1,
)
kmeans_model.fit(reduced_data)
Out[160]:
In [161]:
# Count how many rows were assigned to each cluster label.
cluster_labels = pd.Series(kmeans_model.labels_)
cluster_labels.value_counts()
Out[161]:
In [165]:
# 3D scatter of the three-component projection, coloured by k-means label.
fig = plt.figure(1, figsize=(8, 6))
fig.clf()

# Create the 3D axes through the figure so they are actually attached to it;
# constructing Axes3D(fig, ...) directly no longer adds the axes to the figure
# on current Matplotlib, which leaves the plot empty.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

labels = kmeans_model.labels_
# The builtin `float` replaces the `np.float` alias, which was removed in
# NumPy 1.24 and raises AttributeError there.
ax.scatter(
    reduced_data[:, 0],
    reduced_data[:, 1],
    reduced_data[:, 2],
    c=labels.astype(float),
    marker='x',
)

# Hide the tick labels on all three axes. `xaxis`/`yaxis`/`zaxis` are used in
# place of the `w_xaxis`/`w_yaxis`/`w_zaxis` aliases removed in Matplotlib 3.8.
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.set_ticklabels([])

ax.set_xlabel('x')
ax.set_ylabel('y')
# Trailing semicolon suppresses the Text repr in the cell output.
ax.set_zlabel('z');
Out[165]:
In [ ]:
In [ ]: