Load data from http://media.wiley.com/product_ancillary/6X/11186614/DOWNLOAD/ch02.zip, WineKMC.xlsx
In [85]:
# code written in py_3.0
import pandas
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
# find path to your WineKMC.xlsx
df_offers = pandas.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=0)
df_offers.head() # use .head() to just show top 5 results
Out[85]:
In [4]:
df_sales = pandas.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=1)
df_sales.head()
Out[4]:
In [19]:
pivot = pandas.pivot_table(df_sales, index=["Offer #"], columns=["Customer Last Name"], aggfunc=len, fill_value='0')
#pivot.index.name = None
#pivot.columns = pivot.columns.get_level_values(1) # sets cols to product categories
pivot.head()
Out[19]:
In [64]:
# convert it to a numpy matrix
X = pivot.as_matrix()
X = np.matrix(X)
# take the transpose of x
X = X.T
In [88]:
kmeans = KMeans(n_clusters=4, random_state=10).fit_predict(X) # seed of 10 for reproducibility.
kmeans
Out[88]:
In [72]:
# get list unique customer names
names = df_sales["Customer Last Name"].unique()
names
Out[72]:
In [83]:
# make dataframe of customer names
df_names = pandas.DataFrame({"Customer Last Name": names})
# add list clusters customers belong to
df_names = df_names.assign(Cluster = kmeans)
df_names.head()
Out[83]:
In [90]:
range_n_clusters = [2, 3, 4, 5, 6, 7]
for n_clusters in range_n_clusters:
# initialize kmeans for each n clusters between 2--6
kmeans = KMeans(n_clusters=n_clusters, random_state=10) # seed of 10 for reproducibility.
cluster_labels = kmeans.fit_predict(X)
# silhouette_score for n clusters
silhouette_avg = silhouette_score(X, cluster_labels)
print("For n_clusters =", n_clusters,
"The average silhouette_score is :", silhouette_avg)
kmeans with 5 clusters is optimal for this dataset
In [ ]: