In [85]:
# code written in py_3.0

import pandas
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# find path to your WineKMC.xlsx
df_offers = pandas.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=0) 
df_offers.head() # use .head() to just show top 5 results


Out[85]:
Offer # Campaign Varietal Minimum Qty (kg) Discount (%) Origin Past Peak
0 1 January Malbec 72 56 France False
1 2 January Pinot Noir 72 17 France False
2 3 February Espumante 144 32 Oregon True
3 4 February Champagne 72 48 France True
4 5 February Cabernet Sauvignon 144 44 New Zealand True

In [4]:
df_sales = pandas.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=1) 
df_sales.head()


Out[4]:
Customer Last Name Offer #
0 Smith 2
1 Smith 24
2 Johnson 17
3 Johnson 24
4 Johnson 26

In [19]:
pivot = pandas.pivot_table(df_sales, index=["Offer #"], columns=["Customer Last Name"], aggfunc=len, fill_value='0')
#pivot.index.name = None
#pivot.columns = pivot.columns.get_level_values(1) # sets cols to product categories
pivot.head()


Out[19]:
Customer Last Name Adams Allen Anderson Bailey Baker Barnes Bell Bennett Brooks Brown ... Turner Walker Ward Watson White Williams Wilson Wood Wright Young
Offer #
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 1 0 1 0 0 0 0 0 1 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 100 columns


In [64]:
# convert it to a numpy matrix
X = pivot.as_matrix()
X = np.matrix(X)

# take the transpose of x
X = X.T

In [88]:
kmeans = KMeans(n_clusters=4, random_state=10).fit_predict(X) # seed of 10 for reproducibility.

kmeans


Out[88]:
array([2, 1, 3, 2, 1, 1, 3, 2, 1, 2, 0, 3, 2, 0, 0, 3, 0, 3, 2, 1, 2, 1, 1,
       0, 3, 1, 1, 0, 1, 3, 1, 1, 1, 1, 2, 2, 1, 2, 0, 2, 3, 3, 1, 0, 2, 1,
       2, 1, 0, 0, 1, 1, 3, 3, 1, 2, 3, 1, 2, 1, 1, 1, 0, 2, 2, 3, 3, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 3, 1, 1, 3, 0, 0, 1, 3, 2, 2, 2, 0, 0, 1, 1, 2,
       1, 2, 1, 1, 2, 1, 1, 0])

In [72]:
# get list unique customer names
names = df_sales["Customer Last Name"].unique()
names


Out[72]:
array(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Miller', 'Davis',
       'Garcia', 'Rodriguez', 'Wilson', 'Martinez', 'Anderson', 'Taylor',
       'Thomas', 'Hernandez', 'Moore', 'Martin', 'Jackson', 'Thompson',
       'White', 'Lopez', 'Lee', 'Gonzalez', 'Harris', 'Clark', 'Lewis',
       'Robinson', 'Walker', 'Perez', 'Hall', 'Young', 'Allen', 'Sanchez',
       'Wright', 'King', 'Scott', 'Green', 'Baker', 'Adams', 'Nelson',
       'Hill', 'Ramirez', 'Campbell', 'Mitchell', 'Roberts', 'Carter',
       'Phillips', 'Evans', 'Turner', 'Torres', 'Parker', 'Collins',
       'Edwards', 'Stewart', 'Flores', 'Morris', 'Nguyen', 'Murphy',
       'Rivera', 'Cook', 'Rogers', 'Morgan', 'Peterson', 'Cooper', 'Reed',
       'Bailey', 'Bell', 'Gomez', 'Kelly', 'Howard', 'Ward', 'Cox', 'Diaz',
       'Richardson', 'Wood', 'Watson', 'Brooks', 'Bennett', 'Gray',
       'James', 'Reyes', 'Cruz', 'Hughes', 'Price', 'Myers', 'Long',
       'Foster', 'Sanders', 'Ross', 'Morales', 'Powell', 'Sullivan',
       'Russell', 'Ortiz', 'Jenkins', 'Gutierrez', 'Perry', 'Butler',
       'Barnes', 'Fisher'], dtype=object)

In [83]:
# make dataframe of customer names
df_names = pandas.DataFrame({"Customer Last Name": names})

# add list clusters customers belong to
df_names = df_names.assign(Cluster = kmeans)
df_names.head()


Out[83]:
Customer Last Name Cluster
0 Smith 2
1 Johnson 3
2 Williams 1
3 Brown 2
4 Jones 3

In [90]:
range_n_clusters = [2, 3, 4, 5, 6, 7]

for n_clusters in range_n_clusters:
    # initialize kmeans for each n clusters between 2--6
    kmeans = KMeans(n_clusters=n_clusters, random_state=10) # seed of 10 for reproducibility.
    cluster_labels = kmeans.fit_predict(X)

    # silhouette_score for n clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)


For n_clusters = 2 The average silhouette_score is : 0.0936557328349
For n_clusters = 3 The average silhouette_score is : 0.118899428636
For n_clusters = 4 The average silhouette_score is : 0.123470539196
For n_clusters = 5 The average silhouette_score is : 0.14092516242
For n_clusters = 6 The average silhouette_score is : 0.137179893911
For n_clusters = 7 The average silhouette_score is : 0.116109245662

kmeans with 5 clusters is optimal for this dataset


In [ ]: