Load data from http://media.wiley.com/product_ancillary/6X/11186614/DOWNLOAD/ch02.zip, WineKMC.xlsx



In [85]:

    
# code written in py_3.0

import pandas
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# find path to your WineKMC.xlsx
df_offers = pandas.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=0) 
df_offers.head() # use .head() to just show top 5 results









    Out[85]:






  
    
      
      Offer #
      Campaign
      Varietal
      Minimum Qty (kg)
      Discount (%)
      Origin
      Past Peak
    
  
  
    
      0
      1
      January
      Malbec
      72
      56
      France
      False
    
    
      1
      2
      January
      Pinot Noir
      72
      17
      France
      False
    
    
      2
      3
      February
      Espumante
      144
      32
      Oregon
      True
    
    
      3
      4
      February
      Champagne
      72
      48
      France
      True
    
    
      4
      5
      February
      Cabernet Sauvignon
      144
      44
      New Zealand
      True



In [4]:

    
df_sales = pandas.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=1) 
df_sales.head()









    Out[4]:






  
    
      
      Customer Last Name
      Offer #
    
  
  
    
      0
      Smith
      2
    
    
      1
      Smith
      24
    
    
      2
      Johnson
      17
    
    
      3
      Johnson
      24
    
    
      4
      Johnson
      26



In [19]:

    
pivot = pandas.pivot_table(df_sales, index=["Offer #"], columns=["Customer Last Name"], aggfunc=len, fill_value='0')
#pivot.index.name = None
#pivot.columns = pivot.columns.get_level_values(1) # sets cols to product categories
pivot.head()









    Out[19]:






  
    
      Customer Last Name
      Adams
      Allen
      Anderson
      Bailey
      Baker
      Barnes
      Bell
      Bennett
      Brooks
      Brown
      ...
      Turner
      Walker
      Ward
      Watson
      White
      Williams
      Wilson
      Wood
      Wright
      Young
    
    
      Offer #
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
    
    
      2
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0
      1
      0
      0
      0
      0
      0
      1
      0
    
    
      5
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 100 columns



In [64]:

    
# convert it to a numpy matrix
X = pivot.as_matrix()
X = np.matrix(X)

# take the transpose of x
X = X.T



In [88]:

    
kmeans = KMeans(n_clusters=4, random_state=10).fit_predict(X) # seed of 10 for reproducibility.

kmeans









    Out[88]:





array([2, 1, 3, 2, 1, 1, 3, 2, 1, 2, 0, 3, 2, 0, 0, 3, 0, 3, 2, 1, 2, 1, 1,
       0, 3, 1, 1, 0, 1, 3, 1, 1, 1, 1, 2, 2, 1, 2, 0, 2, 3, 3, 1, 0, 2, 1,
       2, 1, 0, 0, 1, 1, 3, 3, 1, 2, 3, 1, 2, 1, 1, 1, 0, 2, 2, 3, 3, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 3, 1, 1, 3, 0, 0, 1, 3, 2, 2, 2, 0, 0, 1, 1, 2,
       1, 2, 1, 1, 2, 1, 1, 0])



In [72]:

    
# get list unique customer names
names = df_sales["Customer Last Name"].unique()
names









    Out[72]:





array(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Miller', 'Davis',
       'Garcia', 'Rodriguez', 'Wilson', 'Martinez', 'Anderson', 'Taylor',
       'Thomas', 'Hernandez', 'Moore', 'Martin', 'Jackson', 'Thompson',
       'White', 'Lopez', 'Lee', 'Gonzalez', 'Harris', 'Clark', 'Lewis',
       'Robinson', 'Walker', 'Perez', 'Hall', 'Young', 'Allen', 'Sanchez',
       'Wright', 'King', 'Scott', 'Green', 'Baker', 'Adams', 'Nelson',
       'Hill', 'Ramirez', 'Campbell', 'Mitchell', 'Roberts', 'Carter',
       'Phillips', 'Evans', 'Turner', 'Torres', 'Parker', 'Collins',
       'Edwards', 'Stewart', 'Flores', 'Morris', 'Nguyen', 'Murphy',
       'Rivera', 'Cook', 'Rogers', 'Morgan', 'Peterson', 'Cooper', 'Reed',
       'Bailey', 'Bell', 'Gomez', 'Kelly', 'Howard', 'Ward', 'Cox', 'Diaz',
       'Richardson', 'Wood', 'Watson', 'Brooks', 'Bennett', 'Gray',
       'James', 'Reyes', 'Cruz', 'Hughes', 'Price', 'Myers', 'Long',
       'Foster', 'Sanders', 'Ross', 'Morales', 'Powell', 'Sullivan',
       'Russell', 'Ortiz', 'Jenkins', 'Gutierrez', 'Perry', 'Butler',
       'Barnes', 'Fisher'], dtype=object)



In [83]:

    
# make dataframe of customer names
df_names = pandas.DataFrame({"Customer Last Name": names})

# add list clusters customers belong to
df_names = df_names.assign(Cluster = kmeans)
df_names.head()









    Out[83]:






  
    
      
      Customer Last Name
      Cluster
    
  
  
    
      0
      Smith
      2
    
    
      1
      Johnson
      3
    
    
      2
      Williams
      1
    
    
      3
      Brown
      2
    
    
      4
      Jones
      3



In [90]:

    
range_n_clusters = [2, 3, 4, 5, 6, 7]

for n_clusters in range_n_clusters:
    # initialize kmeans for each n clusters between 2--6
    kmeans = KMeans(n_clusters=n_clusters, random_state=10) # seed of 10 for reproducibility.
    cluster_labels = kmeans.fit_predict(X)

    # silhouette_score for n clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)









    



For n_clusters = 2 The average silhouette_score is : 0.0936557328349
For n_clusters = 3 The average silhouette_score is : 0.118899428636
For n_clusters = 4 The average silhouette_score is : 0.123470539196
For n_clusters = 5 The average silhouette_score is : 0.14092516242
For n_clusters = 6 The average silhouette_score is : 0.137179893911
For n_clusters = 7 The average silhouette_score is : 0.116109245662

kmeans with 5 clusters is optimal for this dataset



In [ ]:

	Offer #	Campaign	Varietal	Minimum Qty (kg)	Discount (%)	Origin	Past Peak
0	1	January	Malbec	72	56	France	False
1	2	January	Pinot Noir	72	17	France	False
2	3	February	Espumante	144	32	Oregon	True
3	4	February	Champagne	72	48	France	True
4	5	February	Cabernet Sauvignon	144	44	New Zealand	True

Customer Last Name	Adams	Allen	Anderson	Bailey	Baker	Barnes	Bell	Bennett	Brooks	Brown	...	Turner	Walker	Ward	Watson	White	Williams	Wilson	Wood	Wright	Young
Offer #
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
2	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	1	0	1	0	0	0	0	0	1	0
5	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0