In [1]:
# code written in py_3.0

import pandas as pd
import numpy as np

df_sales = pd.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=1)
df_sales.columns = ['name', 'offer']
df_sales.head()


Out[1]:
name offer
0 Smith 2
1 Smith 24
2 Johnson 17
3 Johnson 24
4 Johnson 26

In [2]:
# get list unique customer names
names = df_sales.name.unique()
names


Out[2]:
array(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Miller', 'Davis',
       'Garcia', 'Rodriguez', 'Wilson', 'Martinez', 'Anderson', 'Taylor',
       'Thomas', 'Hernandez', 'Moore', 'Martin', 'Jackson', 'Thompson',
       'White', 'Lopez', 'Lee', 'Gonzalez', 'Harris', 'Clark', 'Lewis',
       'Robinson', 'Walker', 'Perez', 'Hall', 'Young', 'Allen', 'Sanchez',
       'Wright', 'King', 'Scott', 'Green', 'Baker', 'Adams', 'Nelson',
       'Hill', 'Ramirez', 'Campbell', 'Mitchell', 'Roberts', 'Carter',
       'Phillips', 'Evans', 'Turner', 'Torres', 'Parker', 'Collins',
       'Edwards', 'Stewart', 'Flores', 'Morris', 'Nguyen', 'Murphy',
       'Rivera', 'Cook', 'Rogers', 'Morgan', 'Peterson', 'Cooper', 'Reed',
       'Bailey', 'Bell', 'Gomez', 'Kelly', 'Howard', 'Ward', 'Cox', 'Diaz',
       'Richardson', 'Wood', 'Watson', 'Brooks', 'Bennett', 'Gray',
       'James', 'Reyes', 'Cruz', 'Hughes', 'Price', 'Myers', 'Long',
       'Foster', 'Sanders', 'Ross', 'Morales', 'Powell', 'Sullivan',
       'Russell', 'Ortiz', 'Jenkins', 'Gutierrez', 'Perry', 'Butler',
       'Barnes', 'Fisher'], dtype=object)

In [3]:
# make dataframe of customer names
df_names = pd.DataFrame({"name": names})
id = df_names.index+1 # give each name a unique id number
id = id.unique()
id


Out[3]:
Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
             14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
             27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
             40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
             53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
             66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
             79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
             92,  93,  94,  95,  96,  97,  98,  99, 100],
           dtype='int64')

In [4]:
id_dict = dict(zip(names, id))
df_sales['id']=df_sales.name.map(id_dict)
df_sales.head()


Out[4]:
name offer id
0 Smith 2 1
1 Smith 24 1
2 Johnson 17 2
3 Johnson 24 2
4 Johnson 26 2

In [5]:
pivot = pd.pivot_table(df_sales, index=["offer"], columns=["id"], aggfunc=len, fill_value='0')
pivot.index.name = None
pivot.columns = pivot.columns.get_level_values(1) # sets cols to product categories
X = pivot.as_matrix()
X = np.matrix(X)
X = X.astype(int)
X


Out[5]:
matrix([[0, 0, 0, ..., 1, 0, 1],
        [1, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 1, 0, 1],
        [0, 0, 1, ..., 0, 1, 1],
        [0, 0, 0, ..., 0, 0, 0]])

In [52]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

dist_out = 1-pairwise_distances(X.T, metric="cosine")
dist_out


Out[52]:
array([[ 1.        ,  0.40824829,  0.        , ...,  0.        ,
         0.        ,  0.26726124],
       [ 0.40824829,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.25819889,
         0.57735027,  0.43643578],
       ..., 
       [ 0.        ,  0.        ,  0.25819889, ...,  1.        ,
         0.2236068 ,  0.6761234 ],
       [ 0.        ,  0.        ,  0.57735027, ...,  0.2236068 ,
         1.        ,  0.37796447],
       [ 0.26726124,  0.        ,  0.43643578, ...,  0.6761234 ,
         0.37796447,  1.        ]])

In [53]:
import networkx as nx
import matplotlib.pyplot as plt
G = nx.from_numpy_matrix(dist_out)
G.graph['name']='cosine similarity graph'

# create network layout for visualizations
pos = nx.spring_layout(G)

nx.draw(G, pos, node_size=50)
print(nx.info(G))
plt.show()


Name: cosine similarity graph
Type: Graph
Number of nodes: 100
Number of edges: 1575
Average degree:  31.5000

In [54]:
r_hood = dist_out < 0.5  # filter out low similarity edges
dist_out[r_hood] = 0      # low values set to 0
G = nx.from_numpy_matrix(dist_out)
G.graph['name']='r-filtered similarity graph'

# create network layout for visualizations
pos = nx.spring_layout(G)

nx.draw(G, pos, node_size=50)
print(nx.info(G))
plt.show() # show filtered graph


Name: r-filtered similarity graph
Type: Graph
Number of nodes: 100
Number of edges: 442
Average degree:   8.8400

In [83]:
import community

# find communities
part = community.best_partition(G)
G.graph['name']='community graph'

# create network layout for visualizations
pos = nx.spring_layout(G)

# plot and color nodes using community structure
community_num = [part.get(node) for node in G.nodes()]
nx.draw(G, pos, cmap = plt.get_cmap("jet"), node_color = community_num, node_size = 50)
print(nx.info(G))
plt.show()


Name: community graph
Type: Graph
Number of nodes: 100
Number of edges: 442
Average degree:   8.8400

In [76]:
# find modularity
mod = community.modularity(part,G)
print("modularity:", mod)


modularity: 0.6749434330467096

In [77]:
community_num = [x+1 for x in community_num] # non-zero indexing for commmunity list
community_dict = dict(zip(names, community_num))
df_sales['community']=df_sales.name.map(community_dict) # map communities to sales
df_sales.head() # note: first five all in same community


Out[77]:
name offer community
0 Smith 2 1
1 Smith 24 1
2 Johnson 17 1
3 Johnson 24 1
4 Johnson 26 1

In [78]:
from collections import Counter

count_dict = dict(zip(df_sales['community'], df_sales['offer'])) # create dictonary of all offers purchased by each community
count_list = Counter(count_dict)
df_communities = pd.DataFrame(sorted(count_list.most_common())) # find most common offer purchased by each community
df_communities.columns = ['Community', 'Offer']
df_communities


Out[78]:
Community Offer
0 1 26
1 2 31
2 3 30
3 4 30
4 5 5
5 6 23
6 7 29
7 8 31
8 9 22
9 10 31
10 11 21

In [79]:
# load info about offers
df_offers = pd.read_excel(open('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx','rb'), sheetname=0)
df_offers.rename(columns={'Offer #':'Offer'}, inplace=True)
df_offers.head()


Out[79]:
Offer Campaign Varietal Minimum Qty (kg) Discount (%) Origin Past Peak
0 1 January Malbec 72 56 France False
1 2 January Pinot Noir 72 17 France False
2 3 February Espumante 144 32 Oregon True
3 4 February Champagne 72 48 France True
4 5 February Cabernet Sauvignon 144 44 New Zealand True

In [80]:
df_communities = df_communities.merge(df_offers, on='Offer', how='left') # merge info on offers with community index
df_communities.rename(columns={'Offer':'Offer Most Purchased'}, inplace=True) # add more accurate lable 
df_communities


Out[80]:
Community Offer Most Purchased Campaign Varietal Minimum Qty (kg) Discount (%) Origin Past Peak
0 1 26 October Pinot Noir 144 83 Australia False
1 2 31 December Champagne 72 89 France False
2 3 30 December Malbec 6 54 France False
3 4 30 December Malbec 6 54 France False
4 5 5 February Cabernet Sauvignon 144 44 New Zealand True
5 6 23 September Chardonnay 144 39 South Africa False
6 7 29 November Pinot Grigio 6 87 France False
7 8 31 December Champagne 72 89 France False
8 9 22 August Champagne 72 63 France False
9 10 31 December Champagne 72 89 France False
10 11 21 August Champagne 12 50 California False

In [ ]: