Load the data from http://media.wiley.com/product_ancillary/6X/11186614/DOWNLOAD/ch02.zip (file: WineKMC.xlsx).
In [1]:
# code written for Python 3
import pandas as pd
import numpy as np

# Load the second worksheet (sheet index 1): one row per (customer, offer) sale.
# `sheetname=` was removed from pandas.read_excel -- the parameter is `sheet_name=`.
# read_excel accepts a path directly, so the open(..., 'rb') wrapper is unnecessary.
# NOTE(review): hardcoded absolute local path -- prefer a configurable DATA_DIR.
df_sales = pd.read_excel('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx',
                         sheet_name=1)
df_sales.columns = ['name', 'offer']
df_sales.head()
Out[1]:
In [2]:
# unique customer names, in order of first appearance
names = df_sales['name'].unique()
names
Out[2]:
In [3]:
# dataframe of the unique customer names
df_names = pd.DataFrame({"name": names})
# give each name a unique, 1-based id number; the index values are already
# distinct, so the original's extra .unique() call was redundant -- .values
# yields the same plain ndarray.
# NOTE(review): the name `id` shadows the Python builtin id(); kept because
# downstream cells reference this variable by that name.
id = (df_names.index + 1).values
id
Out[3]:
In [4]:
# look-up table: customer name -> numeric id
id_dict = {name: num for name, num in zip(names, id)}
# attach the id to every sales record
df_sales['id'] = df_sales.name.map(id_dict)
df_sales.head()
Out[4]:
In [5]:
# offer x customer incidence table: count of purchases per (offer, customer).
# fill_value must be the numeric 0, not the string '0' -- the string forced an
# object-dtype pivot that then needed astype(int) to repair.
pivot = pd.pivot_table(df_sales, index=["offer"], columns=["id"],
                       aggfunc=len, fill_value=0)
pivot.index.name = None
pivot.columns = pivot.columns.get_level_values(1)  # drop the outer column level
# .as_matrix() was removed from pandas; .values is the equivalent.
# np.matrix is deprecated -- a plain 2-D ndarray works for everything below
# (pairwise_distances, transpose, indexing).
X = np.asarray(pivot.values).astype(int)
X
Out[5]:
In [52]:
from sklearn.metrics import pairwise_distances

# pairwise_distances returns cosine *distance*; 1 - distance = similarity.
# Rows of X.T are customers, so this is a customer-by-customer similarity matrix.
# (The original's `from scipy.spatial.distance import cosine` import was unused.)
dist_out = 1 - pairwise_distances(X.T, metric="cosine")
dist_out
Out[52]:
In [53]:
import networkx as nx
import matplotlib.pyplot as plt

# Build a weighted graph from the customer-similarity matrix.
# nx.from_numpy_matrix and nx.info were removed in networkx 3.0;
# from_numpy_array (available since 2.x) and print(G) are the replacements.
# NOTE(review): the diagonal of dist_out is 1 (self-similarity), so this graph
# contains self-loops -- confirm whether they should be removed before layout.
G = nx.from_numpy_array(dist_out)
G.graph['name'] = 'cosine similarity graph'
# create network layout for visualizations
pos = nx.spring_layout(G)
nx.draw(G, pos, node_size=50)
print(G)  # summary: node and edge counts (replaces removed nx.info)
plt.show()
In [54]:
# Zero out weak edges (similarity < 0.5) in a NEW array instead of mutating
# dist_out in place -- in-place mutation made the earlier similarity cell's
# displayed output stale and is a hidden-state hazard on re-runs.
dist_filtered = np.where(dist_out < 0.5, 0, dist_out)
G = nx.from_numpy_array(dist_filtered)  # from_numpy_matrix was removed in networkx 3.0
G.graph['name'] = 'r-filtered similarity graph'
# create network layout for visualizations
pos = nx.spring_layout(G)
nx.draw(G, pos, node_size=50)
print(G)  # nx.info() was removed in networkx 3.0
plt.show()  # show filtered graph
In [83]:
import community  # python-louvain package

# Louvain community detection on the filtered similarity graph.
# NOTE: best_partition is stochastic -- presumably a seed should be fixed for
# reproducible communities; verify against the python-louvain version in use.
part = community.best_partition(G)
G.graph['name'] = 'community graph'
# create network layout for visualizations
pos = nx.spring_layout(G)
# community label per node, in node order, used to color the plot
community_num = list(map(part.get, G.nodes()))
nx.draw(G, pos, cmap=plt.get_cmap("jet"), node_color=community_num, node_size=50)
print(nx.info(G))
plt.show()
In [76]:
# Modularity of the detected partition (roughly in [-0.5, 1]; higher values
# indicate stronger community structure).
mod = community.modularity(part,G)
print("modularity:", mod)
In [77]:
# Shift to 1-based community labels in a NEW variable so this cell is
# idempotent -- the original rebound community_num to itself, which shifted
# the labels again every time the cell was re-run.
community_labels = [num + 1 for num in community_num]
# map each customer name to the community it belongs to
community_dict = dict(zip(names, community_labels))
df_sales['community'] = df_sales.name.map(community_dict)  # map communities to sales
df_sales.head()  # note: first five all in same community
Out[77]:
In [78]:
# BUG FIX: the original built dict(zip(df_sales['community'], df_sales['offer'])),
# which keeps only the LAST offer seen for each community, then passed that dict
# to Counter -- whose most_common() treated the offer *numbers* as counts. It
# never counted purchases at all. Compute the genuinely most-purchased offer per
# community instead (idxmax of the per-community offer frequency; ties resolve
# to the most frequent offer listed first by value_counts).
most_common_offer = (df_sales.groupby('community')['offer']
                     .agg(lambda offers: offers.value_counts().idxmax()))
df_communities = most_common_offer.reset_index()  # sorted by community id
df_communities.columns = ['Community', 'Offer']
df_communities
Out[78]:
In [79]:
# Load descriptive info about the offers (first worksheet, sheet index 0).
# `sheetname=` was removed from pandas.read_excel -- the parameter is `sheet_name=`.
# NOTE(review): hardcoded absolute local path -- prefer a configurable DATA_DIR.
df_offers = pd.read_excel('C:/Users/craigrshenton/Desktop/Dropbox/excel_data_sci/ch02/WineKMC.xlsx',
                          sheet_name=0)
# rename without inplace=True -- assignment is equivalent and chain-friendly
df_offers = df_offers.rename(columns={'Offer #': 'Offer'})
df_offers.head()
Out[79]:
In [80]:
# attach offer details to each community's top offer
df_communities = df_communities.merge(df_offers, on='Offer', how='left')
# give the column a more accurate label
df_communities = df_communities.rename(columns={'Offer': 'Offer Most Purchased'})
df_communities
Out[80]:
In [ ]: