In [156]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from collections import defaultdict
In [157]:
def read_transactions(flname):
    """Parse a transactions file into a nested per-customer mapping.

    Every non-blank line must hold exactly four comma-separated fields:
    customer name, customer zipcode, transaction time and product. The
    product field is split on ":" into a list of attributes.

    Parameters
    ----------
    flname : str
        Path of the transactions file to read.

    Returns
    -------
    defaultdict
        ``customer name -> {float(transaction time) -> [product, ...]}``,
        where each product is the list of its ":"-separated attributes.
        The zipcode field is parsed but not stored.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly four comma-separated
        fields, or if its time field cannot be converted to float.
    """
    customer_trans = defaultdict(lambda: defaultdict(list))
    # `with` guarantees the handle is closed even when a malformed line raises.
    with open(flname) as fl:
        for ln in fl:
            # Drop the line terminator; otherwise it ends up glued to the
            # last product attribute and breaks equality tests on items.
            ln = ln.rstrip("\r\n")
            if not ln.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            customer_name, _zipcode, trans_time, trans_product = ln.split(",")
            customer_trans[customer_name][float(trans_time)].append(trans_product.split(":"))
    return customer_trans
In [158]:
# Load the nested mapping customer name -> transaction time -> list of products.
# The path is relative, so it resolves against the kernel's working directory.
transactions = read_transactions("../src/python/transactions.txt")
In [159]:
# Number of distinct customer names found in the transactions file.
len(transactions)
Out[159]:
In [160]:
# Histogram of each customer's average number of transactions per year.
# A "transaction" here is one distinct transaction time for that customer.
# NOTE(review): the divisor assumes the data set spans 5 years -- confirm
# against whatever produced transactions.txt.
years_covered = 5.0
# .values() works on both Python 2 and 3 (iteritems() is Python 2 only).
customer_trans_counts = [
    len(trans_by_time) / years_covered
    for trans_by_time in transactions.values()
]
plt.hist(customer_trans_counts)
plt.xlabel("Average Transactions per Year", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)
Out[160]:
In [161]:
# Histogram of the gaps between consecutive transactions of the same customer.
# NOTE(review): the axis label says days; the code only subtracts the raw
# time values, so the unit is whatever transactions.txt uses -- confirm.
time_between_trans = []
# .values() works on both Python 2 and 3 (iteritems() is Python 2 only).
for trans_by_time in transactions.values():
    times = sorted(trans_by_time)
    # Pair each time with its successor; a customer with a single
    # transaction contributes no gaps.
    time_between_trans.extend(
        later - earlier for earlier, later in zip(times, times[1:])
    )
plt.hist(time_between_trans)
plt.xlabel("Elapsed Time Between Transactions (Days)", fontsize=16)
plt.ylabel("Number of Transactions", fontsize=16)
Out[161]:
In [162]:
# Collect, per customer, every purchased item whose first attribute (the
# category) is "dry dog food", in order of increasing transaction time.
dry_dog_food_purchases = defaultdict(list)
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for customer, customer_trans in transactions.items():
    for trans_time in sorted(customer_trans):
        for item in customer_trans[trans_time]:
            if item[0] == "dry dog food":
                dry_dog_food_purchases[customer].append(item)
In [163]:
# Per customer: fraction of consecutive dry-dog-food purchases where the
# exact same item (all attributes equal) was bought again.
# NOTE: a later cell rebinds the name `prob_same_item` to a function, so
# after re-running this cell that function cell must be re-run as well.
prob_same_item = dict()
for customer, items in dry_dog_food_purchases.items():
    if len(items) < 2:
        # A single purchase has no "next" purchase; skipping it avoids a
        # ZeroDivisionError and keeps undefined values out of the histogram.
        continue
    count_same = sum(
        1 for previous_item, next_item in zip(items, items[1:])
        if previous_item == next_item
    )
    prob_same_item[customer] = float(count_same) / float(len(items) - 1)
# Materialise the values so this works whether values() is a list or a view.
plt.hist(list(prob_same_item.values()), bins=20)
plt.xlabel("Probability of Choosing the Same Item Again", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)
Out[163]:
In [164]:
# Per customer: fraction of consecutive dry-dog-food purchases whose
# attribute at index 1 (used here as the brand) is unchanged.
# NOTE: a later cell rebinds the name `prob_same_brand` to a function, so
# after re-running this cell that function cell must be re-run as well.
prob_same_brand = dict()
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for customer, items in dry_dog_food_purchases.items():
    if len(items) < 2:
        # A single purchase has no "next" purchase; skipping it avoids a
        # ZeroDivisionError and keeps undefined values out of the histogram.
        continue
    brands = [item[1] for item in items]
    count_same = sum(
        1 for previous_brand, next_brand in zip(brands, brands[1:])
        if previous_brand == next_brand
    )
    prob_same_brand[customer] = float(count_same) / float(len(items) - 1)
# Materialise the values so this works whether values() is a list or a view.
plt.hist(list(prob_same_brand.values()))
plt.xlabel("Probability of Buying the Same Brand", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)
Out[164]:
In [165]:
# Per customer: fraction of consecutive dry-dog-food purchases whose
# attribute at index 2 (used here as the flavor) is unchanged.
# NOTE: a later cell rebinds the name `prob_same_flavor` to a function, so
# after re-running this cell that function cell must be re-run as well.
prob_same_flavor = dict()
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for customer, items in dry_dog_food_purchases.items():
    if len(items) < 2:
        # A single purchase has no "next" purchase; skipping it avoids a
        # ZeroDivisionError and keeps undefined values out of the histogram.
        continue
    flavors = [item[2] for item in items]
    count_same = sum(
        1 for previous_flavor, next_flavor in zip(flavors, flavors[1:])
        if previous_flavor == next_flavor
    )
    prob_same_flavor[customer] = float(count_same) / float(len(items) - 1)
# Materialise the values so this works whether values() is a list or a view.
plt.hist(list(prob_same_flavor.values()))
plt.xlabel("Probability of Buying the Same Flavor", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)
Out[165]:
In [166]:
# Read one float per line and plot the distribution of the values.
# The context manager closes the file deterministically instead of leaving
# the handle open until garbage collection.
with open("../src/python/loopback_weights.txt") as weights_file:
    # Blank lines (e.g. a trailing newline) are skipped rather than
    # crashing float("").
    loopback_weights = [float(ln.strip()) for ln in weights_file if ln.strip()]
plt.hist(loopback_weights)
Out[166]:
In [167]:
# Assign a consecutive integer id (0, 1, 2, ...) to every distinct
# dry-dog-food item, in order of first appearance while iterating customers.
item_ids = dict()
ident = 0
# .values() works on both Python 2 and 3 (iteritems() is Python 2 only).
for items in dry_dog_food_purchases.values():
    for item in items:
        # Items are lists of attributes; lists are unhashable, so use a tuple key.
        item_key = tuple(item)
        if item_key not in item_ids:
            item_ids[item_key] = ident
            ident += 1
In [168]:
# Number of distinct dry-dog-food items. The parenthesised single-argument
# form prints identically on Python 2 and is valid on Python 3.
print(len(item_ids))
In [169]:
# Show every distinct item tuple. list(...) keeps the printed form a plain
# list on both Python 2 and 3 (on Python 3, keys() alone is a view object).
print(list(item_ids.keys()))
In [170]:
# NOTE: these three functions reuse names that earlier cells bind to
# per-customer dicts; once this cell runs, the names refer to the functions.

def _fraction_repeated(values):
    """Return the fraction of consecutive pairs in `values` that are equal.

    With fewer than two values there is no consecutive pair, so the
    fraction is reported as 0.0 instead of raising ZeroDivisionError
    (one value) or IndexError (no values).
    """
    values = list(values)
    if len(values) < 2:
        return 0.0
    repeats = sum(
        1 for earlier, later in zip(values, values[1:]) if earlier == later
    )
    return float(repeats) / float(len(values) - 1)


def prob_same_item(items):
    """Fraction of consecutive purchases in `items` that are the identical item.

    `items` is a sequence of purchases in purchase order; two purchases
    count as the same when they compare equal. Returns a float in [0, 1],
    or 0.0 when there are fewer than two purchases.
    """
    return _fraction_repeated(items)


def prob_same_brand(items):
    """Fraction of consecutive purchases in `items` sharing attribute index 1.

    Index 1 of each item is used as its brand. Returns a float in [0, 1],
    or 0.0 when there are fewer than two purchases.
    """
    return _fraction_repeated([item[1] for item in items])


def prob_same_flavor(items):
    """Fraction of consecutive purchases in `items` sharing attribute index 2.

    Index 2 of each item is used as its flavor. Returns a float in [0, 1],
    or 0.0 when there are fewer than two purchases.
    """
    return _fraction_repeated([item[2] for item in items])
In [171]:
# Build one feature row per customer with dry-dog-food purchases:
#   column 0: p(same item), column 1: p(same brand), column 2: p(same flavor)
# The original notes also list favorite item / brand / flavor as features;
# those are not computed here.
# Building the array in one expression keeps `customer_features` from being a
# list first and an array later; .values() also works on Python 3, where
# iteritems() does not exist.
customer_features = np.array([
    [prob_same_item(items), prob_same_brand(items), prob_same_flavor(items)]
    for items in dry_dog_food_purchases.values()
])
In [172]:
from sklearn.cluster import k_means

# Run k-means on the customer features for several cluster counts and record
# each run's centroids and inertia, so the counts can be compared afterwards.
all_centroids = dict()
center_counts = [2, 4, 8, 12, 16, 20, 24, 30, 36]
inertia_values = []
for k in center_counts:
    # A fixed random_state makes the initialisation, and so the reported
    # inertias, reproducible across kernel restarts.
    centroids, labels, inertia = k_means(customer_features, k, random_state=0)
    inertia_values.append(inertia)
    all_centroids[k] = centroids
    # Formats like the Python 2 statement `print k, inertia`, and is valid
    # on Python 3 too.
    print("%s %s" % (k, inertia))
In [173]:
# Inertia versus number of clusters. Plot the values computed by the k-means
# sweep cell (`center_counts`, `inertia_values`) rather than hand-copied,
# rounded numbers, so the figure cannot drift out of sync with the results.
plt.plot(center_counts, inertia_values, "b.-")
plt.xlabel("Number of Clusters", fontsize=16)
plt.ylabel("Inertia", fontsize=16)
plt.xlim([1, 37])
plt.title("Dry Dog Food Features", fontsize=18)
Out[173]:
In [174]:
# Overlay the centroids of one k-means solution on the customers' features,
# projected onto columns 1 and 2 of the feature matrix (same brand, same flavor).
n_clusters_shown = 16
shown_centroids = all_centroids[n_clusters_shown]
plt.clf()
plt.scatter(customer_features[:, 1], customer_features[:, 2], marker="o", c="k", alpha=0.3)
# Successive plotting calls draw onto the same axes by default, so the
# plt.hold(True/False) calls are dropped; pyplot.hold no longer exists in
# matplotlib 3.x. A scalar marker size replaces the 16-element size array,
# which duplicated the cluster count.
plt.scatter(shown_centroids[:, 1], shown_centroids[:, 2], s=50.0, marker="o", c="c")
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel("Prob. Purchase Same Brand", fontsize=16)
plt.ylabel("Prob. Purchase Same Flavor", fontsize=16)
Out[174]:
In [ ]: