In [156]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from collections import defaultdict


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [157]:
def read_transactions(flname):
    fl = open(flname)
    customer_trans = defaultdict(lambda: defaultdict(list))
    for ln in fl:
        customer_name, customer_zipcode, trans_time, trans_product = ln.split(",")
        trans_product = trans_product.split(":")
        customer_trans[customer_name][float(trans_time)].append(trans_product)
    fl.close()
    return customer_trans

In [158]:
transactions = read_transactions("../src/python/transactions.txt")

In [159]:
len(transactions)


Out[159]:
1000

In [160]:
customer_trans_counts = []
for customer, customer_trans in transactions.iteritems():
    customer_trans_counts.append(len(customer_trans) / 5.0)
plt.hist(customer_trans_counts)
plt.xlabel("Average Transactions per Year", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)


Out[160]:
<matplotlib.text.Text at 0x10acc0dd0>

In [161]:
time_between_trans = []
for customer, customer_trans in transactions.iteritems():
    times = sorted(customer_trans.keys())
    previous_time = times[0]
    for next_time in times[1:]:
        diff = next_time - previous_time
        previous_time = next_time
        time_between_trans.append(diff)
plt.hist(time_between_trans)
plt.xlabel("Elapsed Time Between Transactions (Days)", fontsize=16)
plt.ylabel("Number of Transactions", fontsize=16)


Out[161]:
<matplotlib.text.Text at 0x115c45210>

In [162]:
dry_dog_food_purchases = defaultdict(list)
for customer, customer_trans in transactions.iteritems():
    for trans_time in sorted(customer_trans.keys()):
        items = customer_trans[trans_time]
        for item in items:
            category = item[0]
            if category == "dry dog food":
                dry_dog_food_purchases[customer].append(item)

In [163]:
prob_same_item = dict()
for customer, items in dry_dog_food_purchases.items():
    count_same = 0
    previous_item = items[0]
    for next_item in items[1:]:
        if previous_item == next_item:
            count_same += 1
        previous_item = next_item
    prob_same_item[customer] = float(count_same) / float(len(items) - 1)
plt.hist(prob_same_item.values(), bins=20)
plt.xlabel("Probability of Choosing the Same Item Again", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)


Out[163]:
<matplotlib.text.Text at 0x11987c110>

In [164]:
prob_same_brand = dict()
for customer, items in dry_dog_food_purchases.iteritems():
    count_same = 0
    previous_item_brand = items[0][1]
    for next_item in items[1:]:
        if previous_item_brand == next_item[1]:
            count_same += 1
        previous_item_brand = next_item[1]
    prob_same_brand[customer] = float(count_same) / float(len(items) - 1)
plt.hist(prob_same_brand.values())
plt.xlabel("Probability of Buying the Same Brand", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)


Out[164]:
<matplotlib.text.Text at 0x114756050>

In [165]:
prob_same_flavor = dict()
for customer, items in dry_dog_food_purchases.iteritems():
    count_same = 0
    previous_item_flavor = items[0][2]
    for next_item in items[1:]:
        if previous_item_flavor == next_item[2]:
            count_same += 1
        previous_item_flavor = next_item[2]
    prob_same_flavor[customer] = float(count_same) / float(len(items) - 1)
plt.hist(prob_same_flavor.values())
plt.xlabel("Probability of Buying the Same Flavor", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)


Out[165]:
<matplotlib.text.Text at 0x1105072d0>

In [166]:
loopback_weights = [float(ln.strip()) for ln in open("../src/python/loopback_weights.txt")]
plt.hist(loopback_weights)


Out[166]:
(array([ 208.,  459.,  533.,  318.,   86.,   82.,  304.,  512.,  453.,  213.]),
 array([ 0.05,  0.14,  0.23,  0.32,  0.41,  0.5 ,  0.59,  0.68,  0.77,
        0.86,  0.95]),
 <a list of 10 Patch objects>)

In [167]:
item_ids = dict()
ident = 0
for customer, items in dry_dog_food_purchases.iteritems():
    for item in items:
        item = tuple(item)
        if item not in item_ids:
            item_ids[item] = ident
            ident += 1

In [168]:
print len(item_ids)


24

In [169]:
print item_ids.keys()


[('dry dog food', 'Dog Days', 'Pork', '30.0\n'), ('dry dog food', 'Happy Pup', 'Pork', '30.0\n'), ('dry dog food', 'Wellfed', 'Chicken', '15.0\n'), ('dry dog food', 'Dog Days', 'Fish & Potato', '30.0\n'), ('dry dog food', 'Wellfed', 'Fish & Potato', '30.0\n'), ('dry dog food', 'Happy Pup', 'Lamb & Rice', '15.0\n'), ('dry dog food', 'Dog Days', 'Pork', '15.0\n'), ('dry dog food', 'Wellfed', 'Chicken', '30.0\n'), ('dry dog food', 'Happy Pup', 'Fish & Potato', '15.0\n'), ('dry dog food', 'Dog Days', 'Fish & Potato', '15.0\n'), ('dry dog food', 'Wellfed', 'Pork', '15.0\n'), ('dry dog food', 'Happy Pup', 'Lamb & Rice', '30.0\n'), ('dry dog food', 'Happy Pup', 'Fish & Potato', '30.0\n'), ('dry dog food', 'Dog Days', 'Chicken', '15.0\n'), ('dry dog food', 'Wellfed', 'Lamb & Rice', '15.0\n'), ('dry dog food', 'Wellfed', 'Pork', '30.0\n'), ('dry dog food', 'Dog Days', 'Lamb & Rice', '30.0\n'), ('dry dog food', 'Wellfed', 'Lamb & Rice', '30.0\n'), ('dry dog food', 'Dog Dogs', 'Chicken', '30.0\n'), ('dry dog food', 'Happy Pup', 'Chicken', '15.0\n'), ('dry dog food', 'Happy Pup', 'Pork', '15.0\n'), ('dry dog food', 'Dog Days', 'Lamb & Rice', '15.0\n'), ('dry dog food', 'Happy Pup', 'Chicken', '30.0\n'), ('dry dog food', 'Wellfed', 'Fish & Potato', '15.0\n')]

In [170]:
def prob_same_item(items):
    count_same = 0
    previous_item = items[0]
    for next_item in items[1:]:
        if previous_item == next_item:
            count_same += 1
        previous_item = next_item
    return float(count_same) / float(len(items) - 1)

def prob_same_brand(items):
    count_same = 0
    previous_item = items[0][1]
    for next_item in items[1:]:
        if previous_item == next_item[1]:
            count_same += 1
        previous_item = next_item[1]
    return float(count_same) / float(len(items) - 1)

def prob_same_flavor(items):
    count_same = 0
    previous_item = items[0][2]
    for next_item in items[1:]:
        if previous_item == next_item[2]:
            count_same += 1
        previous_item = next_item[2]
    return float(count_same) / float(len(items) - 1)

In [171]:
customer_features = []
for customer, items in dry_dog_food_purchases.iteritems():
    # features are:
    # p(same item), p(same brand), p(same flavor)
    # favorite item, favorite brand, favorite flavor
    features = np.zeros(3)
    features[0] = prob_same_item(items)
    features[1] = prob_same_brand(items)
    features[2] = prob_same_flavor(items)
    customer_features.append(features)
customer_features = np.array(customer_features)

In [172]:
from sklearn.cluster import k_means
all_centroids = dict()
center_counts = [2, 4, 8, 12, 16, 20, 24, 30, 36]
inertia_values = []
for k in center_counts:
    centroids, labels, inertia = k_means(customer_features, k)
    inertia_values.append(inertia)
    all_centroids[k] = centroids
    print k, inertia


2 27.446766406
4 14.1202029966
8 7.69996123795
12 5.50566057291
16 4.5889706433
20 3.94575986733
24 3.41770900495
30 2.85761295172
36 2.5312369434

In [173]:
plt.plot([2, 4, 8, 12, 16, 20, 24, 30, 36], [27.3, 12.4, 6.7, 4.8, 3.4, 3.3, 2.8, 2.3, 2.0], "b.-")
plt.xlabel("Number of Clusters", fontsize=16)
plt.ylabel("Inertia", fontsize=16)
plt.xlim([1, 37])
plt.title("Dry Dog Food Features", fontsize=18)


Out[173]:
<matplotlib.text.Text at 0x114bdb710>

In [174]:
plt.clf()
plt.scatter(customer_features[:, 1], customer_features[:, 2], marker="o", c="k", alpha=0.3)
plt.hold(True)
plt.scatter(all_centroids[16][:, 1], all_centroids[16][:, 2], s=50.0 * np.ones(16), marker="o", c="c")
plt.hold(False)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel("Prob. Purchase Same Brand", fontsize=16)
plt.ylabel("Prob. Purchase Same Flavor", fontsize=16)


Out[174]:
<matplotlib.text.Text at 0x110cb5a10>

In [ ]: