notebook.community

Edit and run



In [156]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from collections import defaultdict









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [157]:

    
def read_transactions(flname):
    fl = open(flname)
    customer_trans = defaultdict(lambda: defaultdict(list))
    for ln in fl:
        customer_name, customer_zipcode, trans_time, trans_product = ln.split(",")
        trans_product = trans_product.split(":")
        customer_trans[customer_name][float(trans_time)].append(trans_product)
    fl.close()
    return customer_trans



In [158]:

    
transactions = read_transactions("../src/python/transactions.txt")



In [159]:

    
len(transactions)









    Out[159]:





1000



In [160]:

    
customer_trans_counts = []
for customer, customer_trans in transactions.iteritems():
    customer_trans_counts.append(len(customer_trans) / 5.0)
plt.hist(customer_trans_counts)
plt.xlabel("Average Transactions per Year", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)









    Out[160]:





<matplotlib.text.Text at 0x10acc0dd0>



In [161]:

    
time_between_trans = []
for customer, customer_trans in transactions.iteritems():
    times = sorted(customer_trans.keys())
    previous_time = times[0]
    for next_time in times[1:]:
        diff = next_time - previous_time
        previous_time = next_time
        time_between_trans.append(diff)
plt.hist(time_between_trans)
plt.xlabel("Elapsed Time Between Transactions (Days)", fontsize=16)
plt.ylabel("Number of Transactions", fontsize=16)









    Out[161]:





<matplotlib.text.Text at 0x115c45210>



In [162]:

    
dry_dog_food_purchases = defaultdict(list)
for customer, customer_trans in transactions.iteritems():
    for trans_time in sorted(customer_trans.keys()):
        items = customer_trans[trans_time]
        for item in items:
            category = item[0]
            if category == "dry dog food":
                dry_dog_food_purchases[customer].append(item)



In [163]:

    
prob_same_item = dict()
for customer, items in dry_dog_food_purchases.items():
    count_same = 0
    previous_item = items[0]
    for next_item in items[1:]:
        if previous_item == next_item:
            count_same += 1
        previous_item = next_item
    prob_same_item[customer] = float(count_same) / float(len(items) - 1)
plt.hist(prob_same_item.values(), bins=20)
plt.xlabel("Probability of Choosing the Same Item Again", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)









    Out[163]:





<matplotlib.text.Text at 0x11987c110>



In [164]:

    
prob_same_brand = dict()
for customer, items in dry_dog_food_purchases.iteritems():
    count_same = 0
    previous_item_brand = items[0][1]
    for next_item in items[1:]:
        if previous_item_brand == next_item[1]:
            count_same += 1
        previous_item_brand = next_item[1]
    prob_same_brand[customer] = float(count_same) / float(len(items) - 1)
plt.hist(prob_same_brand.values())
plt.xlabel("Probability of Buying the Same Brand", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)









    Out[164]:





<matplotlib.text.Text at 0x114756050>



In [165]:

    
prob_same_flavor = dict()
for customer, items in dry_dog_food_purchases.iteritems():
    count_same = 0
    previous_item_flavor = items[0][2]
    for next_item in items[1:]:
        if previous_item_flavor == next_item[2]:
            count_same += 1
        previous_item_flavor = next_item[2]
    prob_same_flavor[customer] = float(count_same) / float(len(items) - 1)
plt.hist(prob_same_flavor.values())
plt.xlabel("Probability of Buying the Same Flavor", fontsize=16)
plt.ylabel("Number of Customers", fontsize=16)









    Out[165]:





<matplotlib.text.Text at 0x1105072d0>



In [166]:

    
loopback_weights = [float(ln.strip()) for ln in open("../src/python/loopback_weights.txt")]
plt.hist(loopback_weights)









    Out[166]:





(array([ 208.,  459.,  533.,  318.,   86.,   82.,  304.,  512.,  453.,  213.]),
 array([ 0.05,  0.14,  0.23,  0.32,  0.41,  0.5 ,  0.59,  0.68,  0.77,
        0.86,  0.95]),
 <a list of 10 Patch objects>)



In [167]:

    
item_ids = dict()
ident = 0
for customer, items in dry_dog_food_purchases.iteritems():
    for item in items:
        item = tuple(item)
        if item not in item_ids:
            item_ids[item] = ident
            ident += 1



In [168]:

    
print len(item_ids)



In [169]:

    
print item_ids.keys()









    



[('dry dog food', 'Dog Days', 'Pork', '30.0\n'), ('dry dog food', 'Happy Pup', 'Pork', '30.0\n'), ('dry dog food', 'Wellfed', 'Chicken', '15.0\n'), ('dry dog food', 'Dog Days', 'Fish & Potato', '30.0\n'), ('dry dog food', 'Wellfed', 'Fish & Potato', '30.0\n'), ('dry dog food', 'Happy Pup', 'Lamb & Rice', '15.0\n'), ('dry dog food', 'Dog Days', 'Pork', '15.0\n'), ('dry dog food', 'Wellfed', 'Chicken', '30.0\n'), ('dry dog food', 'Happy Pup', 'Fish & Potato', '15.0\n'), ('dry dog food', 'Dog Days', 'Fish & Potato', '15.0\n'), ('dry dog food', 'Wellfed', 'Pork', '15.0\n'), ('dry dog food', 'Happy Pup', 'Lamb & Rice', '30.0\n'), ('dry dog food', 'Happy Pup', 'Fish & Potato', '30.0\n'), ('dry dog food', 'Dog Days', 'Chicken', '15.0\n'), ('dry dog food', 'Wellfed', 'Lamb & Rice', '15.0\n'), ('dry dog food', 'Wellfed', 'Pork', '30.0\n'), ('dry dog food', 'Dog Days', 'Lamb & Rice', '30.0\n'), ('dry dog food', 'Wellfed', 'Lamb & Rice', '30.0\n'), ('dry dog food', 'Dog Dogs', 'Chicken', '30.0\n'), ('dry dog food', 'Happy Pup', 'Chicken', '15.0\n'), ('dry dog food', 'Happy Pup', 'Pork', '15.0\n'), ('dry dog food', 'Dog Days', 'Lamb & Rice', '15.0\n'), ('dry dog food', 'Happy Pup', 'Chicken', '30.0\n'), ('dry dog food', 'Wellfed', 'Fish & Potato', '15.0\n')]



In [170]:

    
def prob_same_item(items):
    count_same = 0
    previous_item = items[0]
    for next_item in items[1:]:
        if previous_item == next_item:
            count_same += 1
        previous_item = next_item
    return float(count_same) / float(len(items) - 1)

def prob_same_brand(items):
    count_same = 0
    previous_item = items[0][1]
    for next_item in items[1:]:
        if previous_item == next_item[1]:
            count_same += 1
        previous_item = next_item[1]
    return float(count_same) / float(len(items) - 1)

def prob_same_flavor(items):
    count_same = 0
    previous_item = items[0][2]
    for next_item in items[1:]:
        if previous_item == next_item[2]:
            count_same += 1
        previous_item = next_item[2]
    return float(count_same) / float(len(items) - 1)



In [171]:

    
customer_features = []
for customer, items in dry_dog_food_purchases.iteritems():
    # features are:
    # p(same item), p(same brand), p(same flavor)
    # favorite item, favorite brand, favorite flavor
    features = np.zeros(3)
    features[0] = prob_same_item(items)
    features[1] = prob_same_brand(items)
    features[2] = prob_same_flavor(items)
    customer_features.append(features)
customer_features = np.array(customer_features)



In [172]:

    
from sklearn.cluster import k_means
all_centroids = dict()
center_counts = [2, 4, 8, 12, 16, 20, 24, 30, 36]
inertia_values = []
for k in center_counts:
    centroids, labels, inertia = k_means(customer_features, k)
    inertia_values.append(inertia)
    all_centroids[k] = centroids
    print k, inertia









    



2 27.446766406
4 14.1202029966
8 7.69996123795
12 5.50566057291
16 4.5889706433
20 3.94575986733
24 3.41770900495
30 2.85761295172
36 2.5312369434



In [173]:

    
plt.plot([2, 4, 8, 12, 16, 20, 24, 30, 36], [27.3, 12.4, 6.7, 4.8, 3.4, 3.3, 2.8, 2.3, 2.0], "b.-")
plt.xlabel("Number of Clusters", fontsize=16)
plt.ylabel("Inertia", fontsize=16)
plt.xlim([1, 37])
plt.title("Dry Dog Food Features", fontsize=18)









    Out[173]:





<matplotlib.text.Text at 0x114bdb710>



In [174]:

    
plt.clf()
plt.scatter(customer_features[:, 1], customer_features[:, 2], marker="o", c="k", alpha=0.3)
plt.hold(True)
plt.scatter(all_centroids[16][:, 1], all_centroids[16][:, 2], s=50.0 * np.ones(16), marker="o", c="c")
plt.hold(False)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel("Prob. Purchase Same Brand", fontsize=16)
plt.ylabel("Prob. Purchase Same Flavor", fontsize=16)









    Out[174]:





<matplotlib.text.Text at 0x110cb5a10>



In [ ]: