In [39]:
%pylab inline

import pandas as pd

from soln.bracket import get_fixed_and_var_cost
from soln.dataset import get_augmented_train_and_test_set
from soln.dataset import get_component_info_df
from soln.dataset import inverse_log_transform_y
from soln.dataset import load_raw_components
from soln.utils import count_components


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Expensive preprocessing step (~14 s wall time when this output was
# captured); %time surfaces the cost to readers before a full re-run.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()


CPU times: user 13.3 s, sys: 148 ms, total: 13.5 s
Wall time: 13.8 s

In [26]:
# From bracket_pricing.ipynb.

taid = 'TA-00002'
df = aug_train_set[aug_train_set.tube_assembly_id == taid]
quantities = df.quantity.values
costs = inverse_log_transform_y(df.log_cost.values)
fixed_cost, var_cost, r2 = get_fixed_and_var_cost(quantities, costs)
print fixed_cost, var_cost, r2

plt.figure(figsize=(8, 5))
plt.scatter(1.0 / quantities, costs, s=200, marker='+')
plt.xlabel('1 / quantity')
plt.ylabel('per-unit cost')
plt.xlim(-0.1, 1.1)
plt.ylim(0, 25)
xx = np.linspace(-0.1, 1.1, 100)
yy = fixed_cost * xx + var_cost
plt.plot(xx, yy)
plt.savefig('images/bracket.png')


19.0447619647 2.83680093249 0.999937805986

In [27]:
# From bracket_pricing.ipynb.

brapa = (1, 2, 5, 10, 25, 50, 100, 250)
df = aug_train_set[aug_train_set.bracketing_pattern == brapa]
grouped = df.groupby('tube_assembly_id')
taids = []
fixed_costs = []
var_costs = []
for taid, indices in grouped.groups.iteritems():
    quantities = df.quantity[indices].values
    costs = inverse_log_transform_y(df.log_cost[indices].values)
    fixed_cost, var_cost, r2 = get_fixed_and_var_cost(quantities, costs)
    if r2 < 0.9999:
        print "{} has bad r2".format(taid)
    taids.append(taid)
    fixed_costs.append(fixed_cost)
    var_costs.append(var_cost)

In [33]:
# Distribution of the fitted fixed costs across tube assemblies; the
# log-scaled y axis makes the long tail visible.
fig, ax = plt.subplots(figsize=(8, 5))
ax.set_xlabel('fixed_cost')
ax.set_ylabel('number of tubes (log scale)')
ax.hist(fixed_costs, bins=100, log=True)
fig.savefig('images/fixed_costs.png')



In [40]:
# From components.ipynb.
#
# Build a per-component table pairing how often each component appears
# in the train set vs. the test set with its type and group ids.
#
# NOTE(review): merge defaults to an inner join, so a component missing
# from either counts table is dropped here -- presumably count_components
# returns a row for every component in cinfo_df; confirm.

comp_types, group_dfs, cluster_dfs = load_raw_components()
cinfo_df = get_component_info_df(comp_types, group_dfs, cluster_dfs)

train_counts = count_components(aug_train_set, cinfo_df).rename(
    columns={'count': 'train_count'})
test_counts = count_components(aug_test_set, cinfo_df).rename(
    columns={'count': 'test_count'})

all_counts = (cinfo_df[['component_id', 'component_type_id', 'component_group_id']]
              .merge(train_counts, on='component_id')
              .merge(test_counts, on='component_id'))

In [56]:
# Bucket each component's train/test occurrence counts into coarse bins.
#
# NOTE(review): adjacent bins share an edge ((2, 5) and (5, 10) both
# contain 5, etc.).  Because later iterations overwrite earlier ones, a
# boundary count lands in the *later* bin; this reproduces the original
# behavior, but the "lo..hi" labels derived from `bins` are slightly
# misleading at the edges -- confirm whether half-open bins were intended.
bins = [(0, 0), (1, 1), (2, 5), (5, 10), (10, 20), (20, 50), (50, 100), (100, np.inf)]


def assign_bins(df, count_col, bin_col):
    """Write the index of the bin containing df[count_col] into df[bin_col].

    Rows matching no bin keep the sentinel -1.  Mutates df in place.
    """
    df[bin_col] = -1
    for i, (cmin, cmax) in enumerate(bins):
        # Assign via frame-level .loc (not column.loc) -- the original
        # chained form `df.train_bin.loc[mask] = i` raises
        # SettingWithCopyWarning and can silently fail to update the
        # frame in newer pandas.
        df.loc[(df[count_col] >= cmin) & (df[count_col] <= cmax), bin_col] = i


assign_bins(all_counts, 'train_count', 'train_bin')
assign_bins(all_counts, 'test_count', 'test_bin')

In [57]:
# Cross-tabulate components by (train bin, test bin) and label both axes
# with human-readable "lo..hi" ranges; empty combinations render as ''
# instead of NaN for a cleaner table.
str_bins = ['{}..{}'.format(lo, hi) for lo, hi in bins]
grouped = all_counts.groupby(['train_bin', 'test_bin'])
df = grouped.size().unstack().fillna('')
df.index = pd.Index(str_bins, name='train_seen_count')
df.columns = pd.Index(str_bins, name='test_seen_count')
df


Out[57]:
test_seen_count 0..0 1..1 2..5 5..10 10..20 20..50 50..100 100..inf
train_seen_count
0..0 346 399 81 2
1..1 407 140 83 9
2..5 111 94 112 38 2
5..10 2 5 43 37 8
10..20 13 41 2
20..50 9 23 1
50..100 3 6
100..inf 30

In [ ]: