In [2]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from ggplot import *

from probabinerator import Probabinerator

In [2]:
#try it with a different data set, for public release 
# info is here: http://statsmodels.sourceforge.net/devel/datasets/generated/modechoice.html

In [3]:
df = pd.read_csv("http://people.stern.nyu.edu/wgreene/Text/Edition7/TableF18-2.csv")

In [4]:
df.head()


Out[4]:
MODE TTME INVC INVT GC HINC PSIZE
0 0 69 59 100 70 35 1
1 0 34 31 372 71 35 1
2 0 35 25 417 70 35 1
3 1 0 10 180 30 35 1
4 0 64 58 68 68 30 2

In [5]:
features = ['TTME', 'INVC']

In [6]:
df['TTME'].value_counts()


Out[6]:
0     210
53     94
35     87
69     77
34     77
64     75
44     70
30     27
45     19
15     17
10     16
40     11
60      9
20      9
25      8
50      7
5       7
75      6
90      5
55      2
99      2
16      1
80      1
85      1
2       1
1       1
Name: TTME, dtype: int64

In [23]:
df.sort_values(by='TTME', inplace=True)
df['TTME_diff'] = df['TTME'].diff()
df['TTME_diff'].fillna(0, inplace=True)
df.head()


Out[23]:
MODE TTME INVC INVT GC HINC PSIZE TTME_diff combine_these
419 1 0 20 1440 238 27 2 0.0 False
555 0 0 35 825 125 26 1 0.0 True
551 0 0 64 859 157 50 1 0.0 True
547 1 0 36 750 117 45 1 0.0 True
543 1 0 19 720 128 12 2 0.0 True

In [24]:
bins = df.groupby('TTME_diff')['TTME'].count() #try this instead of inverted index
bins


Out[24]:
TTME_diff
0.0    815
1.0      5
2.0      1
3.0      2
4.0      4
5.0     11
6.0      1
9.0      1
Name: TTME, dtype: int64

In [25]:
#then still want to combine adjacent bins if they're similar - need to define similar
bins.plot(kind='bar')


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f931908>

In [13]:
df.shape


Out[13]:
(840, 8)

In [26]:
stdev = df['TTME'].std()
stdev


Out[26]:
24.948607571393737

In [33]:
#flag if counts (from bins) < stdev
mask = bins < stdev #can't actually do it this way


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-33-3d7db5e4f866> in <module>()
      1 #flag if counts < stdev
      2 mask = bins < stdev
----> 3 df['combine_these'] = np.where(mask, True, False)

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   2417         else:
   2418             # set column
-> 2419             self._set_item(key, value)
   2420 
   2421     def _setitem_slice(self, key, value):

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   2483 
   2484         self._ensure_valid_index(value)
-> 2485         value = self._sanitize_column(key, value)
   2486         NDFrame._set_item(self, key, value)
   2487 

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast)
   2654 
   2655             # turn me into an ndarray
-> 2656             value = _sanitize_index(value, self.index, copy=False)
   2657             if not isinstance(value, (np.ndarray, Index)):
   2658                 if isinstance(value, list) and len(value) > 0:

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/series.py in _sanitize_index(data, index, copy)
   2796 
   2797     if len(data) != len(index):
-> 2798         raise ValueError('Length of values does not match length of ' 'index')
   2799 
   2800     if isinstance(data, PeriodIndex):

ValueError: Length of values does not match length of index

In [28]:
#use bins as reference to decide what bins to use and combine the rest - this example is very simple
bins.to_dict()


Out[28]:
{0.0: 815, 1.0: 5, 2.0: 1, 3.0: 2, 4.0: 4, 5.0: 11, 6.0: 1, 9.0: 1}

In [29]:
df.head()


Out[29]:
MODE TTME INVC INVT GC HINC PSIZE TTME_diff combine_these
419 1 0 20 1440 238 27 2 0.0 True
555 0 0 35 825 125 26 1 0.0 True
551 0 0 64 859 157 50 1 0.0 True
547 1 0 36 750 117 45 1 0.0 True
543 1 0 19 720 128 12 2 0.0 True

In [32]:
df.groupby(['TTME_diff', 'combine_these']).count()


Out[32]:
MODE TTME INVC INVT GC HINC PSIZE
TTME_diff combine_these
0.0 True 815 815 815 815 815 815 815
1.0 True 5 5 5 5 5 5 5
2.0 True 1 1 1 1 1 1 1
3.0 True 2 2 2 2 2 2 2
4.0 True 4 4 4 4 4 4 4
5.0 True 11 11 11 11 11 11 11
6.0 True 1 1 1 1 1 1 1
9.0 True 1 1 1 1 1 1 1

In [7]:
prob = Probabinerator(df, features[0])
prob.count_index()
prob.invind


Out[7]:
{1: [1, 2, 16, 85, 80],
 2: [99, 55],
 5: [90],
 6: [75],
 7: [5, 50],
 8: [25],
 9: [20, 60],
 11: [40],
 16: [10],
 17: [15],
 19: [45],
 27: [30],
 70: [44],
 75: [64],
 77: [69, 34],
 87: [35],
 94: [53],
 210: [0]}

In [8]:
count_target = sum(list(prob.invind.keys()))/3
count_target


Out[8]:
247.0

In [9]:
kv_list = sorted(list(prob.invind.copy().items()), key=lambda x: x[1][0])
kv_list


Out[9]:
[(210, [0]),
 (1, [1, 2, 16, 85, 80]),
 (7, [5, 50]),
 (16, [10]),
 (17, [15]),
 (9, [20, 60]),
 (8, [25]),
 (27, [30]),
 (87, [35]),
 (11, [40]),
 (70, [44]),
 (19, [45]),
 (94, [53]),
 (75, [64]),
 (77, [69, 34]),
 (6, [75]),
 (5, [90]),
 (2, [99, 55])]

In [10]:
prob.bin_combiner(toplot=True)
prob.newbins


Out[10]:
{246: 64.0, 329: 16.0}

In [11]:
prob.bin_combiner()
prob.bin_ranges #missing 16 between 1 and 2; missing 55 between 2 and 3
#need some logic to check for adjusting the ranges
#probably need to keep track of which ones are being used, in order to find the ones that aren't


Out[11]:
[[0, 35], [44, 90]]

In [12]:
prob.bin_combiner(toplot=True)
prob.plot_with_newbins()


Out[12]:
<ggplot: (272770859)>

In [13]:
df['TTME'].plot(kind='hist', bins=10)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x104275fd0>

In [14]:
df['TTME'].plot(kind='hist', bins=3)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f960908>

In [15]:
df['INVC'].plot(kind='hist', bins=10)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f7a1080>

In [16]:
prob = Probabinerator(df, features[1])
prob.count_index()
prob.invind


Out[16]:
{1: [2,
  3,
  55,
  63,
  68,
  79,
  80,
  95,
  99,
  101,
  106,
  114,
  117,
  118,
  119,
  124,
  125,
  126,
  127,
  129,
  138,
  142,
  143,
  145,
  147,
  148,
  152,
  165,
  180],
 2: [56, 77, 82, 83, 87, 97, 98, 100, 102, 112, 113, 115, 116, 120, 121, 151],
 3: [40, 41, 42, 53, 57, 72, 76, 78, 84, 86, 88, 93, 105, 109, 110],
 4: [65, 66, 73, 74, 85, 104],
 5: [6, 36, 51, 52, 69, 90, 92, 94, 96, 107, 111],
 6: [10, 20, 22, 23, 29, 37, 39, 49, 54, 62, 64, 67, 75, 81, 91],
 7: [4, 5, 9, 11, 27, 34, 48],
 8: [7, 8, 46],
 9: [28, 61, 71, 108],
 10: [15, 18, 47, 89, 103],
 11: [38],
 12: [16, 24, 50, 70],
 13: [12, 14, 21, 30, 45],
 14: [17, 60],
 15: [33, 43, 44],
 16: [19],
 17: [35],
 18: [26, 58],
 20: [32],
 24: [59],
 27: [25],
 33: [31],
 36: [13]}

In [17]:
prob.bin_combiner(bins=4, toplot=True)
prob.plot_with_newbins()


Out[17]:
<ggplot: (-9223372036570876412)>

In [ ]: