notebook.community

Edit and run



In [2]:

    
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from ggplot import *

from probabinerator import Probabinerator



In [2]:

    
#try it with a different data set, for public release 
# info is here: http://statsmodels.sourceforge.net/devel/datasets/generated/modechoice.html



In [3]:

    
df = pd.read_csv("http://people.stern.nyu.edu/wgreene/Text/Edition7/TableF18-2.csv")



In [4]:

    
df.head()



In [5]:

    
features = ['TTME', 'INVC']



In [6]:

    
df['TTME'].value_counts()









    Out[6]:





0     210
53     94
35     87
69     77
34     77
64     75
44     70
30     27
45     19
15     17
10     16
40     11
60      9
20      9
25      8
50      7
5       7
75      6
90      5
55      2
99      2
16      1
80      1
85      1
2       1
1       1
Name: TTME, dtype: int64



In [23]:

    
df.sort_values(by='TTME', inplace=True)
df['TTME_diff'] = df['TTME'].diff()
df['TTME_diff'].fillna(0, inplace=True)
df.head()









    Out[23]:






  
    
      
      MODE
      TTME
      INVC
      INVT
      GC
      HINC
      PSIZE
      TTME_diff
      combine_these
    
  
  
    
      419
      1
      0
      20
      1440
      238
      27
      2
      0.0
      False
    
    
      555
      0
      0
      35
      825
      125
      26
      1
      0.0
      True
    
    
      551
      0
      0
      64
      859
      157
      50
      1
      0.0
      True
    
    
      547
      1
      0
      36
      750
      117
      45
      1
      0.0
      True
    
    
      543
      1
      0
      19
      720
      128
      12
      2
      0.0
      True



In [24]:

    
bins = df.groupby('TTME_diff')['TTME'].count() #try this instead of inverted index
bins









    Out[24]:





TTME_diff
0.0    815
1.0      5
2.0      1
3.0      2
4.0      4
5.0     11
6.0      1
9.0      1
Name: TTME, dtype: int64



In [25]:

    
#then still want to combine adjacent bins if they're similar - need to define similar
bins.plot(kind='bar')









    Out[25]:





<matplotlib.axes._subplots.AxesSubplot at 0x10f931908>



In [13]:

    
df.shape









    Out[13]:





(840, 8)



In [26]:

    
stdev = df['TTME'].std()
stdev









    Out[26]:





24.948607571393737



In [33]:

    
#flag if counts (from bins) < stdev
mask = bins < stdev #can't actually do it this way









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-33-3d7db5e4f866> in <module>()
      1 #flag if counts < stdev
      2 mask = bins < stdev
----> 3 df['combine_these'] = np.where(mask, True, False)

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   2417         else:
   2418             # set column
-> 2419             self._set_item(key, value)
   2420 
   2421     def _setitem_slice(self, key, value):

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   2483 
   2484         self._ensure_valid_index(value)
-> 2485         value = self._sanitize_column(key, value)
   2486         NDFrame._set_item(self, key, value)
   2487 

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast)
   2654 
   2655             # turn me into an ndarray
-> 2656             value = _sanitize_index(value, self.index, copy=False)
   2657             if not isinstance(value, (np.ndarray, Index)):
   2658                 if isinstance(value, list) and len(value) > 0:

//anaconda/envs/auction_test/lib/python3.5/site-packages/pandas/core/series.py in _sanitize_index(data, index, copy)
   2796 
   2797     if len(data) != len(index):
-> 2798         raise ValueError('Length of values does not match length of ' 'index')
   2799 
   2800     if isinstance(data, PeriodIndex):

ValueError: Length of values does not match length of index



In [28]:

    
#use bins as reference to decide what bins to use and combine the rest - this example is very simple
bins.to_dict()









    Out[28]:





{0.0: 815, 1.0: 5, 2.0: 1, 3.0: 2, 4.0: 4, 5.0: 11, 6.0: 1, 9.0: 1}



In [29]:

    
df.head()









    Out[29]:






  
    
      
      MODE
      TTME
      INVC
      INVT
      GC
      HINC
      PSIZE
      TTME_diff
      combine_these
    
  
  
    
      419
      1
      0
      20
      1440
      238
      27
      2
      0.0
      True
    
    
      555
      0
      0
      35
      825
      125
      26
      1
      0.0
      True
    
    
      551
      0
      0
      64
      859
      157
      50
      1
      0.0
      True
    
    
      547
      1
      0
      36
      750
      117
      45
      1
      0.0
      True
    
    
      543
      1
      0
      19
      720
      128
      12
      2
      0.0
      True



In [32]:

    
df.groupby(['TTME_diff', 'combine_these']).count()









    Out[32]:






  
    
      
      
      MODE
      TTME
      INVC
      INVT
      GC
      HINC
      PSIZE
    
    
      TTME_diff
      combine_these
      
      
      
      
      
      
      
    
  
  
    
      0.0
      True
      815
      815
      815
      815
      815
      815
      815
    
    
      1.0
      True
      5
      5
      5
      5
      5
      5
      5
    
    
      2.0
      True
      1
      1
      1
      1
      1
      1
      1
    
    
      3.0
      True
      2
      2
      2
      2
      2
      2
      2
    
    
      4.0
      True
      4
      4
      4
      4
      4
      4
      4
    
    
      5.0
      True
      11
      11
      11
      11
      11
      11
      11
    
    
      6.0
      True
      1
      1
      1
      1
      1
      1
      1
    
    
      9.0
      True
      1
      1
      1
      1
      1
      1
      1



In [7]:

    
prob = Probabinerator(df, features[0])
prob.count_index()
prob.invind









    Out[7]:





{1: [1, 2, 16, 85, 80],
 2: [99, 55],
 5: [90],
 6: [75],
 7: [5, 50],
 8: [25],
 9: [20, 60],
 11: [40],
 16: [10],
 17: [15],
 19: [45],
 27: [30],
 70: [44],
 75: [64],
 77: [69, 34],
 87: [35],
 94: [53],
 210: [0]}



In [8]:

    
count_target = sum(list(prob.invind.keys()))/3
count_target









    Out[8]:





247.0



In [9]:

    
kv_list = sorted(list(prob.invind.copy().items()), key=lambda x: x[1][0])
kv_list









    Out[9]:





[(210, [0]),
 (1, [1, 2, 16, 85, 80]),
 (7, [5, 50]),
 (16, [10]),
 (17, [15]),
 (9, [20, 60]),
 (8, [25]),
 (27, [30]),
 (87, [35]),
 (11, [40]),
 (70, [44]),
 (19, [45]),
 (94, [53]),
 (75, [64]),
 (77, [69, 34]),
 (6, [75]),
 (5, [90]),
 (2, [99, 55])]



In [10]:

    
prob.bin_combiner(toplot=True)
prob.newbins









    Out[10]:





{246: 64.0, 329: 16.0}



In [11]:

    
prob.bin_combiner()
prob.bin_ranges #missing 16 between 1 and 2; missing 55 between 2 and 3
#need some logic to check for adjusting the ranges
#probably need to keep track of which ones are being used, in order to find the ones that aren't









    Out[11]:





[[0, 35], [44, 90]]



In [12]:

    
prob.bin_combiner(toplot=True)
prob.plot_with_newbins()









    












    Out[12]:





<ggplot: (272770859)>



In [13]:

    
df['TTME'].plot(kind='hist', bins=10)









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x104275fd0>



In [14]:

    
df['TTME'].plot(kind='hist', bins=3)









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x10f960908>



In [15]:

    
df['INVC'].plot(kind='hist', bins=10)









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x10f7a1080>



In [16]:

    
prob = Probabinerator(df, features[1])
prob.count_index()
prob.invind









    Out[16]:





{1: [2,
  3,
  55,
  63,
  68,
  79,
  80,
  95,
  99,
  101,
  106,
  114,
  117,
  118,
  119,
  124,
  125,
  126,
  127,
  129,
  138,
  142,
  143,
  145,
  147,
  148,
  152,
  165,
  180],
 2: [56, 77, 82, 83, 87, 97, 98, 100, 102, 112, 113, 115, 116, 120, 121, 151],
 3: [40, 41, 42, 53, 57, 72, 76, 78, 84, 86, 88, 93, 105, 109, 110],
 4: [65, 66, 73, 74, 85, 104],
 5: [6, 36, 51, 52, 69, 90, 92, 94, 96, 107, 111],
 6: [10, 20, 22, 23, 29, 37, 39, 49, 54, 62, 64, 67, 75, 81, 91],
 7: [4, 5, 9, 11, 27, 34, 48],
 8: [7, 8, 46],
 9: [28, 61, 71, 108],
 10: [15, 18, 47, 89, 103],
 11: [38],
 12: [16, 24, 50, 70],
 13: [12, 14, 21, 30, 45],
 14: [17, 60],
 15: [33, 43, 44],
 16: [19],
 17: [35],
 18: [26, 58],
 20: [32],
 24: [59],
 27: [25],
 33: [31],
 36: [13]}



In [17]:

    
prob.bin_combiner(bins=4, toplot=True)
prob.plot_with_newbins()









    












    Out[17]:





<ggplot: (-9223372036570876412)>



In [ ]:

	MODE	TTME	INVC	INVT	GC	HINC	PSIZE
0	0	69	59	100	70	35	1
1	0	34	31	372	71	35	1
2	0	35	25	417	70	35	1
3	1	0	10	180	30	35	1
4	0	64	58	68	68	30	2

	MODE	INVC	INVT	GC	HINC	PSIZE	combine_these
419	1	20	1440	238	27	2	False
555	0	35	825	125	26	1	True
551	0	64	859	157	50	1	True
547	1	36	750	117	45	1	True
543	1	19	720	128	12	2	True

		MODE	TTME	INVC	INVT	GC	HINC	PSIZE
TTME_diff	combine_these
0.0	True	815	815	815	815	815	815	815
1.0	True	5	5	5	5	5	5	5
2.0	True	1	1	1	1	1	1	1
3.0	True	2	2	2	2	2	2	2
4.0	True	4	4	4	4	4	4	4
5.0	True	11	11	11	11	11	11	11
6.0	True	1	1	1	1	1	1	1
9.0	True	1	1	1	1	1	1	1

	MODE	TTME	INVC	INVT	GC	HINC	PSIZE
0	0	69	59	100	70	35	1
1	0	34	31	372	71	35	1
2	0	35	25	417	70	35	1
3	1	0	10	180	30	35	1
4	0	64	58	68	68	30	2

	MODE	TTME	INVC	INVT	GC	HINC	PSIZE
0	0	69	59	100	70	35	1
1	0	34	31	372	71	35	1
2	0	35	25	417	70	35	1
3	1	0	10	180	30	35	1
4	0	64	58	68	68	30	2