In [2]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from ggplot import *
from probabinerator import Probabinerator
In [2]:
# Try the approach on a different, publicly available data set so the notebook can be released.
# Data set description: http://statsmodels.sourceforge.net/devel/datasets/generated/modechoice.html
In [3]:
# Location of the CSV that backs the rest of the notebook; fetched over HTTP at run time.
DATA_URL = "http://people.stern.nyu.edu/wgreene/Text/Edition7/TableF18-2.csv"
df = pd.read_csv(DATA_URL)
In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()
Out[4]:
In [5]:
# Columns of df that are examined below: index 0 is used first, index 1 later.
features = [
    'TTME',
    'INVC',
]
In [6]:
# How many rows carry each distinct TTME value.
df['TTME'].value_counts()
Out[6]:
In [23]:
# Order the rows by TTME so that consecutive rows can be differenced.
# (Kept in place so that any existing reference to `df` sees the sorted frame too.)
df.sort_values(by='TTME', inplace=True)

# Gap between each TTME value and the one before it in sorted order. The first row
# has no predecessor, so its NaN gap is replaced with 0.
# The filled series is assigned back explicitly: calling
# `df['TTME_diff'].fillna(0, inplace=True)` operates on an intermediate selection,
# which emits a chained-assignment warning on recent pandas and leaves `df`
# unchanged once Copy-on-Write is active.
df['TTME_diff'] = df['TTME'].diff().fillna(0)

df.head()
Out[23]:
In [24]:
# Alternative to the inverted index: group the TTME column by the size of the gap
# to the previous value and count the rows that fall into each gap size.
ttme_by_gap = df.groupby('TTME_diff')['TTME']
bins = ttme_by_gap.count()
bins
Out[24]:
In [25]:
# TODO: adjacent bins should still be combined when they are similar;
# "similar" has yet to be defined.
bins.plot.bar()
Out[25]:
In [13]:
# (rows, columns) of the working frame.
df.shape
Out[13]:
In [26]:
# Standard deviation of the TTME column (pandas default settings); used as the
# threshold in the mask cell.
stdev = df['TTME'].std()
stdev
Out[26]:
In [33]:
# Flag the gap sizes whose row count (from `bins`) is below the TTME standard deviation.
# NOTE: this compares a count with a standard deviation of TTME values, so the
# threshold is not meaningful as written — the author's own caveat is kept below.
mask = bins < stdev #can't actually do it this way
In [28]:
# Use `bins` as the reference for deciding which bins to keep and which to combine;
# this example is very simple. Shown here as a {TTME_diff: count} dictionary.
bins.to_dict()
Out[28]:
In [29]:
# Display the first rows of df again.
df.head()
Out[29]:
In [32]:
# Non-null counts of the remaining columns for each (TTME_diff, combine_these) pair.
# NOTE(review): no cell visible here creates a 'combine_these' column, so on a fresh
# kernel this groupby is expected to fail with a KeyError. Presumably the column was
# produced by a cell that has since been removed — confirm and restore that step.
df.groupby(['TTME_diff', 'combine_these']).count()
Out[32]:
In [7]:
# Set up a Probabinerator on df for the first feature ('TTME').
prob = Probabinerator(df, features[0])
# Presumably builds the inverted index exposed as prob.invind — confirm in probabinerator.
prob.count_index()
# Display the index.
prob.invind
Out[7]:
In [8]:
# Sum of the keys of prob.invind, split three ways.
index_keys = prob.invind.keys()
count_target = sum(index_keys) / 3
count_target
Out[8]:
In [9]:
def _first_of_value(pair):
    """Sort key for a (key, value) pair: the first element of the value."""
    return pair[1][0]


# (key, value) pairs of prob.invind in ascending order of each value's first element.
# sorted() already returns a new list, so the index itself is left untouched.
kv_list = sorted(prob.invind.items(), key=_first_of_value)
kv_list
Out[9]:
In [10]:
# Run the bin-combining step; toplot=True presumably also draws the result — confirm
# in probabinerator.
prob.bin_combiner(toplot=True)
# Display the combined bins stored on the instance.
prob.newbins
Out[10]:
In [11]:
# Re-run the combiner with its default arguments and inspect the ranges it stores.
prob.bin_combiner()
# FIXME: the displayed ranges have gaps — missing 16 between 1 and 2; missing 55 between 2 and 3.
prob.bin_ranges
# TODO: need some logic to check for and adjust the ranges.
# Probably need to keep track of which values are being used, in order to find the
# ones that aren't.
Out[11]:
In [12]:
# Run the combiner again with toplot=True, then plot with the new bins.
# NOTE(review): bin_combiner is also called in the two cells above; whether repeated
# calls are idempotent is not visible from here — confirm.
prob.bin_combiner(toplot=True)
prob.plot_with_newbins()
Out[12]:
In [13]:
# Reference histogram of TTME with 10 equal-width bins.
df['TTME'].plot.hist(bins=10)
Out[13]:
In [14]:
# Same TTME histogram with only 3 equal-width bins.
df['TTME'].plot.hist(bins=3)
Out[14]:
In [15]:
# Reference histogram of INVC with 10 equal-width bins.
df['INVC'].plot.hist(bins=10)
Out[15]:
In [16]:
# Repeat the setup for the second feature ('INVC'); this rebinds `prob`, replacing
# the instance built for the first feature.
prob = Probabinerator(df, features[1])
# Presumably builds the inverted index exposed as prob.invind — confirm in probabinerator.
prob.count_index()
# Display the index.
prob.invind
Out[16]:
In [17]:
# Combine bins for INVC, passing bins=4 (presumably the number of bins to produce —
# confirm in probabinerator) with toplot=True, then plot with the new bins.
prob.bin_combiner(bins=4, toplot=True)
prob.plot_with_newbins()
Out[17]:
In [ ]: