notebook.community

Edit and run



In [14]:

    
%matplotlib inline
import hinc
import thinkstats2 as ts
import thinkplot as tp
import numpy as np
import pandas as pd
import chap06soln
import density

df = hinc.ReadData()



In [15]:

    
df.head()



In [16]:

    
# interpolate the findings function
def InterpolateSample(df, log_upper=6.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    df: DataFrame with columns income and freq
    log_upper: log10 of the assumed upper bound for the highest range

    returns: NumPy array of log10 household income
    """
    # compute the log10 of the upper bound for each range
    df['log_upper'] = np.log10(df.income)

    # get the lower bounds by shifting the upper bound and filling in
    # the first element
    df['log_lower'] = df.log_upper.shift(1)
    df.log_lower[0] = 3.0

    # plug in a value for the unknown upper bound of the highest range
    df.log_upper[41] = log_upper

    # use the freq column to generate the right number of values in
    # each range
    arrays = []
    for _, row in df.iterrows():
        vals = np.linspace(row.log_lower, row.log_upper, row.freq)
        arrays.append(vals)

    # collect the arrays into a single sample
    log_sample = np.concatenate(arrays)
    return log_sample



In [17]:

    
log_sample = InterpolateSample(df, log_upper=6.0)









    



/Users/johnkeating/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/johnkeating/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [18]:

    
log_sample









    Out[18]:





array([ 3.        ,  3.00016628,  3.00033256, ...,  5.99958621,
        5.99979311,  6.        ])



In [19]:

    
log_cdf = ts.Cdf(log_sample)
tp.Cdf(log_cdf)
tp.show(xlabel='household income',
       ylabel='CDF')









    












    





<matplotlib.figure.Figure at 0x10ae59990>



In [20]:

    
# now this is the bit I don't understand

sample = np.power(10, log_sample)
mean, median = density.Summarize(sample) #this removes the log and it provides the density for each









    



mean 74278.7075312
std 93946.9299635
median 51226.4544789
skewness 4.94992024443
pearson skewness 0.736125801914



In [ ]:

    
cdf = ts.Cdf(sample)



In [ ]:

	income	freq	cumsum	ps
0	4999	4204	4204	0.034330
1	9999	4729	8933	0.072947
2	14999	6982	15915	0.129963
3	19999	7157	23072	0.188407
4	24999	7131	30203	0.246640