In [14]:
%matplotlib inline
import hinc
import thinkstats2 as ts
import thinkplot as tp
import numpy as np
import pandas as pd
import chap06soln
import density

df = hinc.ReadData()

In [15]:
df.head()


Out[15]:
income freq cumsum ps
0 4999 4204 4204 0.034330
1 9999 4729 8933 0.072947
2 14999 6982 15915 0.129963
3 19999 7157 23072 0.188407
4 24999 7131 30203 0.246640

In [16]:
# interpolate the findings function
def InterpolateSample(df, log_upper=6.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    df: DataFrame with columns income and freq
    log_upper: log10 of the assumed upper bound for the highest range

    returns: NumPy array of log10 household income
    """
    # compute the log10 of the upper bound for each range
    df['log_upper'] = np.log10(df.income)

    # get the lower bounds by shifting the upper bound and filling in
    # the first element
    df['log_lower'] = df.log_upper.shift(1)
    df.log_lower[0] = 3.0

    # plug in a value for the unknown upper bound of the highest range
    df.log_upper[41] = log_upper

    # use the freq column to generate the right number of values in
    # each range
    arrays = []
    for _, row in df.iterrows():
        vals = np.linspace(row.log_lower, row.log_upper, row.freq)
        arrays.append(vals)

    # collect the arrays into a single sample
    log_sample = np.concatenate(arrays)
    return log_sample

In [17]:
log_sample = InterpolateSample(df, log_upper=6.0)


/Users/johnkeating/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/johnkeating/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [18]:
log_sample


Out[18]:
array([ 3.        ,  3.00016628,  3.00033256, ...,  5.99958621,
        5.99979311,  6.        ])

In [19]:
log_cdf = ts.Cdf(log_sample)
tp.Cdf(log_cdf)
tp.show(xlabel='household income',
       ylabel='CDF')


<matplotlib.figure.Figure at 0x10ae59990>

In [20]:
# now this is the bit I don't understand

sample = np.power(10, log_sample)
mean, median = density.Summarize(sample) #this removes the log and it provides the density for each


mean 74278.7075312
std 93946.9299635
median 51226.4544789
skewness 4.94992024443
pearson skewness 0.736125801914

In [ ]:
cdf = ts.Cdf(sample)

In [ ]: