In [14]:
%matplotlib inline
import hinc
import thinkstats2 as ts
import thinkplot as tp
import numpy as np
import pandas as pd
import chap06soln
import density
# Load the household income table; later cells read its `income` and `freq` columns.
df = hinc.ReadData()
In [15]:
# Preview the first rows of the loaded table.
df.head()
Out[15]:
In [16]:
# Build a sample of log10 household incomes by interpolating within each income range.
def InterpolateSample(df, log_upper=6.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    Side effect: the columns `log_upper` and `log_lower` are added to (or
    overwritten on) `df`, holding the log10 bounds used for each range.

    df: DataFrame with columns income and freq
    log_upper: log10 of the assumed upper bound for the highest range

    returns: NumPy array of log10 household income (empty if df has no rows)
    """
    # compute the log10 of the upper bound for each range; np.log10 returns a
    # fresh array, so editing it below never touches the income column
    upper = np.log10(df['income'].to_numpy(dtype=float))
    lower = np.empty_like(upper)

    if len(upper) > 0:
        # plug in a value for the unknown upper bound of the highest range;
        # addressed by position so any number of rows and any index works
        upper[-1] = log_upper

        # each range starts where the previous one ends; the first range is
        # assumed to start at 10**3
        lower[0] = 3.0
        lower[1:] = upper[:-1]

    # whole-column assignment instead of chained indexing
    # (df.col[i] = value), which is unreliable under pandas copy-on-write
    df['log_upper'] = upper
    df['log_lower'] = lower

    if len(upper) == 0:
        return np.array([], dtype=float)

    # use the freq column to generate the right number of values in each
    # range; linspace needs an integer count, and freq may arrive as a float
    arrays = [
        np.linspace(low, high, int(count))
        for low, high, count in zip(lower, upper, df['freq'])
    ]

    # collect the arrays into a single sample
    return np.concatenate(arrays)
In [17]:
# Generate the log10 income sample, assuming the top range ends at 10**6.
log_sample = InterpolateSample(df, log_upper=6.0)
In [18]:
# Show the generated log10 sample as the cell output.
log_sample
Out[18]:
In [19]:
# Plot the CDF of the sample. The values are log10 of income, so the x-axis
# is labelled in log10 units rather than as raw income.
log_cdf = ts.Cdf(log_sample)
tp.Cdf(log_cdf)
tp.show(xlabel='log10 household income',
        ylabel='CDF')
In [20]:
# Undo the log transform: raising 10 to each log10 value gives a sample in
# the original income units.
sample = np.power(10, log_sample)
# Summarize returns two values, unpacked here as mean and median. The helper
# lives in the project's density module, so what else it computes or prints
# is not visible from this notebook -- TODO confirm.
mean, median = density.Summarize(sample)
In [ ]:
# Build a Cdf object from the back-transformed sample (income units, not log10).
cdf = ts.Cdf(sample)
In [ ]: