In [6]:
%matplotlib inline
In [7]:
from IPython.core.display import HTML
css_file = './inet.css'
HTML(open(css_file, "r").read())
Out[7]:
In [8]:
import matplotlib.pyplot as plt
from matplotlib import collections as mc
plt.xkcd()
import numpy as np
import pandas as pd
import seaborn as sn
import statsmodels.formula.api as smf
from IPython.html import widgets
from IPython.display import display
import pypwt
A: Drawing a "line" through a scatter of data points in a principled way...
A: Drawing a straight "line" through a scatter of data points in a principled way...
In [9]:
# grab the entire Penn World Tables data from the web...
pwt = pypwt.load_pwt_data()
In [11]:
#...this gives us a panel (i.e., two dimensional) data set
pwt
Out[11]:
In [23]:
def labor_supply(data, year="1950-01-01"):
"""
Labor supply in a given year is the product of number of employed
persons, 'emp', and the average number of hours worked, 'avh'.
Parameters
----------
data : pandas.Panel
The Penn World Tables (PWT) data set.
year : str (default="1950-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
Returns
-------
L : pandas.Series
Effective labor supply in units of employed person-years.
"""
L = data.minor_xs(year)["emp"] * data.minor_xs(year)["avh"]
return L
def real_gdp_per_unit_labor(data, year="1950-01-01"):
"""
Real gross domestic product (GDP) per unit labor is the ratio of
some measure of real output and some measure of labor supply.
Parameters
----------
data : pandas.Panel
The Penn World Tables (PWT) data set.
year : str (default="1950-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
Returns
-------
rgdppul : pandas.Series
Real gdp per unit labor supply.
"""
rgdppul = data.minor_xs(year)["rgdpo"] / labor_supply(data, year)
return rgdppul
def growth_rate_real_gdp_per_unit_labor(data, start="1950-01-01", end="2011-01-01"):
"""
Plot the growth rate of real GDP per unit labor over some time period
against the level of real GDP per unit labor at the start of the time
period.
Parameters
----------
data : pandas.Panel
The Penn World Tables (PWT) data set.
start : str (default="1950-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
end : str (default="2011-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
"""
gr = (np.log(real_gdp_per_unit_labor(data, end)) -
np.log(real_gdp_per_unit_labor(data, start)))
return gr
def some_interesting_plot(data, start="1950-01-01", end="2011-01-01"):
"""
Plot the growth rate of real GDP per unit labor over some time period
against the level of real GDP per unit labor at the start of the time
period.
Parameters
----------
data : pandas.Panel
The Penn World Tables (PWT) data set.
start : str (default="1950-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
end : str (default="2011-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
"""
# create the scatter plot
fig, ax = plt.subplots(1, 1, figsize=(12,9))
xs = np.log(real_gdp_per_unit_labor(data, start))
ys = growth_rate_real_gdp_per_unit_labor(data, start, end)
ax.scatter(xs, ys, color='k')
# axis labels, title, etc
ax.set_xlabel('Log income (per unit labor) in 1960', fontsize=25)
ax.set_xlim(0.95 * xs.min(), 1.05 * xs.max())
ax.set_ylabel('Income (per unit labor) growth\n({}-{})'.format(start[:4], end[:4]),
fontsize=25)
ax.set_ylim(1.05 * ys.min(), 1.05 * ys.max())
ax.set_title('Do poor countries grow faster than rich countries?',
fontsize=25, family="serif")
Let's focus on a relation between...
...note that income is "per unit labor supply"
In [24]:
some_interesting_plot(data=pwt, start="1960-01-01", end="2010-01-01")
In [26]:
def another_interesting_plot(data, start, end, intercept=2.5, slope=-0.5):
"""
Plot the growth rate of real GDP per unit labor over some time period
against the level of real GDP per unit labor at the start of the time
period. Then add a regression line associated with given values for the
intercept and slope.
Parameters
----------
data : pandas.Panel
The Penn World Tables (PWT) data set.
start : str (default="1950-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
end : str (default="2011-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
intercept: float (defalut=2.5)
Intercept for the regression line.
slope : float (default=-0.5)
Slope for the regression line.
"""
# create the scatter plot
fig, ax = plt.subplots(1, 1, figsize=(12,9))
xs = np.log(real_gdp_per_unit_labor(data, start))
ys = growth_rate_real_gdp_per_unit_labor(data, start, end)
ax.scatter(xs, ys, color='k')
# compute the regression line given params
grid = np.linspace(0.95 * xs.min(), 1.05 * xs.max(), 1000)
predicted = lambda x: intercept + slope * x
yhat, = ax.plot(grid, predicted(grid) , color='b',
label=r"$\hat{y}_i=%.2f + %.2fx_i$" %(intercept, slope))
# axis labels, title, etc
ax.set_xlabel('Log income (per unit labor) in 1960', fontsize=25)
ax.set_xlim(0.95 * xs.min(), 1.05 * xs.max())
ax.set_ylabel('Income (per unit labor) growth\n({}-{})'.format(start[:4], end[:4]),
fontsize=25)
ax.set_ylim(1.05 * ys.min(), 1.05 * ys.max())
ax.legend(bbox_to_anchor=(1.0, 0.95), prop={'size': 25})
Since we are drawing straight lines...
$$y_i = \beta_0 + \beta_1 x_i + \epsilon_i$$where
\begin{align} y_i=& \text{income growth from 1960-2010} \\ x_i=& \ln (\text{income in 1960}) \end{align}and $i=\text{AGO},\dots,\text{ZWE}$.
In [27]:
another_interesting_plot(pwt, start="1960-01-01", end="2010-01-01")
Given our choice of model...
$$ y_i = \beta_0 + \beta_1 x_i + \epsilon_i,\ i=1,\dots,N $$...focus on the sum of squared errors (SSE):
$$ SSE \equiv \sum_{i=1}^N \epsilon_i^2 = \sum_{i=1}^N \big(y_i - (\beta_0 + \beta_1x_i)\big)^2 $$
In [30]:
def another_static_plot(data, start, end, intercept=2.5, slope=-0.5):
"""
Plot the growth rate of real GDP per unit labor over some time period
against the level of real GDP per unit labor at the start of the time
period. Then add a regression line associated with given values for the
intercept and slope.
Parameters
----------
data : pandas.Panel
The Penn World Tables (PWT) data set.
start : str (default="1950-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
end : str (default="2011-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
intercept: float (default=2.5)
Intercept for the regression line.
slope : float (default=-0.5)
Slope for the regression line.
"""
# create the scatter plot
fig, ax = plt.subplots(1, 1, figsize=(12,9))
xs = np.log(real_gdp_per_unit_labor(data, start))
ys = growth_rate_real_gdp_per_unit_labor(data, start, end)
ax.scatter(xs, ys, color='k')
# compute and plot the regression line given current params
predicted = lambda x: intercept + slope * x
grid = np.linspace(0.95 * xs.min(), 1.05 * xs.max(), 1000)
yhat, = ax.plot(grid, predicted(grid) , color='b')
ssr = np.sum((predicted(xs) - ys)**2)
# add the residuals to the plot
lines = zip(zip(xs, ys), zip(xs, predicted(xs)))
lc = mc.LineCollection(lines, colors='r', linewidths=2)
ax.add_collection(lc)
# axis labels, title, etc
ax.set_xlabel('Log income (per unit labor) in 1960', fontsize=25)
ax.set_xlim(0.95 * xs.min(), 1.05 * xs.max())
ax.set_ylabel('Income (per unit labor) growth\n({}-{})'.format(start[:4], end[:4]),
fontsize=25)
fig.suptitle(r'Sum of squared errors (SSE) $\equiv \sum_{i=1}^N (y_i - (\beta_0 + \beta_1x_i))^2$',
x=0.5, y=1.025, fontsize=25, family="serif")
fig.legend([yhat, lc], [r"$\hat{y}_i=\hat{\beta}_0 + \hat{\beta}_1x_i$", r"$SSE={0:.4f}$".format(ssr)],
bbox_to_anchor=(0.8, 0.85), prop={'size': 25})
# create the interactive widget
intercept_widget = widgets.FloatSlider(value=2.5e0, min=0.0, max=5.0, step=5e-2, description=r"$\hat{\beta}_0$")
slope_widget = widgets.FloatSlider(value=-0.5, min=-1.0, max=1.0, step=5e-2, description=r"$\hat{\beta}_1$")
some_interactive_plot = widgets.interactive(another_static_plot,
data=widgets.fixed(pwt),
start=widgets.fixed("1960-01-01"),
end=widgets.fixed("2010-01-01"),
intercept=intercept_widget,
slope=slope_widget,
)
In [31]:
display(some_interactive_plot)
Our estimates $\hat{\beta}_0$ and $\hat{\beta}_1$ solve the following optimization problem...
$$ \min_{\beta_0, \beta_1} \sum_{i=1}^N (y_i - (\beta_0 + \beta_1x_i))^2 $$Taking of derivatives yields a system of first-order conditions...
...which after some algebra yields...
Intuition for $\hat{\beta}_1$...
\begin{align} \hat{\beta}_1 =& \frac{Cov(x, y)}{Var(x)} \\ \equiv& \frac{\text{How much do $x$ and $y$ vary together?}}{\text{How much does $x$ vary in general?}} \end{align}
In [13]:
# compute the optimal parameter estimates "by hand"
estimated_slope = xs.cov(ys) / xs.var()
estimated_intercept = ys.mean() - estimated_slope * xs.mean()
In [14]:
print "Estimated intercept: {}".format(estimated_intercept)
print "Estimated slope: {}".format(estimated_slope)
In [45]:
# define the variables
xs = np.log(real_gdp_per_unit_labor(pwt, '1960-01-01'))
ys = growth_rate_real_gdp_per_unit_labor(pwt, '1960-01-01', '2010-01-01')
# compute and plot the regression line given current params
data = pd.DataFrame.from_dict({'x': xs, 'y': ys})
lm = smf.ols(formula="y ~ x", data=data).fit()
In [55]:
lm.summary()
Out[55]:
In [53]:
def plot_fitted_model(data, start="1950-01-01", end="2011-01-01"):
"""
Plot the growth rate of real GDP per unit labor over some time period
against the level of real GDP per unit labor at the start of the time
period. Then add a regression line associated with given values for the
intercept and slope.
Parameters
----------
data : pandas.Panel
The Penn World Tables (PWT) data set.
start : str (default="1950-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
end : str (default="2011-01-01")
Some year between 1950 and 2011. Format should be "YYYY-01-01".
"""
# create the scatter plot
fig, ax = plt.subplots(1, 1, figsize=(12,9))
xs = np.log(real_gdp_per_unit_labor(data, start))
ys = (np.log(real_gdp_per_unit_labor(data, end)) -
np.log(real_gdp_per_unit_labor(data, start)))
ax.scatter(xs, ys, color='k')
# compute and plot the regression line given current params
data = pd.DataFrame.from_dict({'x': xs, 'y': ys})
lm = smf.ols(formula="y ~ x", data=data).fit()
predicted = lambda x: lm.params[0] + lm.params[1] * x
grid = np.linspace(0.95 * xs.min(), 1.05 * xs.max(), 1000)
yhat, = ax.plot(grid, predicted(grid) , color='b')
ssr = np.sum((predicted(xs) - ys)**2)
# add the residuals to the plot
lines = zip(zip(xs, ys), zip(xs, predicted(xs)))
lc = mc.LineCollection(lines, colors='r', linewidths=2)
ax.add_collection(lc)
# axis labels, title, etc
ax.set_xlabel('Log income (per unit labor) in 1960', fontsize=25)
ax.set_xlim(0.95 * xs.min(), 1.05 * xs.max())
ax.set_ylabel('Income (per unit labor) growth\n({}-{})'.format(start[:4], end[:4]),
fontsize=25)
ax.set_ylim(1.05 * ys.min(), 1.05 * ys.max())
fig.legend([yhat, lc], [r"$\hat{y}_i=%.4f + %.4f x_i$" % (lm.params[0], lm.params[1]),
r"$SSR={0:.4f}$".format(ssr)],
bbox_to_anchor=(0.85, 0.85), prop={'size': 25})
In [54]:
plot_fitted_model(pwt, '1960-01-01', '2010-01-01')
Yes...
...but...
What about...