In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as ss
# Show figures inline
%matplotlib inline
sns.set_context('notebook')
# Nicer output of Pandas dataframes and NumPy arrays
pd.set_option('display.float_format', '{:8.2f}'.format)
np.set_printoptions(precision=3, suppress=True)
The data (rabbit age and eye lens weight) are available here: http://www.statsci.org/data/oz/rabbit.html
In [2]:
url = 'http://www.statsci.org/data/oz/rabbit.txt'
response = requests.get(url)
path = '../data/rabbit.txt'
with open(path, 'wb') as file:
    file.write(response.content)
df = pd.read_csv(path, sep='\t')
print(df.head())
In [3]:
X, Y = np.array(df['Age']), np.array(df['Lens'])
plt.scatter(X, Y)
plt.xlabel('Age')
plt.ylabel('Lens')
plt.show()
In [4]:
N = 100
U = np.linspace(X.min(), X.max(), N)
fxhat1 = ss.gaussian_kde(X, 'silverman')
fxhat2 = ss.gaussian_kde(X, .2)
plt.plot(U, fxhat1(U), label='Silverman')
plt.plot(U, fxhat2(U), label='Undersmoothed')
plt.xlabel('$x$')
plt.ylabel(r'$\hat{f}(x)$')
plt.legend()
plt.show()
Truncated (Uniform): $k_{0}\left(u\right)=\frac{1}{2}1\left(\left|u\right|\leq1\right)$
Epanechnikov: $k_{1}\left(u\right)=\frac{3}{4}\left(1-u^{2}\right)1\left(\left|u\right|\leq1\right)$
Biweight: $k_{2}\left(u\right)=\frac{15}{16}\left(1-u^{2}\right)^{2}1\left(\left|u\right|\leq1\right)$
Triweight: $k_{3}\left(u\right)=\frac{35}{32}\left(1-u^{2}\right)^{3}1\left(\left|u\right|\leq1\right)$
Gaussian: $k_{\phi}\left(u\right)=\frac{1}{\sqrt{2\pi}}\exp\left(-\frac{1}{2}u^2\right)$
In [5]:
def indicator(x):
    # 1(|u| <= 1) as a float array
    return (np.abs(x) <= 1.).astype(float)

def kernel(x, ktype='Truncated'):
    if ktype == 'Truncated':
        return .5 * indicator(x)
    if ktype == 'Epanechnikov':
        return 3./4. * (1 - x**2) * indicator(x)
    if ktype == 'Biweight':
        return 15./16. * (1 - x**2)**2 * indicator(x)
    if ktype == 'Triweight':
        return 35./32. * (1 - x**2)**3 * indicator(x)
    if ktype == 'Gaussian':
        return 1./np.sqrt(2. * np.pi) * np.exp(-.5 * x**2)
def roughness(ktype='Truncated'):
    # Kernel roughness R(k) = integral of k(u)^2 du
    if ktype == 'Truncated':
        return 1./2.
    if ktype == 'Epanechnikov':
        return 3./5.
    if ktype == 'Biweight':
        return 5./7.
    if ktype == 'Triweight':
        return 350./429.
    if ktype == 'Gaussian':
        return np.pi**(-.5)/2.

def sigmak(ktype='Truncated'):
    # Kernel second moment sigma_k^2 = integral of u^2 k(u) du
    if ktype == 'Truncated':
        return 1./3.
    if ktype == 'Epanechnikov':
        return 1./5.
    if ktype == 'Biweight':
        return 1./7.
    if ktype == 'Triweight':
        return 1./9.
    if ktype == 'Gaussian':
        return 1.
x = np.linspace(0., 2., 100)
names = ['Truncated', 'Epanechnikov', 'Biweight', 'Triweight', 'Gaussian']
for name in names:
    plt.plot(x, kernel(x, ktype=name), label=name, lw=2)
plt.legend()
plt.show()
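Before using these constants downstream, a quick numerical sanity check helps: each kernel should integrate to one, and the values hard-coded in roughness and sigmak should agree with the corresponding integrals of the kernel. A minimal sketch using only the functions defined above:

# Numerical sanity check: mass, roughness, and second moment of each kernel
u = np.linspace(-5., 5., 2001)
du = u[1] - u[0]
for name in names:
    k = kernel(u, ktype=name)
    mass = (k * du).sum()          # should be close to 1
    rough = (k**2 * du).sum()      # should match roughness(name)
    sig2 = (u**2 * k * du).sum()   # should match sigmak(name)
    print('%-13s mass=%.3f  R(k)=%.3f (%.3f)  sigma_k^2=%.3f (%.3f)'
          % (name, mass, rough, roughness(name), sig2, sigmak(name)))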
In [6]:
def weight(U, X, h=.1, ktype='Truncated'):
    # X - N-array of observations
    # U - M-array of evaluation points
    # XmU - M*N-array of scaled differences (X_j - u_i) / h
    XmU = (X - np.atleast_2d(U).T) / h
    # K - M*N-array of kernel values
    K = kernel(XmU, ktype)
    # Normalize each row so the weights sum to one
    # (K.T / K.sum(1)).T - M*N-array
    return (K.T / K.sum(1)).T

def NW(U, X, Y, h=.1, ktype='Truncated'):
    # Nadaraya-Watson estimator: weighted average of Y at each evaluation point
    return np.dot(weight(U, X, h, ktype), Y)
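The weights above define the Nadaraya-Watson (local constant) estimator:
$\hat{m}_{NW}\left(x\right)=\sum_{i=1}^{N}w_{i}\left(x\right)Y_{i}$, where $w_{i}\left(x\right)=\frac{k\left(\frac{X_{i}-x}{h}\right)}{\sum_{j=1}^{N}k\left(\frac{X_{j}-x}{h}\right)}$ are the normalized weights returned by weight.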
For the local linear (LL) estimator at an evaluation point $x$:
$K(x)$ - $N\times N$ diagonal matrix of kernel weights
$Z(x)$ - $N\times 2$ regressor matrix with rows $\left(1,\left(X_{i}-x\right)/h\right)$
$Y$ - $N\times 1$ vector of responses
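In this notation, the local linear coefficients are the weighted least squares solution computed point by point in the next cell:
$\left(\hat{\alpha}\left(x\right),\hat{\beta}\left(x\right)\right)^{\prime}=\left(Z\left(x\right)^{\prime}K\left(x\right)Z\left(x\right)\right)^{-1}Z\left(x\right)^{\prime}K\left(x\right)Y$
with $\hat{m}\left(x\right)=\hat{\alpha}\left(x\right)$; since the regressor is scaled as $\left(X_{i}-x\right)/h$, the derivative estimate is $\hat{m}^{\prime}\left(x\right)=\hat{\beta}\left(x\right)/h$.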
In [7]:
def LL(U, X, Y, h=.1, ktype='Truncated'):
    # X - N-array of observations
    # U - M-array of evaluation points
    # W - M*N-array of normalized kernel weights
    W = weight(U, X, h, ktype)
    alpha = np.empty(U.shape[0])
    beta = np.empty(U.shape[0])
    for i in range(U.shape[0]):
        # K - N*N diagonal weight matrix
        K = np.diag(W[i])
        # Z - N*2 regressor matrix with rows (1, (X - u)/h)
        Z1 = (X - U[i]) / h
        Z0 = np.ones(Z1.shape)
        Z = np.vstack([Z0, Z1]).T
        # Weighted least squares: (Z'KZ)^{-1} Z'KY
        A = np.dot(Z.T, np.dot(K, Z))
        B = np.dot(Z.T, np.dot(K, Y))
        coef = np.linalg.solve(A, B)
        # Intercept is the level estimate
        alpha[i] = coef[0]
        # Slope on (X - u)/h equals h * m'(u), so rescale to get the derivative
        beta[i] = coef[1] / h
    return alpha, beta
In [8]:
N = 100
U = np.linspace(X.min(), X.max(), N)
# Silverman's rule of thumb uses the sample size, not the number of grid points
n = X.shape[0]
h_silv = 1.06 * np.std(X) * n**(-1/5)
print('Silverman\'s Rule-of-Thumb = %.2f' % h_silv)
# Nadaraya-Watson estimator
Yhat_NW = NW(U, X, Y, h=h_silv, ktype='Gaussian')
# Local Linear estimator
Yhat_LL, dYhat_LL = LL(U, X, Y, h=h_silv, ktype='Gaussian')
In [9]:
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 6), sharex=True)
axes[0].plot(U, Yhat_NW, lw=2, color='red', label='NW')
axes[0].plot(U, Yhat_LL, lw=2, color='blue', label='LL')
axes[0].scatter(X, Y, s=20, lw=.5, facecolor='none', label='Realized')
axes[0].set_ylabel('Y')
axes[0].legend(loc='upper left')
axes[0].set_title('Conditional expectation')
axes[1].plot(U, dYhat_LL)
axes[1].set_title('Regression derivative')
axes[1].set_xlabel('X')
axes[1].set_ylabel('dm(x)/dx')
plt.show()
In [10]:
def error(Y, X, h, ktype):
    # Leave-one-out residuals: the prediction at X[i] excludes observation i
    N = len(Y)
    ehat = np.empty(N)
    for i in range(N):
        ehat[i] = Y[i] - NW(X[i], np.delete(X, i), np.delete(Y, i), h=h, ktype=ktype)
    return ehat
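These leave-one-out residuals enter the cross-validation criterion minimized over the bandwidth below:
$CV\left(h\right)=\frac{1}{N}\sum_{i=1}^{N}\tilde{e}_{i}\left(h\right)^{2},\quad\tilde{e}_{i}\left(h\right)=Y_{i}-\hat{m}_{-i}\left(X_{i},h\right)$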
In [11]:
h = 30
ktype = 'Gaussian'
ehat = error(Y, X, h, ktype)
# Conditional variance: kernel regression of squared residuals on X
sigma2hat = NW(U, X, ehat**2, h=h, ktype=ktype)
fxhat = ss.gaussian_kde(X)(U)
# Pointwise variance of the NW estimator uses the sample size, not the grid size
n = X.shape[0]
V2hat = roughness(ktype) * sigma2hat / fxhat / n / h
shat_NW = V2hat**.5
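The shaded band in the next figure uses the standard pointwise approximation to the variance of the NW estimator:
$\hat{V}\left(x\right)=\frac{R\left(k\right)\hat{\sigma}^{2}\left(x\right)}{\hat{f}\left(x\right)\,n\,h}$, with the band drawn as $\hat{m}\left(x\right)\pm2\sqrt{\hat{V}\left(x\right)}$.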
In [12]:
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8), sharex=True)
axes[0].scatter(X, Y, s=15, lw=.5, facecolor='none', label='Realized')
axes[0].fill_between(U, Yhat_NW - 2*shat_NW, Yhat_NW + 2*shat_NW,
                     lw=0, color='red', alpha=.2, label=r'$\pm 2s$')
axes[0].plot(U, Yhat_NW, lw=2, color='red', label='Fitted')
axes[0].set_ylabel('Y')
axes[0].legend(loc='best')
axes[0].set_title('Data')
axes[1].plot(U, sigma2hat**.5, lw=2, color='blue')
axes[1].set_xlabel('X')
axes[1].set_ylabel(r'$\sigma(X)$')
axes[1].set_title('Conditional variance')
plt.show()
In [13]:
ktype = 'Gaussian'
H = np.linspace(1, 30, 100)
CV = np.array([])
for h in H:
    ehat = error(Y, X, h, ktype)
    CV = np.append(CV, np.nanmean(ehat**2))
h_CV = H[CV.argmin()]
plt.plot(H, CV)
plt.scatter(h_CV, CV.min(), facecolor='none', lw=2, s=100)
plt.xlabel('Bandwidth, h')
plt.ylabel('Cross-validation, CV')
plt.show()
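A natural follow-up (a sketch using only objects already defined above) is to re-fit the Nadaraya-Watson estimator with the cross-validated bandwidth and compare it with the rule-of-thumb fit:

# Refit the NW estimator with the CV bandwidth and compare with the Silverman fit
Yhat_CV = NW(U, X, Y, h=h_CV, ktype=ktype)
plt.scatter(X, Y, s=15, lw=.5, facecolor='none', label='Realized')
plt.plot(U, Yhat_NW, lw=2, color='red', label='NW, Silverman bandwidth')
plt.plot(U, Yhat_CV, lw=2, color='green', label='NW, CV bandwidth')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend(loc='upper left')
plt.show()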