In [1]:
from __future__ import division, print_function
from IPython.display import display
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import os
import sys
sys.path.append('..')
In [3]:
from pummeler.data import election_data, centroids_cartesian
In [89]:
from sklearn import cross_validation as cv, linear_model, metrics
import scipy.stats
import GPy
NOTE: as of early September 2016, the release version of GPy doesn't actually support Laplace inference with binomial likelihoods, but it works on the devel
branch. You can get it with e.g.
pip install 'git+https://github.com/SheffieldML/GPy.git@devel'
The embeddings:
In [5]:
SORT_DIR = '../SORT_DIR'
with np.load(os.path.join(SORT_DIR, 'embeddings.npz') as d:
display(d.keys())
locals().update(**d)
The centroids, which we load in 3d Cartesian coordinates (in units of km) so that distances are accurate.
In [6]:
centroids = np.asarray(centroids_cartesian('00').loc[region_names])
The paper reported the learned lengthscale in some weird units, so let's start it at "a little more than half the median distance between locations".
In [7]:
from sklearn.metrics.pairwise import euclidean_distances
In [8]:
D2 = euclidean_distances(centroids, squared=True)
np.sqrt(np.median(D2[np.triu_indices_from(D2, k=1)]))
Out[8]:
In [9]:
election = election_data().loc[region_names]
In [10]:
election.head()
Out[10]:
Ignore third-party votes; n
is number of votes, y
is number Democratic.
In [11]:
n = np.asarray(election.votes_D + election.votes_R)
y = np.asarray(election.votes_D)
In [12]:
X_lin = np.hstack([emb_lin, centroids])
X_rff = np.hstack([emb_rff, centroids])
In [13]:
n_regions = len(region_names)
n_feats = len(feature_names)
n_rff = emb_rff.shape[1]
Just do a single train/test split for right now.
In [14]:
train, test = next(iter(cv.ShuffleSplit(
n_regions, test_size=.1, random_state=10)))
Here's the model using linear embeddings (i.e. feature means), along with the Matérn kernel on centroids, with hyperparameters more or less what the paper said worked well:
In [15]:
m_lin = GPy.core.GP(
X_lin[train], y[train, np.newaxis],
kernel=
GPy.kern.Linear(
n_feats, variances=4.56,
active_dims=range(0, n_feats))
+ GPy.kern.Matern32(
3, variance=0.18, lengthscale=560,
active_dims=range(n_feats, n_feats + 3)),
likelihood=GPy.likelihoods.Binomial(),
Y_metadata={'trials': n[train, np.newaxis]},
inference_method=GPy.inference.latent_function_inference.Laplace(),
)
In [16]:
m_lin
Out[16]:
In [17]:
def eval_preds(phat):
plt.plot([0, 1], [0, 1], color='r')
plt.scatter(y[test] / n[test], phat.squeeze())
plt.xlim(0, 1)
plt.ylim(0, 1)
return metrics.explained_variance_score(y[test] / n[test], phat)
In [23]:
f, fstd = m_lin._raw_predict(X_lin[test])
phat = scipy.stats.norm.cdf(f.squeeze())
eval_preds(phat)
Out[23]:
Predictions are pretty good. We can try optimizing it, see if that helps at all:
In [24]:
m_lin.optimize()
In [25]:
m_lin
Out[25]:
In [26]:
f, fstd = m_lin._raw_predict(X_lin[test])
phat = scipy.stats.norm.cdf(f.squeeze())
eval_preds(phat)
Out[26]:
...the same.
Of course, linear-kernel GPs are just ridge regression. If we drop the spatial kernel, we can investigate the importance of different features:
In [43]:
ridge = linear_model.RidgeCV()
ridge.fit(X_lin[train], y[train] / n[train])
ridge_phat = ridge.predict(X_lin[test])
eval_preds(ridge_phat)
Out[43]:
In [71]:
w = np.abs(ridge.coef_).argsort()[:-20:-1]
for f, wt in zip(feature_names[w], ridge.coef_[w]):
print('{:>15} {: .4}'.format(f, wt))
RELP_0
shouldn't happen according to the documentation. Dunno what that means.MAR_1
means "married."HISP_1
means "not Hispanic."MAR_5
means "never married."MSP_6.0
also means "never married."POBP_19
means "born in Iowa."POWPUMA_6500.0
means you work in either a specific place in California or a specific place in Texas....MSP_1.0
means "now married, spouse present."ANC1P_32
means German ancestry, listed first.POWSP_2.0
means "works in Alaska".ANC1P_82
means Norwegian ancestry, listed first.ANC1P_21
is Dutch.Note that several of those refer to specific places (POWPUMA
refers to working in a specific area; POWSP_6.0
refers to a person who works in California; etc). We might try to avoid that:
Note that we could have done this in the embedding step by telling pummel featurize
to ignore these features:
--skip-feats POBP RELP POWPUMA POWSP MIGSP MIGPUMA
In [42]:
no_us_places = np.array([
not (s.startswith('POBP_') or s.startswith('RELP_')
or s.startswith('POWPUMA_') or s.startswith('POWSP')
or s.startswith('MIGSP_') or s.startswith('MIGPUMA_'))
for s in feature_names
])
ridge2 = linear_model.RidgeCV()
ridge2.fit(X_lin[np.ix_(train, no_us_places)], y[train] / n[train])
ridge2_phat = ridge2.predict(X_lin[np.ix_(test, no_us_places)])
eval_preds(ridge2_phat)
Out[42]:
Hurts a little, but not much.
In [68]:
w = np.abs(ridge2.coef_).argsort()[:-20:-1]
for f, wt in zip(feature_names[w], ridge2.coef_[w]):
print('{:>15} {: .4}'.format(f, wt))
GCL_2.0
indicates that the person is not a grandparent living with their grandchildren.WKHP
is the number of hours worked per week over the last year.GCL_nan
that either the person being referenced is under 30 or the household is an institution of some kind.SCHG_4.0
indicates that the person is currently in middle school. (Remember that people under 18 are not in this dataset, so this is a rare feature.)LANP_966.0
means that an American Indian language (other than Apache, "Navaho", Dakota, Keres, Cherokee, or Zuni) is spoken at home.MAR_5
means "never married".ANC1P_5
refers to Basque ancestry.MAR_1
means "married".SCHG_7.0
is people currently in graduate or professional school.ANC1P_904
is "Negro" ancestry. (Note that "Afro American", "Afro", "African American", "Black" are also options.)LANP_864.0
means "Navaho" is spoken at home.ANC1P_12
is "British Isles" ancestry ("British" is also an option).
In [84]:
m_rff = GPy.core.GP(
X_rff[train], y[train, np.newaxis],
kernel=
GPy.kern.Linear(
n_rff, variances=4.56,
active_dims=range(0, n_rff))
+ GPy.kern.Matern32(
3, variance=0.18, lengthscale=560,
active_dims=range(n_rff, n_rff + 3)),
likelihood=GPy.likelihoods.Binomial(),
Y_metadata={'trials': n[train, np.newaxis]},
inference_method=GPy.inference.latent_function_inference.Laplace(),
)
In [85]:
m_rff.optimize()
In [91]:
m_rff
Out[91]:
In [94]:
f, fstd = m_rff._raw_predict(X_rff[test])
phat = scipy.stats.norm.cdf(f.squeeze())
eval_preds(phat)
Out[94]:
About the same as, maybe slightly worse than, the straight linear embeddings...