In [1]:
from __future__ import division, print_function
from IPython.display import display
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import os
import sys
sys.path.append('..')
In [3]:
from pummeler.data import election_data, centroids_cartesian
In [89]:
from sklearn import cross_validation as cv, linear_model, metrics
import scipy.stats
import GPy
NOTE: as of early September 2016, the release version of GPy doesn't actually support Laplace inference with binomial likelihoods, but it works on the devel branch. You can get it with e.g.
pip install 'git+https://github.com/SheffieldML/GPy.git@devel'
The embeddings:
In [5]:
SORT_DIR = '../SORT_DIR'
with np.load(os.path.join(SORT_DIR, 'embeddings.npz') as d:
display(d.keys())
locals().update(**d)
The centroids, which we load in 3d Cartesian coordinates (in units of km) so that distances are accurate.
In [6]:
centroids = np.asarray(centroids_cartesian('00').loc[region_names])
The paper reported the learned lengthscale in some weird units, so let's start it at "a little more than half the median distance between locations".
In [7]:
from sklearn.metrics.pairwise import euclidean_distances
In [8]:
D2 = euclidean_distances(centroids, squared=True)
np.sqrt(np.median(D2[np.triu_indices_from(D2, k=1)]))
Out[8]:
In [9]:
election = election_data().loc[region_names]
In [10]:
election.head()
Out[10]:
Ignore third-party votes; n is number of votes, y is number Democratic.
In [11]:
n = np.asarray(election.votes_D + election.votes_R)
y = np.asarray(election.votes_D)
In [12]:
X_lin = np.hstack([emb_lin, centroids])
X_rff = np.hstack([emb_rff, centroids])
In [13]:
n_regions = len(region_names)
n_feats = len(feature_names)
n_rff = emb_rff.shape[1]
Just do a single train/test split for right now.
In [14]:
train, test = next(iter(cv.ShuffleSplit(
n_regions, test_size=.1, random_state=10)))
Here's the model using linear embeddings (i.e. feature means), along with the Matérn kernel on centroids, with hyperparameters more or less what the paper said worked well:
In [15]:
m_lin = GPy.core.GP(
X_lin[train], y[train, np.newaxis],
kernel=
GPy.kern.Linear(
n_feats, variances=4.56,
active_dims=range(0, n_feats))
+ GPy.kern.Matern32(
3, variance=0.18, lengthscale=560,
active_dims=range(n_feats, n_feats + 3)),
likelihood=GPy.likelihoods.Binomial(),
Y_metadata={'trials': n[train, np.newaxis]},
inference_method=GPy.inference.latent_function_inference.Laplace(),
)
In [16]:
m_lin
Out[16]:
In [17]:
def eval_preds(phat):
plt.plot([0, 1], [0, 1], color='r')
plt.scatter(y[test] / n[test], phat.squeeze())
plt.xlim(0, 1)
plt.ylim(0, 1)
return metrics.explained_variance_score(y[test] / n[test], phat)
In [23]:
f, fstd = m_lin._raw_predict(X_lin[test])
phat = scipy.stats.norm.cdf(f.squeeze())
eval_preds(phat)
Out[23]:
Predictions are pretty good. We can try optimizing it, see if that helps at all:
In [24]:
m_lin.optimize()
In [25]:
m_lin
Out[25]:
In [26]:
f, fstd = m_lin._raw_predict(X_lin[test])
phat = scipy.stats.norm.cdf(f.squeeze())
eval_preds(phat)
Out[26]:
...the same.
Of course, linear-kernel GPs are just ridge regression. If we drop the spatial kernel, we can investigate the importance of different features:
In [43]:
ridge = linear_model.RidgeCV()
ridge.fit(X_lin[train], y[train] / n[train])
ridge_phat = ridge.predict(X_lin[test])
eval_preds(ridge_phat)
Out[43]:
In [71]:
w = np.abs(ridge.coef_).argsort()[:-20:-1]
for f, wt in zip(feature_names[w], ridge.coef_[w]):
print('{:>15} {: .4}'.format(f, wt))
RELP_0 shouldn't happen according to the documentation. Dunno what that means.MAR_1 means "married."HISP_1 means "not Hispanic."MAR_5 means "never married."MSP_6.0 also means "never married."POBP_19 means "born in Iowa."POWPUMA_6500.0 means you work in either a specific place in California or a specific place in Texas....MSP_1.0 means "now married, spouse present."ANC1P_32 means German ancestry, listed first.POWSP_2.0 means "works in Alaska".ANC1P_82 means Norwegian ancestry, listed first.ANC1P_21 is Dutch.Note that several of those refer to specific places (POWPUMA refers to working in a specific area; POWSP_6.0 refers to a person who works in California; etc). We might try to avoid that:
Note that we could have done this in the embedding step by telling pummel featurize to ignore these features:
--skip-feats POBP RELP POWPUMA POWSP MIGSP MIGPUMA
In [42]:
no_us_places = np.array([
not (s.startswith('POBP_') or s.startswith('RELP_')
or s.startswith('POWPUMA_') or s.startswith('POWSP')
or s.startswith('MIGSP_') or s.startswith('MIGPUMA_'))
for s in feature_names
])
ridge2 = linear_model.RidgeCV()
ridge2.fit(X_lin[np.ix_(train, no_us_places)], y[train] / n[train])
ridge2_phat = ridge2.predict(X_lin[np.ix_(test, no_us_places)])
eval_preds(ridge2_phat)
Out[42]:
Hurts a little, but not much.
In [68]:
w = np.abs(ridge2.coef_).argsort()[:-20:-1]
for f, wt in zip(feature_names[w], ridge2.coef_[w]):
print('{:>15} {: .4}'.format(f, wt))
GCL_2.0 indicates that the person is not a grandparent living with their grandchildren.WKHP is the number of hours worked per week over the last year.GCL_nan that either the person being referenced is under 30 or the household is an institution of some kind.SCHG_4.0 indicates that the person is currently in middle school. (Remember that people under 18 are not in this dataset, so this is a rare feature.)LANP_966.0 means that an American Indian language (other than Apache, "Navaho", Dakota, Keres, Cherokee, or Zuni) is spoken at home.MAR_5 means "never married".ANC1P_5 refers to Basque ancestry.MAR_1 means "married".SCHG_7.0 is people currently in graduate or professional school.ANC1P_904 is "Negro" ancestry. (Note that "Afro American", "Afro", "African American", "Black" are also options.)LANP_864.0 means "Navaho" is spoken at home.ANC1P_12 is "British Isles" ancestry ("British" is also an option).
In [84]:
m_rff = GPy.core.GP(
X_rff[train], y[train, np.newaxis],
kernel=
GPy.kern.Linear(
n_rff, variances=4.56,
active_dims=range(0, n_rff))
+ GPy.kern.Matern32(
3, variance=0.18, lengthscale=560,
active_dims=range(n_rff, n_rff + 3)),
likelihood=GPy.likelihoods.Binomial(),
Y_metadata={'trials': n[train, np.newaxis]},
inference_method=GPy.inference.latent_function_inference.Laplace(),
)
In [85]:
m_rff.optimize()
In [91]:
m_rff
Out[91]:
In [94]:
f, fstd = m_rff._raw_predict(X_rff[test])
phat = scipy.stats.norm.cdf(f.squeeze())
eval_preds(phat)
Out[94]:
About the same as, maybe slightly worse than, the straight linear embeddings...