This is a toy example of using empirical calibration for survey calibration.


In [0]:
#@title Copyright 2019 The Empirical Calibration Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

Imports


In [0]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
%config InlineBackend.figure_format='retina'

# install and import ec
!pip install -q git+https://github.com/google/empirical_calibration
import empirical_calibration as ec

Selection Bias

Generate a population of size 1000.


In [0]:
np.random.seed(123)
N = 1000  # population size.
df_population = pd.DataFrame({
    'x1': np.repeat(['F', 'F', 'M', 'M'], repeats=N),
    'x2': np.repeat(['H', 'L', 'H', 'L'], repeats=N),
    'y': (np.repeat([1e1, 2e1, 3e1, 4e1], repeats=N) +
          np.random.normal(size=4*N)),
    'response_rate': np.repeat([1.0e-2, 1.1e-2, 1.2e-2, 1.3e-2],
                               repeats=N)
})
true_mean = df_population['y'].mean()
print('true population mean is {}.'.format(true_mean))


true population mean is 25.0114142022.

Define a function that draws a sample and calculates the estimates of mean.


In [0]:
def simulate(seed):
  """Draws sample and calculates (un)weighted estimates."""
  np.random.seed(seed)
  responded = (np.random.uniform(size=4*N) <=
               df_population['response_rate'])
  df_sample = df_population.loc[responded]
  weights, _ = ec.from_formula(
      formula='~x1+x2',
      df=df_sample,
      target_df=df_population,
      objective=ec.Objective.QUADRATIC)
  return pd.Series({
      'n': df_sample.shape[0],
      'unweighted': df_sample['y'].mean(),
      'weighted': df_sample['y'].mul(weights).sum()})

In [0]:
# Try it out.
simulate(123)


Out[0]:
n             58.000000
unweighted    27.434867
weighted      25.135381
dtype: float64

In [0]:
# Do it 1000 times and collect results.
result = pd.DataFrame(simulate(seed) for seed in range(1000))

In [0]:
# Calculate rmse.
print('unweighted rmse is {}'.format(
    np.sqrt(np.mean((result['unweighted']-true_mean)**2))))

print('weighted rmse is {}'.format(
    np.sqrt(np.mean((result['weighted']-true_mean)**2))))


unweighted rmse is 1.99259828407
weighted rmse is 0.150640397789

In [0]:
# visually compare unweighted vs weighted estimates.
def compare_estimates(unweighted, weighted, truth,
                      figsize=(12, 8),
                      fontsize=22,
                      output_path=None):
  f = plt.figure(figsize=figsize)
  plt.rcParams.update({'font.size': fontsize})
  plt.hist(unweighted,
           alpha=0.4,
           color='orange',
           edgecolor='none',
           label='unweighted')
  plt.hist(weighted,
           alpha=0.4,
           color='blue',
           edgecolor='none',
           label='weighted')
  plt.axvline(truth, linestyle='dashed', color='red')
  plt.legend(loc='upper left')
  plt.show()
  if output_path is not None:
     f.savefig(output_path, bbox_inches='tight')

compare_estimates(unweighted=result['unweighted'],
                  weighted=result['weighted'],
                  truth=true_mean)