In [1]:
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

$ AnnualIncome = 20000 + 1000 * Age $

education_level -> state -> county


In [2]:
set_parameters = (
    (('no degree', 'CA', 'Riverside County'), (5000, 1250)),
    (('no degree', 'IL', 'Cook County'), (6500, 1150)),
    (('no degree', 'IL', 'Lake County'), (7000, 1350)),
    
    (('degree', 'CA', 'Riverside County'), (6000, 1250)),
    (('degree', 'IL', 'Cook County'), (7500, 1150)),
    (('degree', 'IL', 'Lake County'), (8000, 1350))
)

In [5]:
data_points = np.random.randint(1,24, size=len(set_parameters))

In [6]:
rows = []
for set_parameter, N in zip(set_parameters, data_points):
  key, parameters = set_parameter
  population_rows = [{
      'degree': key[0],
      'state': key[1],
      'county': key[2],
      'month_index': i,
      'salary': i*parameters[0] + parameters[1] + np.random.normal(loc=0, scale=500)
  } for i in range(N)]
  rows += population_rows

salary_df = pd.DataFrame(rows)

In [7]:
plt.scatter(salary_df.month_index, salary_df.salary, s=2);



In [8]:
# Index each of the unique variable values
degree_index = salary_df.groupby('degree').all().reset_index().reset_index()[['index', 'degree']]
degree_state_index = salary_df.groupby(['degree', 'state']).all().reset_index().reset_index()[['index', 'degree', 'state']]
degree_state_county_index = salary_df.groupby(['degree', 'state', 'county']).all().reset_index().reset_index()[['index', 'degree', 'state', 'county']]

degree_state_indexes_df = pd.merge(degree_index, degree_state_index, how='inner', on='degree', suffixes=('_d', '_ds'))
degree_state_county_indexes_df = pd.merge(degree_state_indexes_df, degree_state_county_index, how='inner', on=['degree', 'state'])
indexed_salary_df = pd.merge(salary_df, degree_state_county_indexes_df, how='inner', on=['degree', 'state', 'county']).reset_index()

In [ ]:
import pymc3 as pm

degree_indexes = degree_index['index'].values
degree_count = len(degree_indexes)
degree_state_indexes = degree_state_indexes_df['index_d'].values
degree_state_count = len(degree_state_indexes)
degree_state_county_indexes = degree_state_county_indexes_df['index_ds'].values
degree_state_county_count = len(degree_state_county_indexes)

with pm.Model() as model:
    global_m = pm.Normal('global_m', mu=6500, sd=100**2)
    global_m_sd = pm.Uniform('global_m_sd', lower=0, upper=1000)
    global_b = pm.Normal('global_b', mu=1000, sd=100**2)
    global_b_sd = pm.Uniform('global_b_sd', lower=0, upper=1000)
    
    degree_m = pm.Normal('degree_m', mu=global_m, sd=global_m_sd, shape=degree_count)
    degree_m_sd = pm.Uniform('degree_m_sd', lower=0, upper=1000, shape=degree_count)
    degree_b = pm.Normal('degree_b', mu=global_b, sd=global_b_sd, shape=degree_count)
    degree_b_sd = pm.Uniform('degree_b_sd', lower=0, upper=1000, shape=degree_count)
    
    degree_state_m = pm.Normal('degree_state_m', mu=degree_m[degree_state_indexes], sd=degree_m_sd[degree_state_indexes], shape=degree_state_count)
    degree_state_m_sd = pm.Uniform('degree_state_m_sd', lower=0, upper=1000, shape=degree_state_count)
    degree_state_b = pm.Normal('degree_state_b', mu=degree_b[degree_state_indexes], sd=degree_b_sd[degree_state_indexes], shape=degree_state_count)
    degree_state_b_sd = pm.Uniform('degree_state_b_sd', lower=0, upper=1000, shape=degree_state_count)
    
    degree_state_county_m = pm.Normal('degree_state_county_m', mu=degree_state_m[degree_state_county_indexes], sd=degree_state_m_sd[degree_state_county_indexes], shape=degree_state_county_count)
    degree_state_county_b = pm.Normal('degree_state_county_b', mu=degree_state_b[degree_state_county_indexes], sd=degree_state_b_sd[degree_state_county_indexes], shape=degree_state_county_count)
    
    error = pm.Uniform('error', lower=0, upper=10000)
    
    y_prediction = degree_state_county_m[indexed_salary_df['index'].values] * indexed_salary_df.month_index.values + degree_state_county_b[indexed_salary_df['index'].values]
    
    pm.StudentT('y_like', nu=1, mu=0, sd=error, observed=y_prediction - indexed_salary_df.salary.values)
    
    model_trace = pm.sample(5000)


INFO (theano.gof.compilelock): Refreshing lock /Users/justin/.theano/compiledir_Darwin-16.5.0-x86_64-i386-64bit-i386-3.4.4-64/lock_dir/lock

In [ ]:
pm.traceplot(model_trace[1000:]);

In [184]:
%matplotlib inline

pm.traceplot(model_trace[1000:]);



In [183]:
pm.summary(model_trace)


global_m:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  6637.890         657.386          22.021           [5447.803, 7831.476]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  5412.829       6270.124       6659.213       7036.623       7809.716


global_b:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  1100.806         516.587          12.610           [95.963, 2146.075]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  81.232         791.504        1093.225       1416.669       2137.425


degree_m:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  6827.809         617.538          22.737           [5791.423, 7947.460]
  6457.639         597.799          19.491           [5303.314, 7453.457]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  5734.830       6458.979       6840.172       7208.993       7917.561
  5285.357       6111.874       6504.478       6845.772       7448.185


degree_b:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  1082.626         429.765          11.789           [283.239, 1985.533]
  1107.094         415.101          11.161           [294.382, 1924.461]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  213.379        819.810        1079.290       1346.213       1937.559
  302.904        842.953        1104.791       1374.076       1938.070


degree_state_m:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  6526.097         652.551          21.967           [5455.314, 7787.917]
  7471.306         504.897          16.165           [6527.842, 8248.564]
  5793.032         744.165          27.064           [4652.087, 7179.841]
  6772.999         413.587          13.086           [5915.829, 7419.182]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  5430.144       6111.652       6498.543       6918.001       7774.572
  6444.091       7223.276       7551.172       7790.272       8197.445
  4591.368       5164.688       5774.055       6355.814       7135.475
  5848.539       6617.905       6848.839       6995.881       7385.547


degree_state_b:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  1145.504         526.469          14.347           [120.080, 2187.238]
  1013.836         322.004          6.395            [336.437, 1693.378]
  1210.731         428.389          10.469           [345.597, 2027.721]
  1027.478         378.172          9.195            [346.501, 1817.127]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  93.077         815.564        1137.780       1489.252       2178.011
  369.136        821.958        1003.566       1190.960       1740.869
  346.713        950.443        1224.215       1474.460       2031.712
  331.719        774.359        995.180        1281.946       1806.937


degree_state_county_m:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  6234.364         453.169          14.339           [5459.361, 7047.597]
  7557.714         222.636          7.703            [7414.427, 7725.296]
  8022.839         130.426          3.649            [7992.165, 8057.496]
  4966.937         36.182           0.925            [4918.424, 5022.354]
  6771.225         232.253          8.202            [6453.235, 7048.797]
  7007.220         113.902          3.577            [6911.272, 7088.046]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  5465.668       5993.661       6223.539       6492.834       7061.663
  7411.147       7517.098       7563.571       7611.036       7724.472
  7991.736       8015.398       8027.133       8038.390       8057.391
  4923.493       4949.271       4962.762       4979.636       5029.966
  6423.383       6694.867       6795.934       6878.324       7031.081
  6909.615       6983.685       7016.376       7042.065       7087.484


degree_state_county_b:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  1235.416         535.705          14.629           [136.368, 2118.808]
  910.562          257.572          3.881            [390.066, 1426.452]
  1036.061         205.892          3.510            [627.678, 1441.128]
  1329.579         274.648          5.852            [815.470, 1781.944]
  929.393          409.203          9.366            [209.963, 1815.893]
  1032.215         324.244          6.109            [497.002, 1730.257]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  70.929         873.310        1296.305       1642.962       2091.181
  390.256        754.031        904.844        1067.328       1428.805
  646.079        899.123        1024.599       1164.778       1466.136
  739.661        1188.630       1367.320       1506.936       1742.047
  254.376        653.091        873.905        1153.664       1898.994
  540.248        796.271        976.991        1218.794       1780.470


global_m_sd:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  489.045          282.533          9.279            [11.024, 939.175]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  28.532         241.561        493.566        731.669        966.790


global_b_sd:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  435.048          277.041          7.768            [5.690, 921.081]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  26.051         200.306        393.806        658.787        964.451


degree_m_sd:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  630.175          252.251          6.534            [142.125, 999.933]
  616.445          267.450          8.946            [94.702, 999.334]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  77.399         459.821        672.030        841.908        984.015
  52.489         429.736        665.495        842.223        982.027


degree_b_sd:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  452.529          273.037          6.856            [21.048, 943.170]
  427.219          272.795          6.522            [1.296, 912.937]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  31.233         217.734        431.742        669.170        962.117
  19.657         194.106        395.747        635.748        952.809


degree_state_m_sd:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  524.419          285.014          6.559            [60.585, 998.826]
  577.584          241.049          5.441            [201.517, 999.589]
  649.150          275.612          8.127            [97.117, 999.999]
  419.477          265.002          5.892            [1.847, 907.213]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  30.230         284.172        535.845        765.331        984.074
  168.264        371.331        571.927        783.571        977.119
  47.855         466.271        716.952        883.574        988.839
  34.130         193.899        373.979        619.493        951.998


degree_state_b_sd:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  475.224          283.238          6.067            [6.233, 936.747]
  384.100          272.692          7.152            [1.912, 899.766]
  466.646          282.723          5.907            [3.686, 935.485]
  410.651          273.488          6.426            [1.970, 898.377]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  24.089         230.854        462.549        713.230        969.996
  17.332         150.857        327.962        588.025        949.299
  20.723         221.480        447.135        703.825        964.570
  19.760         172.872        375.199        621.209        946.915


error:

  Mean             SD               MC Error         95% HPD interval
  -------------------------------------------------------------------
  
  286.719          273.701          9.339            [182.963, 377.074]

  Posterior quantiles:
  2.5            25             50             75             97.5
  |--------------|==============|==============|--------------|
  
  191.006        242.445        273.820        308.318        389.187

$$ Y = m X + b \\$$

In [ ]: