Hierarchical part with JAGS


In [1]:
#you have to make sure that you have all of these installed
import cProfile
import re
import math
import numpy as np
import scipy as sp
from scipy import stats
from scipy import optimize as opt
import pandas as pd
import random as rnd
from matplotlib import pyplot as plt
import time
import numpy.random
import warnings
warnings.filterwarnings('ignore')
import multiprocessing as mp

In [2]:
import chen_utils as ch
import user_simulation_utils as sim

In [3]:
### Generation of people and their characteristics
np.random.seed(123)

# Cohort size and log-scale noise; every feature vector below is sized from
# People so the simulated cohort scales by editing this single constant.
# (People must be divisible by 10 for the public_transport pattern.)
Std = 0.02
People = 50

# beta predictors
# assume beta (rate parameter) is determined by avg. transaction size,
# avg. income, whether public transport was used, and loyalty programme membership
mean_transaction = np.random.normal(np.log(15), Std, People)
mean_transaction = np.exp(mean_transaction)  # log-normal around 15

mean_income = np.random.normal(1800, 100, People)

# alternating runs of 5 non-users / 5 users, tiled to cover all People
public_transport = np.array(([0] * 5 + [1] * 5) * (People // 10))

# p predictors
# assume p (SOW) is driven by loyalty card membership and whether a discounter is present
# membership in loyalty programme (was hard-coded 50 — now derived from People)
loyalty = np.random.binomial(1, 0.6, People)

# every second person has a discounter nearby (was hard-coded 25 repetitions)
discounter = np.array([0, 1] * (People // 2))

preference = np.array(stats.uniform.rvs(0, 1, size=People, random_state=123))

# linear predictor for the rate parameter, plus small Gaussian noise
theta_beta = (1 - 0.2 * mean_transaction + 0.0005 * mean_income
              + 0.5 * public_transport + 0.6 * loyalty
              + np.random.normal(0.01, 0.01, People))
beta = np.exp(theta_beta)

# linear predictor for the observed share, mapped into (0, 1) via expit
theta_p = (-0.2 + 0.4 * loyalty - 0.8 * discounter + preference
           + np.random.normal(0.01, 0.01, People))
p = sim.expit(theta_p)

In [4]:
# Collect the simulated features and the true generating parameters per person.
covariate_columns = {
    'mean_transaction': mean_transaction,
    'mean_income': mean_income,
    'public_transport': public_transport,
    'loyalty': loyalty,
    'discounter': discounter,
    'preference': preference,
    'true_beta': beta,
    'true_p': p,
}
covariates = pd.DataFrame(covariate_columns)
covariates


Out[4]:
discounter loyalty mean_income mean_transaction preference public_transport true_beta true_p
0 0 1 1670.591468 14.677821 0.696469 0 0.620273 0.712686
1 1 1 1696.121179 15.302208 0.286139 0 0.549331 0.424306
2 0 1 1974.371223 15.085134 0.226851 0 0.660819 0.607978
3 1 1 1720.193726 14.554851 0.551315 0 0.642111 0.482259
4 0 1 1802.968323 14.827420 0.719469 0 0.637987 0.716421
5 1 0 1906.931597 15.503704 0.423106 1 0.536961 0.361698
6 0 0 1889.070639 14.289380 0.980764 1 0.666485 0.687262
7 1 1 1975.488618 14.871877 0.684830 1 1.132924 0.523146
8 0 1 1949.564414 15.384629 0.480932 1 1.010449 0.667736
9 1 1 1906.939267 14.742219 0.392118 1 1.125090 0.449210
10 0 0 1722.729129 14.797711 0.343178 0 0.332108 0.543685
11 1 0 1879.486267 14.971614 0.729050 0 0.345298 0.436823
12 0 1 1831.427199 15.454156 0.438572 0 0.562528 0.656674
13 1 1 1667.373454 14.809549 0.059678 0 0.596579 0.369960
14 0 0 1941.729905 14.867395 0.398044 0 0.372665 0.551610
15 1 1 1880.723653 14.870259 0.737995 1 1.073585 0.534655
16 0 0 1804.549008 15.676594 0.182492 1 0.492002 0.497885
17 1 1 1776.690794 15.670593 0.175452 1 0.871791 0.398490
18 0 0 1680.169886 15.304261 0.531551 1 0.491367 0.585974
19 1 0 1819.952407 15.116304 0.531828 1 0.545952 0.388796
20 0 0 1846.843912 15.222850 0.634401 0 0.329622 0.608653
21 1 1 1716.884502 15.453953 0.849432 0 0.540477 0.561012
22 0 1 1916.220405 14.721861 0.724455 0 0.691007 0.716621
23 1 1 1690.279695 15.356929 0.611024 0 0.535138 0.509285
24 0 1 1587.689965 14.628513 0.722443 0 0.602528 0.719383
25 1 1 1903.972709 14.809890 0.322959 1 1.093144 0.434542
26 0 1 1759.663396 15.274615 0.361789 1 0.937725 0.637418
27 1 1 1787.397041 14.577461 0.228263 1 1.089220 0.410548
28 0 0 1716.248328 14.958038 0.293714 1 0.530480 0.522774
29 1 1 1639.403724 14.743689 0.630976 1 0.980407 0.508863
30 0 1 1925.523737 14.923510 0.092105 0 0.657475 0.574358
31 1 0 1731.113102 14.183487 0.433701 0 0.382735 0.363567
32 0 0 1966.095249 14.477845 0.430863 0 0.407184 0.562282
33 1 1 1880.730819 14.791499 0.493685 0 0.674884 0.472403
34 0 0 1768.524185 15.280835 0.425830 0 0.313853 0.556551
35 1 0 1691.409760 14.948000 0.312261 1 0.527289 0.333703
36 0 1 1726.753801 15.000854 0.426351 1 0.969757 0.651123
37 1 0 1678.747687 15.207894 0.893389 1 0.501172 0.476175
38 0 1 2008.711336 14.738446 0.944160 1 1.182465 0.757341
39 1 1 1816.444123 15.085330 0.501837 1 1.015877 0.479855
40 0 0 1915.020554 14.760325 0.623953 0 0.378855 0.606228
41 1 1 1673.264795 14.490551 0.115618 0 0.634405 0.383742
42 0 0 1818.103513 14.883187 0.317285 0 0.345393 0.532582
43 1 0 1917.786194 15.173133 0.414826 0 0.335778 0.359044
44 0 0 1766.498924 15.101921 0.866309 0 0.322187 0.665218
45 1 1 1903.111446 14.996451 0.250455 1 1.053887 0.415448
46 0 1 1691.543209 15.735157 0.483034 1 0.822993 0.667960
47 1 0 1663.652846 15.124387 0.985560 1 0.507055 0.498392
48 0 1 1837.940061 15.296513 0.519485 1 0.971843 0.675330
49 1 1 1762.082357 15.686698 0.612895 1 0.863641 0.506422

In [16]:
# Persist the simulated covariates (with true parameters) for downstream JAGS fitting.
covariates.to_csv('covariates.csv', sep=',', encoding='utf-8',index=False)

In [5]:
# Visual sanity check: distribution of the true rate (beta) and share (p) parameters.
for parameter_sample in (beta, p):
    plt.hist(parameter_sample)
    plt.show()



In [6]:
# Simulate 20 transactions per person from their true (p, beta) parameters.
observations = [
    sim.simulate_data(share_observed=share, rate_input=rate, observations_total=20)
    for share, rate in zip(p, beta)
]

In [7]:
# Peek at the first three simulated users.
observations[:3]


Out[7]:
[[array([  4.36805206,   3.70012025,   7.74081173,   4.11348882,
           2.93417865,   2.67425042,   4.8110449 ,   1.64625645,
           4.23982854,   3.3239371 ,   1.63387758,   0.76906907,
           1.52036594,   6.95458129,   0.83353605,   0.89894822,
           0.60399416,   1.83686863,   0.50555509,  11.5760237 ]), 0],
 [array([  5.3673338 ,   8.82421469,   1.73359213,  11.53221115,
          11.65080895,  10.28599646,   4.20299802,   1.48951677,
          10.02067792,   6.92168086,  39.99050057,  20.1515926 ,
           4.90796792,   2.16179439,  14.17140275,   5.41132932,
           3.5467406 ,   6.54431988,   3.36264   ,   2.04154673]),
  3.9612540327046806],
 [array([  4.40121998,   1.0214549 ,   1.90461193,   5.78793025,
           1.39369296,   2.33420136,   2.12500885,   1.99441669,
           3.03251756,   2.20718302,   3.02953091,  15.94255655,
           2.59299191,  10.26842046,   4.22992941,   2.33406201,
           8.9084555 ,   0.94104006,  13.85426445,   0.48376873]),
  5.5694139652427248]]

In [8]:
npinput = np.asarray(observations)

# delete all last purchase bigger than 30
# check mapping and potentially keep user_id --> check
print("2 parameter model")
# Run the Metropolis sampler for each user in parallel.
# The context manager guarantees the pool is closed and its worker
# processes are joined even if map() raises (the original leaked the pool).
t0 = time.time()
with mp.Pool(processes=None) as pool:  # processes=None -> use all available cores
    results_2par = pool.map(ch.metropolis_solution_trajectory, npinput)
t1 = time.time()
print(t1-t0)


2 parameter model
182.2447919845581

In [12]:
# Inspect the sampled trajectories of the first two users.
results_2par[:2]


Out[12]:
[array([[ 0.4829912 ,  1.52604078],
        [ 0.41026325,  1.53184497],
        [ 0.41026325,  1.53184497],
        ..., 
        [ 0.30234187,  1.55747671],
        [ 0.30234187,  1.55747671],
        [ 0.30234187,  1.55747671]]), array([[ 0.66971353,  0.32633532],
        [ 0.46073976,  0.40642289],
        [ 0.46073976,  0.40642289],
        ..., 
        [ 0.2697629 ,  1.03362594],
        [ 0.2697629 ,  1.03362594],
        [ 0.25167985,  1.23818131]])]

In [13]:
# Flatten the per-user MCMC trajectories into one (total_samples, 2) array,
# with a parallel column of 1-based user ids.
#
# The original loop sized the output with len(results_2par[0]) but advanced
# the write offset with len(results_2par[i]) — silently wrong whenever chains
# have unequal lengths. vstack/repeat handle ragged chains correctly.
chain_lengths = [len(chain) for chain in results_2par]

final_results = np.vstack(results_2par)
final_ids = np.repeat(
    np.arange(1, len(results_2par) + 1, dtype=float), chain_lengths
).reshape(-1, 1)

In [14]:
# Display the stacked (samples x 2) parameter trajectory matrix.
final_results


Out[14]:
array([[ 0.4829912 ,  1.52604078],
       [ 0.41026325,  1.53184497],
       [ 0.41026325,  1.53184497],
       ..., 
       [ 0.32944808,  1.38816398],
       [ 0.32944808,  1.38816398],
       [ 0.32944808,  1.38816398]])

In [15]:
# Assemble the flattened trajectories and user ids into one long-format frame.
trajectory_columns = {
    'user_category_ids': final_ids.ravel(),
    '2par_first': final_results[:,0].ravel(),
    '2par_second': final_results[:,1].ravel(),
}
df = pd.DataFrame(trajectory_columns)
df


Out[15]:
2par_first 2par_second user_category_ids
0 0.482991 1.526041 1.0
1 0.410263 1.531845 1.0
2 0.410263 1.531845 1.0
3 0.437632 1.359841 1.0
4 0.437632 1.359841 1.0
5 0.437632 1.359841 1.0
6 0.416020 1.391801 1.0
7 0.624459 1.240210 1.0
8 0.624459 1.240210 1.0
9 0.624459 1.240210 1.0
10 0.624459 1.240210 1.0
11 0.624459 1.240210 1.0
12 0.624459 1.240210 1.0
13 0.624459 1.240210 1.0
14 0.643476 1.140503 1.0
15 0.541417 0.914670 1.0
16 0.594454 1.061232 1.0
17 0.661763 1.140822 1.0
18 0.620130 0.899562 1.0
19 0.620130 0.899562 1.0
20 0.620130 0.899562 1.0
21 0.620130 0.899562 1.0
22 0.620130 0.899562 1.0
23 0.540646 1.052261 1.0
24 0.695248 1.147009 1.0
25 0.695248 1.147009 1.0
26 0.458088 0.966280 1.0
27 0.458088 0.966280 1.0
28 0.458088 0.966280 1.0
29 0.372139 1.141575 1.0
... ... ... ...
249970 0.324064 1.153852 50.0
249971 0.324064 1.153852 50.0
249972 0.324064 1.153852 50.0
249973 0.324064 1.153852 50.0
249974 0.293474 1.277970 50.0
249975 0.293474 1.277970 50.0
249976 0.278748 1.373184 50.0
249977 0.369274 1.309650 50.0
249978 0.369274 1.309650 50.0
249979 0.369274 1.309650 50.0
249980 0.369274 1.309650 50.0
249981 0.369274 1.309650 50.0
249982 0.387279 1.366329 50.0
249983 0.387279 1.366329 50.0
249984 0.387279 1.366329 50.0
249985 0.232915 1.226898 50.0
249986 0.232915 1.226898 50.0
249987 0.244697 1.262230 50.0
249988 0.244697 1.262230 50.0
249989 0.244697 1.262230 50.0
249990 0.271695 1.290524 50.0
249991 0.330447 1.376261 50.0
249992 0.330066 1.384156 50.0
249993 0.328166 1.480841 50.0
249994 0.328166 1.480841 50.0
249995 0.246766 1.369154 50.0
249996 0.366560 1.547855 50.0
249997 0.329448 1.388164 50.0
249998 0.329448 1.388164 50.0
249999 0.329448 1.388164 50.0

250000 rows × 3 columns


In [24]:
# Persist the full Metropolis trajectory table for downstream analysis/plotting.
df.to_csv('metropolis_trajectory.csv', sep=',', encoding='utf-8',index=False)