Demo of the ChoiceModels library

For Binary Logit and Multinomial Logit, including sampling of alternatives.

To install ChoiceModels, git clone the repository (https://github.com/ual/choicemodels).


In [1]:
# NOTE(review): changing the working directory assumes this notebook is run
# from a sibling directory of the cloned repo; consider adding the repo to
# sys.path instead of chdir, which affects all later relative paths.
import os; os.chdir('../choicemodels')
import choicemodels

In [2]:
import numpy as np
import pandas as pd

import statsmodels.api as sm  # for binary logit
from collections import OrderedDict  # for MNL model specifications

Binary Logit


In [3]:
# Import or generate estimation data

# Seed the global RNG so the demo is reproducible run-to-run
# (the original had no seed, so every run produced different numbers).
np.random.seed(0)

Y = np.random.randint(2, size=50)  # 50x1 vector of random 0's and 1's
X = np.random.rand(50, 3)  # 50x3 matrix of random floats

data = pd.DataFrame(data=X, columns=['x1','x2','x3'])
data['y'] = Y

print(data.describe())


              x1         x2         x3          y
count  50.000000  50.000000  50.000000  50.000000
mean    0.529404   0.453265   0.475061   0.540000
std     0.282240   0.282130   0.288494   0.503457
min     0.009996   0.005031   0.043904   0.000000
25%     0.282585   0.218973   0.190960   0.000000
50%     0.533950   0.419010   0.480034   1.000000
75%     0.749493   0.659642   0.711307   1.000000
max     0.983582   0.993942   0.980646   1.000000

In [4]:
# Fit a binary logit model from a patsy-style formula

specification = 'y ~ x1 + x2 + x3'

logit_model = sm.Logit.from_formula(specification, data)
fitted = logit_model.fit()

print(fitted.summary())


Optimization terminated successfully.
         Current function value: 0.643666
         Iterations 5
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                   50
Model:                          Logit   Df Residuals:                       46
Method:                           MLE   Df Model:                            3
Date:                Fri, 17 Mar 2017   Pseudo R-squ.:                 0.06707
Time:                        15:45:18   Log-Likelihood:                -32.183
converged:                       True   LL-Null:                       -34.497
                                        LLR p-value:                    0.2012
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -0.4935      0.933     -0.529      0.597        -2.322     1.335
x1             2.0809      1.114      1.868      0.062        -0.102     4.264
x2            -0.9452      1.082     -0.874      0.382        -3.066     1.175
x3            -0.0039      1.054     -0.004      0.997        -2.070     2.062
==============================================================================

In [ ]:


In [ ]:

Multinomial Logit


In [5]:
# Import or generate estimation data

def rand(len, min, max):
    """
    Generate `len` random floats uniformly distributed from `min` to `max`.

    NOTE: the parameter names shadow the builtins `len`/`min`/`max`; they are
    kept as-is for backward compatibility with existing callers.
    """
    return (max - min) * np.random.rand(len) + min

numalts = 50  # number of alternatives (J)

# Integer division so this also works under Python 3, where `numalts/2`
# would be a float and np.random.rand() would raise a TypeError.
# Half the alternatives get low attribute values, half get high ones.
X = np.concatenate((rand(numalts // 2, 0, 10), rand(numalts // 2, 100, 110)))

numobs = 1000  # number of agents/observations (N)

# One taste coefficient per observation. The original hard-coded
# np.zeros(1000), which silently breaks if numobs is changed.
beta = np.zeros(numobs) + 1.5

# N x J utility matrix: deterministic utility beta*x plus iid Gumbel noise
U = [[beta[n]*x + np.random.gumbel() for x in X] for n in range(numobs)]

# Each agent picks the alternative with the highest realized utility
choices = [np.argmax(a) for a in U]

This is what the data looks like:

  • N = 1000 agents/observations with true taste coefficients in array "beta"
  • J = 50 alternatives, each with a single attribute, in array "X"
  • N choice outcomes in array "choices"

In [6]:
# Build the long-format estimation table: one row per (observation, alternative),
# with a 0/1 indicator marking the chosen alternative.

rows = []
for n in range(numobs):
    for i in range(numalts):
        rows.append([n, i, int(choices[n] == i), X[i]])

df = pd.DataFrame(rows, columns=['obs_id', 'alt_id', 'chosen', 'x'])

In [7]:
# Sanity check: 1000 obs x 50 alts = 50,000 rows, and `chosen`
# should average 1/50 = 0.02.
print(df.describe())


             obs_id        alt_id        chosen             x
count  50000.000000  50000.000000  50000.000000  50000.000000
mean     499.500000     24.500000      0.020000     54.660992
std      288.677877     14.431014      0.140001     50.201840
min        0.000000      0.000000      0.000000      0.345308
25%      249.750000     12.000000      0.000000      4.806394
50%      499.500000     24.500000      0.000000     54.955417
75%      749.250000     37.000000      0.000000    104.898746
max      999.000000     49.000000      1.000000    109.878125

In [8]:
# Model specification: one generic ('all_same') coefficient on attribute x,
# reported in the results table as 'beta_x'.

spec = OrderedDict()
spec['x'] = 'all_same'

labels = OrderedDict()
labels['x'] = 'beta_x'

In [9]:
def estimate_model(init_val):
    """
    Build an MNL model from the current module-level `df`, `spec`, and
    `labels`, fit it by maximum likelihood starting from `init_val`, and
    return the fitted results object.
    """
    model = choicemodels.MNLogit(data=df,
                                 alt_id_col='alt_id',
                                 obs_id_col='obs_id',
                                 choice_col='chosen',
                                 specification=spec,
                                 names=labels)
    fitted = model.fit_mle(init_vals=np.array([init_val]))
    return fitted

In [10]:
%%time
# Estimate on the full choice set (all 50 alternatives per observation)
results = estimate_model(init_val = 1.2)
print(results.summary())


Log-likelihood at zero: -3,912.0230
Initial Log-likelihood: -1,616.6744
Estimation Time: 0.04 seconds.
Final log-likelihood: -1,600.1060
                     Multinomial Logit Model Regression Results                    
===================================================================================
Dep. Variable:                      chosen   No. Observations:                1,000
Model:             Multinomial Logit Model   Df Residuals:                      999
Method:                                MLE   Df Model:                            1
Date:                     Fri, 17 Mar 2017   Pseudo R-squ.:                   0.591
Time:                             15:45:40   Pseudo R-bar-squ.:               0.591
converged:                            True   Log-Likelihood:             -1,600.106
                                             LL-Null:                    -3,912.023
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
beta_x         1.4231      0.040     35.555      0.000         1.345     1.502
==============================================================================
CPU times: user 10.2 s, sys: 19.4 s, total: 29.6 s
Wall time: 12.2 s
/Users/smmaurer/anaconda/lib/python2.7/site-packages/scipy/optimize/_minimize.py:385: RuntimeWarning: Method BFGS does not use Hessian information (hess).
  RuntimeWarning)

In [ ]:


In [ ]:

MNL with random sampling of alternatives

This example reuses the data generated in the first cell under "Multinomial Logit" above.


In [11]:
def alts(obs_id, C, K):
    """
    Generate a restricted choice set D for a particular observation.

    Parameters
    ----------
    obs_id : int
        Index of the observation; its chosen alternative is looked up in
        the module-level list `choices`.
    C : iterable of int
        Alternatives to sample from (the full choice set or a stratum).
    K : int
        Total number of alternatives to return, including the chosen one.

    Returns
    -------
    list of int
        Sorted list of K alt_id's that always includes the chosen one.
    """
    chosen = choices[obs_id]  # id of chosen alternative
    unchosen = [i for i in C if i != chosen]  # id's of unchosen alts
    # Sample K-1 distinct unchosen alternatives (np.random.choice raises
    # ValueError if fewer than K-1 are available)
    sample_unchosen = np.random.choice(unchosen, size=K-1, replace=False).tolist()
    # Return a plain sorted list; the original returned an np.sort() ndarray
    # although its docstring promised a list (callers only iterate, so both
    # behave the same downstream).
    return sorted([chosen] + sample_unchosen)

In [12]:
# Build the sampled estimation dataset; it reuses the same model spec as the
# full-choice-set estimation above.

C = range(numalts)  # choice set to sample from
K = 10  # alternatives kept per observation (the chosen one plus 9 sampled)

d = []
for n in range(numobs):
    for i in alts(n, C, K):
        d.append([n, i, int(choices[n] == i), X[i]])

df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])

In [13]:
# Sanity check: 1000 obs x 10 sampled alts = 10,000 rows.
# Use print() as a function: the original used the Python 2 print statement,
# which is inconsistent with the earlier cells and breaks under Python 3.
print(df.describe())


             obs_id        alt_id        chosen             x
count  10000.000000  10000.000000  10000.000000  10000.000000
mean     499.500000     25.412500      0.100000     58.908168
std      288.689425     14.177512      0.300015     50.403752
min        0.000000      0.000000      0.000000      0.345308
25%      249.750000     13.000000      0.000000      4.937205
50%      499.500000     27.000000      0.000000    100.318050
75%      749.250000     36.000000      0.000000    106.243667
max      999.000000     49.000000      1.000000    109.878125

In [14]:
%%time
results = estimate_model(init_val = 1.2)
print results.summary()


Log-likelihood at zero: -2,302.5851
Initial Log-likelihood: -549.2434
Estimation Time: 0.01 seconds.
Final log-likelihood: -545.0510
                     Multinomial Logit Model Regression Results                    
===================================================================================
Dep. Variable:                      chosen   No. Observations:                1,000
Model:             Multinomial Logit Model   Df Residuals:                      999
Method:                                MLE   Df Model:                            1
Date:                     Fri, 17 Mar 2017   Pseudo R-squ.:                   0.763
Time:                             15:45:41   Pseudo R-bar-squ.:               0.763
converged:                            True   Log-Likelihood:               -545.051
                                             LL-Null:                    -2,302.585
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
beta_x         1.3758      0.064     21.570      0.000         1.251     1.501
==============================================================================
CPU times: user 1.41 s, sys: 553 ms, total: 1.96 s
Wall time: 680 ms

In [ ]:


In [ ]:


In [ ]:


In [ ]: