In [1]:
import pickle

import numpy as np
import pandas as pd
import statsmodels.api as sm
from pandas import DataFrame
from patsy import dmatrices, dmatrix

In [2]:
# Load the prepared data, then keep only rows for individual age bands
# by discarding every row whose Agegroup is 'ALL AGES'.
df = pd.read_pickle('.././data/pickle/pypf_prep.pkl')
is_single_age_band = df['Agegroup'] != 'ALL AGES'
df = df[is_single_age_band]

In [3]:
# Quick reminder of what the data look like.
preview_columns = ['Cause', 'Agegroup', 'Sex', 'Region', 'Year', 'Deaths']
df[preview_columns].head()


Out[3]:
Cause Agegroup Sex Region Year Deaths
3 IPF UNDER 25 Male NORTH EAST 1974-01-01 0
4 IPF 25-34 Male NORTH EAST 1974-01-01 0
5 All Mesothelioma 25-34 Male NORTH EAST 1974-01-01 0
6 IPF 35-44 Male NORTH EAST 1974-01-01 1
7 Asbestosis 35-44 Male NORTH EAST 1974-01-01 0

In [4]:
def model(cause, df):
    """Fit a Poisson regression of death counts for a single cause.

    The fitted formula is ``Count ~ C(Agegroup) + C(Sex) + C(Region)``,
    where ``Count`` is the ``Deaths`` column, and the ``Population``
    column is passed to the model as the exposure.

    Parameters
    ----------
    cause : value matched against the ``Cause`` column to select rows.
    df : DataFrame holding at least the columns ``Cause``, ``Deaths``,
        ``Population``, ``Agegroup``, ``Sex`` and ``Region``. It is not
        modified.

    Returns
    -------
    The results object returned by ``sm.Poisson(...).fit(maxiter=100)``.
    """
    cause_rows = df[df['Cause'] == cause]
    #cause_rows = cause_rows[cause_rows['Sex'] == 'Male'] #uncomment to limit analysis to males

    exposure_array = cause_rows['Population'].values

    # Build the modelling frame as a new object instead of assigning a
    # 'Count' column onto the filtered slice, which is chained assignment
    # and triggers pandas' SettingWithCopyWarning.
    model_frame = cause_rows[['Deaths', 'Agegroup', 'Sex', 'Region']].rename(
        columns={'Deaths': 'Count'})

    # NOTE(review): patsy drops rows containing missing values by default,
    # while exposure_array is taken before that step -- this assumes the
    # modelling columns contain no NaNs; confirm against the data.
    y, X = dmatrices('Count ~ C(Agegroup) + C(Sex) + C(Region)',
                     data=model_frame, return_type='dataframe')

    # http://statsmodels.sourceforge.net/devel/endog_exog.html
    mod = sm.Poisson(y, X, exposure=exposure_array)

    res = mod.fit(maxiter=100)

    return res

In [5]:
# Fit one model per distinct cause and keep the fitted results keyed by cause.
results = {}
for cause in df['Cause'].unique():
    results[cause] = model(cause, df)


Optimization terminated successfully.
         Current function value: 2.898808
         Iterations 21
Optimization terminated successfully.
         Current function value: 3.175772
         Iterations 22
Optimization terminated successfully.
         Current function value: 0.870333
         Iterations 21

In [6]:
# Exponentiate each model's coefficients: one column of ratios per cause,
# indexed by model term.
cols = {}
for cause, fitted in results.items():
    cols[cause] = np.exp(fitted.params)

df1 = DataFrame(cols)

# Select the region terms by their label rather than by position in the
# sorted table; a positional slice silently picks the wrong rows as soon as
# a region ratio moves past one of the age/sex terms.
region_terms = [term for term in df1.index if term.startswith('C(Region)')]
region_ratios = df1.loc[region_terms, ['IPF', 'All Mesothelioma', 'Asbestosis']]

# Arrange the regions by the IPF ratio, highest first. Ordering through
# np.argsort sidesteps the DataFrame.sort / DataFrame.sort_values API split
# between pandas versions.
ipf_descending = np.argsort(-region_ratios['IPF'].values)
result = region_ratios.iloc[ipf_descending]

# NB: EAST is the reference region, so its ratio is 1 and it has no row here.

In [7]:
# Show the regional ratios for the three causes side by side.
result.loc[:, ['IPF', 'All Mesothelioma', 'Asbestosis']]


Out[7]:
IPF All Mesothelioma Asbestosis
C(Region)[T.NORTH WEST] 1.303900 0.987568 2.276488
C(Region)[T.WALES] 1.283289 0.613562 1.086402
C(Region)[T.NORTH EAST] 1.239368 1.710220 5.699712
C(Region)[T.WEST MIDLANDS] 1.201513 0.761154 1.187707
C(Region)[T.EAST MIDLANDS] 1.162599 0.782949 1.397579
C(Region)[T.YORKSHIRE AND THE HUMBER] 1.108745 1.104215 1.615266
C(Region)[T.SOUTH WEST] 1.095977 0.865598 1.809654
C(Region)[T.LONDON] 1.009115 0.996979 2.146462
C(Region)[T.SOUTH EAST] 0.902522 0.952437 1.314366

In [8]:
# Print the full regression summary for each cause under its own heading.
# print(...) with a single argument produces the same output on Python 2
# and also parses on Python 3, unlike the bare print statement.
for cause_name in ('IPF', 'Asbestosis', 'All Mesothelioma'):
    print('\n' + cause_name + ' \n')
    print(results[cause_name].summary())


IPF 

                          Poisson Regression Results                          
==============================================================================
Dep. Variable:                  Count   No. Observations:                 6100
Model:                        Poisson   Df Residuals:                     6082
Method:                           MLE   Df Model:                           17
Date:                Fri, 22 Aug 2014   Pseudo R-squ.:                  0.7529
Time:                        17:50:45   Log-Likelihood:                -17683.
converged:                       True   LL-Null:                       -71553.
                                        LLR p-value:                     0.000
=========================================================================================================
                                            coef    std err          z      P>|z|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------------
Intercept                               -15.1589      0.086   -176.978      0.000       -15.327   -14.991
C(Agegroup)[T.35-44]                      1.1766      0.097     12.116      0.000         0.986     1.367
C(Agegroup)[T.45-54]                      2.6619      0.088     30.308      0.000         2.490     2.834
C(Agegroup)[T.55-64]                      4.1842      0.085     49.025      0.000         4.017     4.351
C(Agegroup)[T.65-74]                      5.4336      0.085     64.056      0.000         5.267     5.600
C(Agegroup)[T.75-84]                      6.3305      0.085     74.699      0.000         6.164     6.497
C(Agegroup)[T.85+]                        6.8072      0.085     80.047      0.000         6.641     6.974
C(Agegroup)[T.UNDER 25]                  -0.8688      0.120     -7.269      0.000        -1.103    -0.635
C(Sex)[T.Male]                            0.8920      0.008    111.359      0.000         0.876     0.908
C(Region)[T.EAST MIDLANDS]                0.1507      0.019      8.033      0.000         0.114     0.187
C(Region)[T.LONDON]                       0.0091      0.018      0.493      0.622        -0.027     0.045
C(Region)[T.NORTH EAST]                   0.2146      0.021     10.276      0.000         0.174     0.256
C(Region)[T.NORTH WEST]                   0.2654      0.016     16.137      0.000         0.233     0.298
C(Region)[T.SOUTH EAST]                  -0.1026      0.016     -6.237      0.000        -0.135    -0.070
C(Region)[T.SOUTH WEST]                   0.0916      0.018      5.202      0.000         0.057     0.126
C(Region)[T.WALES]                        0.2494      0.020     12.662      0.000         0.211     0.288
C(Region)[T.WEST MIDLANDS]                0.1836      0.018     10.388      0.000         0.149     0.218
C(Region)[T.YORKSHIRE AND THE HUMBER]     0.1032      0.018      5.708      0.000         0.068     0.139
=========================================================================================================

Asbestosis 

                          Poisson Regression Results                          
==============================================================================
Dep. Variable:                  Count   No. Observations:                 3648
Model:                        Poisson   Df Residuals:                     3632
Method:                           MLE   Df Model:                           15
Date:                Fri, 22 Aug 2014   Pseudo R-squ.:                  0.4447
Time:                        17:50:45   Log-Likelihood:                -3175.0
converged:                       True   LL-Null:                       -5717.5
                                        LLR p-value:                     0.000
=========================================================================================================
                                            coef    std err          z      P>|z|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------------
Intercept                               -18.7570      0.395    -47.456      0.000       -19.532   -17.982
C(Agegroup)[T.45-54]                      0.6111      0.399      1.531      0.126        -0.171     1.393
C(Agegroup)[T.55-64]                      2.4171      0.382      6.321      0.000         1.668     3.167
C(Agegroup)[T.65-74]                      3.6607      0.380      9.631      0.000         2.916     4.406
C(Agegroup)[T.75-84]                      4.4767      0.380     11.783      0.000         3.732     5.221
C(Agegroup)[T.85+]                        4.8308      0.383     12.624      0.000         4.081     5.581
C(Sex)[T.Male]                            3.1257      0.079     39.703      0.000         2.971     3.280
C(Region)[T.EAST MIDLANDS]                0.3347      0.113      2.961      0.003         0.113     0.556
C(Region)[T.LONDON]                       0.7638      0.099      7.749      0.000         0.571     0.957
C(Region)[T.NORTH EAST]                   1.7404      0.094     18.431      0.000         1.555     1.925
C(Region)[T.NORTH WEST]                   0.8226      0.094      8.725      0.000         0.638     1.007
C(Region)[T.SOUTH EAST]                   0.2734      0.096      2.838      0.005         0.085     0.462
C(Region)[T.SOUTH WEST]                   0.5931      0.100      5.961      0.000         0.398     0.788
C(Region)[T.WALES]                        0.0829      0.130      0.635      0.525        -0.173     0.339
C(Region)[T.WEST MIDLANDS]                0.1720      0.111      1.543      0.123        -0.046     0.391
C(Region)[T.YORKSHIRE AND THE HUMBER]     0.4795      0.104      4.615      0.000         0.276     0.683
=========================================================================================================

All Mesothelioma 

                          Poisson Regression Results                          
==============================================================================
Dep. Variable:                  Count   No. Observations:                 5576
Model:                        Poisson   Df Residuals:                     5558
Method:                           MLE   Df Model:                           17
Date:                Fri, 22 Aug 2014   Pseudo R-squ.:                  0.6836
Time:                        17:50:45   Log-Likelihood:                -17708.
converged:                       True   LL-Null:                       -55962.
                                        LLR p-value:                     0.000
=========================================================================================================
                                            coef    std err          z      P>|z|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------------------
Intercept                               -16.3931      0.115   -142.017      0.000       -16.619   -16.167
C(Agegroup)[T.35-44]                      2.0682      0.121     17.130      0.000         1.832     2.305
C(Agegroup)[T.45-54]                      3.7650      0.115     32.638      0.000         3.539     3.991
C(Agegroup)[T.55-64]                      5.0860      0.114     44.467      0.000         4.862     5.310
C(Agegroup)[T.65-74]                      5.7860      0.114     50.654      0.000         5.562     6.010
C(Agegroup)[T.75-84]                      6.1624      0.114     53.908      0.000         5.938     6.386
C(Agegroup)[T.85+]                        6.1026      0.116     52.803      0.000         5.876     6.329
C(Agegroup)[T.UNDER 25]                  -1.7635      0.269     -6.544      0.000        -2.292    -1.235
C(Sex)[T.Male]                            1.9372      0.013    148.771      0.000         1.912     1.963
C(Region)[T.EAST MIDLANDS]               -0.2447      0.024    -10.187      0.000        -0.292    -0.198
C(Region)[T.LONDON]                      -0.0030      0.021     -0.142      0.887        -0.045     0.039
C(Region)[T.NORTH EAST]                   0.5366      0.022     24.567      0.000         0.494     0.579
C(Region)[T.NORTH WEST]                  -0.0125      0.020     -0.627      0.531        -0.052     0.027
C(Region)[T.SOUTH EAST]                  -0.0487      0.019     -2.594      0.009        -0.086    -0.012
C(Region)[T.SOUTH WEST]                  -0.1443      0.022     -6.671      0.000        -0.187    -0.102
C(Region)[T.WALES]                       -0.4885      0.029    -17.007      0.000        -0.545    -0.432
C(Region)[T.WEST MIDLANDS]               -0.2729      0.023    -12.042      0.000        -0.317    -0.229
C(Region)[T.YORKSHIRE AND THE HUMBER]     0.0991      0.021      4.755      0.000         0.058     0.140
=========================================================================================================

In [9]:
# TODO:
# - bin years into periods
# - read around Poisson regression
# - see if Stata gives the same results with: xi:poisson death i.sex i.age_g, e(pop) irr
# - make maps

In [10]:
# Total of the Deaths column for each of these two causes.
# print(...) with a single argument produces the same output on Python 2
# and also parses on Python 3, unlike the bare print statement.
for cause_name in ('All Mesothelioma', 'Asbestosis'):
    print(df[df['Cause'] == cause_name].Deaths.sum())


43993.8756962
2784.0