This performs the quasi-Poisson regression and generates the plots used to illustrate it.
In [1]:
from __future__ import division  # Py2: make / mean true division (future imports must come first)

import glob
from os import path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2.robjects as robjects
#import rpy2.robjects.packages as rpackages
from rpy2.robjects import pandas2ri
import sqlalchemy as sqla
import statsmodels
import statsmodels.api as sm
from sklearn import linear_model
In [2]:
#%matplotlib qt
# Render matplotlib figures inline in the notebook (qt backend kept above as an alternative).
%matplotlib inline
In [3]:
engine = sqla.create_engine('postgresql://postgres:postgres@localhost:5432/TaxiData',echo=False)
# Maps raw database column names (census/NHGIS codes plus derived taxi-count
# columns) to the readable aliases used throughout the analysis.
# NOTE(review): the 'me_' columns are presumably margins of error for the
# corresponding estimates -- confirm against the NHGIS codebook.
sqlmycols = {
    'MOGE001': 'total_commuters',
    'MOGE011': 'car_commuters',
    'MRUE001': 'pc_income',
    'totalpopulation': 'population',
    'MOGM001': 'me_total_commuters',
    'MOGM011': 'me_car_commuters',
    'MRUM001': 'me_pc_income',
    'time_dif_derived_approxcount': 'approxcount',
    'time_dif_derived_approxcount_percerror': 'perc_error_approxcount',
    'time_dif_derived_approxcount_error': 'error_approxcount',
    'twentythirteen_full_count': 'count',
    'fipscodes': 'fipscodes',
    'perc_white': 'white_ratio',
}
# Build the '"RAW" AS alias' SELECT list.  Uses .items() rather than the
# Python-2-only .iteritems() so this runs under both Python 2 and 3.
columnstring = ', '.join('"%s" AS %s' % (raw, alias) for raw, alias in sqlmycols.items())
In [4]:
full=pd.read_sql_query('SELECT %s FROM lotsofdata WHERE totalpopulation>=1000' % (columnstring) ,engine).set_index('fipscodes')
In [5]:
# Derived per-tract quantities: share of commuters who drive, and dropoff
# counts per resident (approximate count and exact 2013 count).
full['driver_ratio']=full['car_commuters']/full['total_commuters']
full['approxcount_pc']=full['approxcount']/full['population']
full['count_pc']=full['count']/full['population']
In [6]:
# Borough lookup table: keyed by the 3-digit county code embedded in each
# FIPS code, carrying the borough name and NYC's integer borough code.
# Built as a single chain rather than step-wise in-place mutation.
borocods = (
    pd.read_sql_table('testborocodedict', engine)
      .drop('county_name_test', axis=1)
      .set_index('code')
)
borocods
Out[6]:
Using this table, we can identify the NYC borough of each census tract from the FIPS code that is used for the index, as part of the FIPS code consists of a county code that corresponds to the borough.
The `int` column holds the integer code that NYC assigns to each borough.
In [7]:
full['borocode']=borocods.loc[full.index.to_series().str[2:5].values]['int'].values
I'm eliminating Staten Island from my data.
No Python module implements quasi-Poisson regression, but it is still convenient to compute the plain Poisson estimates, since the two models share the same mean behavior.
In [8]:
# Modeling sample: everything except Staten Island (borocode 5).
psub=full[(full['borocode']!=5)]
#psub.drop(outliers,axis=0,inplace=True)
# Poisson regression of dropoff counts on log per-capita income, with tract
# population as the exposure, so the mean model is effectively per-capita.
model=statsmodels.discrete.discrete_model.Poisson((psub['count']),\
sm.add_constant(np.log(psub['pc_income']),prepend=False),\
exposure=psub['population'].values)
# start_params seed the optimizer (slope on log income, then intercept) --
# presumably taken from an earlier converged run; TODO confirm.
fitp=model.fit(start_params=[2.2948,-21.5754])
fitp.summary()
Out[8]:
We'll be relying on R for the quasi-Poisson modeling.
In [9]:
# Handle to the embedded R session; activate automatic pandas <-> R
# data.frame conversion for subsequent rpy2 calls.
R=robjects.r
pandas2ri.activate()
In [10]:
r_model=R.glm('count~1+log(pc_income)', family=R.quasipoisson(),data=psub,offset=np.log(psub.population))
In [11]:
# Residual deviance of the quasi-Poisson fit (rx2 extracts a named element
# from the R summary object).
deviance=R.summary(r_model).rx2('deviance')[0]
'%.4g' % deviance
Out[11]:
In [12]:
# Dispersion estimate; quasi-Poisson assumes Var(y) = dispersion * mean.
dispersion=R.summary(r_model).rx2('dispersion')[0]
In [13]:
'%.4g' % dispersion
Out[13]:
In [14]:
# Deviance rescaled by the dispersion estimate.
deviance/dispersion
Out[14]:
In [15]:
SIsub=full[(full['borocode']==5)]
In [16]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
# r'\usepackage{siunitx}',
# r'\sisetup{detect-all}',
# r'\usepackage{helvet}',
# r'\usepackage{sansmath}',
# r'\sansmath'
#] })
# Scatter of per-capita dropoffs vs per-capita income, coloured by borough,
# with the Poisson fit overlaid on log-log axes.
# (Removed unused locals comd/buff/border/boro/error_kwargs, and stopped
# reusing `a` for both the figure height and the scatter artist handle.)
matplotlib.rc('legend', fontsize=10)

# Evaluate the fit curve across the observed income range.
XX = np.linspace(psub.pc_income.min(), psub.pc_income.max())
scatter_kwargs = {"zorder": 100}   # draw the data points above everything else

# Figure sized to the golden ratio.
fig_h = 6
golden = (1 + np.sqrt(5)) / 2
plt.figure(figsize=[fig_h * golden, fig_h])
plt.autoscale(tight=True)
plt.xlabel('Per-capita income')
plt.ylabel('2013 yellow cab dropoffs per-capita')
plt.title("Yellow cab dropoffs vs income of census tract")

# One colour per modeled borough (codes 1-4); reversed rainbow segment.
colorlist = plt.cm.gist_rainbow(np.linspace(0, 2.8/4, 4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int'] < 5]['int'].values:
    isub = psub[psub.borocode == i]
    plt.scatter(isub.pc_income, isub.count_pc, alpha=0.5,
                label=borocods[borocods['int'] == i]['name'].values[0],
                c=colorlist[i - 1], **scatter_kwargs)
# Staten Island: context only, not part of the regression sample.
plt.scatter(SIsub.pc_income, SIsub.count_pc, facecolors='none', edgecolors='black',
            alpha=0.1, label=borocods[borocods['int'] == 5]['name'].values[0])
# On these axes the fitted mean is count_pc = exp(const) * income**beta.
plt.plot(XX, (XX**fitp.params['pc_income']) * np.exp(fitp.params['const']),
         color='black', alpha=0.8, lw=2,
         label=r'Fit, pseudo-$\mathsf{R}^\mathsf{2}=$ ' + str(round(fitp.prsquared, 3)))
plt.yscale('log')
plt.xscale('log')
plt.legend(loc='lower right')
plt.savefig('2013count_dropoffs_vs_income_low_car_poisson.png', dpi=500)
plt.show()
In [17]:
# Fitted means pulled back from the R quasi-Poisson model.
psub['prediction']=pandas2ri.ri2py(r_model.rx2('fitted.values'))
# Anscombe residuals for a Poisson-family model:
# (3/2)*(y^(2/3) - mu^(2/3)) / mu^(1/6); approximately normal if the model fits.
psub['anscombe_resid']=(3/2)*(psub['count']**(2/3)-psub['prediction']**(2/3))/(psub['prediction']**(1/6))
In [18]:
# QQ-plot of the Anscombe residuals against a standardized normal line.
sm.qqplot(psub['anscombe_resid'],line='s')
plt.ylabel('Anscombe residual quantiles')
plt.savefig('2013quasi-poisson_QQ.png',dpi=500)
plt.show()
In [19]:
# Residuals vs income: look for structure the model leaves unexplained.
plt.scatter(psub['pc_income'],psub['anscombe_resid'])
plt.xscale('log')
plt.show()
In [20]:
# Standardized residuals computed by R for the income-only model.
psub['standardized_resid']=pandas2ri.ri2py(R.rstandard(r_model))
In [21]:
psub['standardized_resid'].hist(bins=50)
Out[21]:
In [22]:
# Standardized residuals vs income.
plt.scatter(psub['pc_income'],psub['standardized_resid'])
plt.xscale('log')
#plt.yscale('log')
In [23]:
# QQ-plot of the standardized residuals.
sm.qqplot(psub['standardized_resid'],line='s')
#sm.qqplot(psub['anscombe_resid'],line='s')
plt.show()
In [24]:
# Studentized (leave-one-out) deviance residuals.
psub['studentized_resid']=pandas2ri.ri2py(R.rstudent(r_model,type='dev'))
In [25]:
psub['studentized_resid'].hist(bins=50)
plt.show()
In [26]:
# Tracts whose studentized residual exceeds 3 in magnitude.
outliers=psub[np.abs(psub['studentized_resid'])>3]
In [27]:
outliers.count()
Out[27]:
This is probably too few outliers to be of concern for the Poisson regression.
In [28]:
# Indicator columns for borough membership.  Uses the .loc setter instead of
# the original chained indexing (df['col'][mask] = 1), which triggers
# SettingWithCopyWarning and can silently fail to write through.
psub['manhattan_brooklyn'] = 0
psub.loc[(psub['borocode'] == 1) | (psub['borocode'] == 3), 'manhattan_brooklyn'] = 1
psub['manhattan'] = 0
psub.loc[psub['borocode'] == 1, 'manhattan'] = 1
In [29]:
psub[['manhattan','manhattan_brooklyn']].hist()
Out[29]:
In [30]:
# Candidate covariates to try adding to the income-only model.
candidates=['driver_ratio','white_ratio','borocode','manhattan','manhattan_brooklyn','population']
In [31]:
# How many Manhattan tracts are in the sample.
psub[psub['manhattan']==1].shape
Out[31]:
In [32]:
# One scatter of per-capita dropoffs against each candidate covariate.
# The figure-size constants are loop-invariant, so compute them once.
fig_h = 6
golden = (1 + np.sqrt(5)) / 2
for cand in candidates:
    plt.figure(figsize=[fig_h * golden, fig_h])
    plt.autoscale(tight=True)
    plt.xlabel(cand)
    plt.ylabel('2013 yellow cab dropoffs per-capita')
    plt.scatter(psub[cand], psub['count_pc'], alpha=0.5)
    plt.yscale('log')
    plt.show()
None of them look that great on their own, let's see how they compare in the regression.
Note that the slight pattern in the population plot may indicate that the assumption that per-capita drop-offs is the right dependent variable isn't perfect.
In [33]:
# Refit the quasi-Poisson model with each candidate covariate added, then
# compare deviance and dispersion against the income-only baseline.
# Note: dispratio is new/old, so values below 1 mean the dispersion dropped.
for i in candidates:
r_tempmodel=R.glm('count~1+log(pc_income)+%s' % (i), family=R.quasipoisson(),data=psub,offset=np.log(psub.population))
devdiff=deviance - R.summary(r_tempmodel).rx2('deviance')[0]
dispratio=R.summary(r_tempmodel).rx2('dispersion')[0]/ dispersion
print('Adding %s lowers deviance by %.2g and lowers dispersion by a factor of %.2f' % (i,devdiff,dispratio))
Using the ratio of commuters who drive seems clearly superior, though the Manhattan categorical variable is pretty good too. That could be because the driver ratio is extremely low in Manhattan.
In [34]:
# Working frame for the two-covariate model.  Chain .dropna() onto the column
# selection instead of the original dropna(inplace=True) on a derived slice,
# which risks SettingWithCopy issues; the result is identical.
twosub = psub[['count_pc','count','driver_ratio','pc_income','population','borocode']].dropna()
twosub['log_pc_income'] = np.log(twosub['pc_income'])
In [35]:
# Plain Poisson fit with driver_ratio added; used for its coefficient
# estimates (the quasi-Poisson mean model is identical).
model2=statsmodels.discrete.discrete_model.Poisson((twosub['count']),\
sm.add_constant(twosub[['log_pc_income','driver_ratio']],prepend=False),\
exposure=twosub['population'].values)
# NOTE(review): with prepend=False the columns are [log_pc_income,
# driver_ratio, const], so this seed maps -21.5754 to driver_ratio and 0 to
# the constant -- possibly transposed from the one-covariate fit, but it is
# only a starting point for the optimizer.
fit2=model2.fit(start_params=[2.2948,-21.5754,0])
fit2.summary()
Out[35]:
In [36]:
# The same two-covariate model in R for quasi-Poisson inference.
# NOTE(review): this is fit on psub, not twosub -- presumably R's default
# na.action drops the same NA rows that twosub.dropna() removed; confirm.
r_model2=R.glm('count~1+log(pc_income)+driver_ratio', family=R.quasipoisson(),data=psub,offset=np.log(psub.population))
In [37]:
print(R.summary(r_model2).rx2('coefficients'))
In [38]:
# Residual deviance of the two-covariate model.
twodeviance=R.summary(r_model2).rx2('deviance')[0]
print('%.4g'% twodeviance)
In [39]:
# Dispersion estimate of the two-covariate model.
print( '%.4g' % R.summary(r_model2).rx2('dispersion')[0])
In [40]:
twosub['combined_var']=fit2.params['log_pc_income']*twosub['log_pc_income']+fit2.params['driver_ratio']*twosub['driver_ratio']+np.log(twosub['population'])
In [41]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
# r'\usepackage{siunitx}',
# r'\sisetup{detect-all}',
# r'\usepackage{helvet}',
# r'\usepackage{sansmath}',
# r'\sansmath'
#] })
# Dropoff counts against the combined linear predictor, coloured by borough,
# with the fitted exponential curve overlaid.
# (Removed unused locals comd/buff/border/boro/error_kwargs and the unused
# scatter-handle assignment to `a`.)
matplotlib.rc('legend', fontsize=10)

# Evaluate the fit across the range of the combined predictor.
XX = np.linspace(twosub.combined_var.min(), twosub.combined_var.max())
scatter_kwargs = {"zorder": 100}  # data points on top

fig_h = 6
golden = (1 + np.sqrt(5)) / 2
plt.figure(figsize=[fig_h * golden, fig_h])
plt.autoscale(tight=True)
plt.xlabel('Plane of fit')
plt.ylabel('2013 yellow cab dropoffs')
plt.title("Yellow cab dropoffs vs income of census tract")
colorlist = plt.cm.gist_rainbow(np.linspace(0, 2.8/4, 4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int'] < 5]['int'].values:
    isub = twosub[twosub.borocode == i]
    plt.scatter(isub['combined_var'], isub['count'], alpha=0.5,
                label=borocods[borocods['int'] == i]['name'].values[0],
                c=colorlist[i - 1], **scatter_kwargs)
# Along the combined predictor the fitted counts are exp(x + const).
plt.plot(XX, np.exp(XX + fit2.params['const']), color='black', alpha=0.8, lw=2,
         label=r'Fit, pseudo-$\mathsf{R}^\mathsf{2}=$ ' + str(round(fit2.prsquared, 3)))
plt.yscale('log')
plt.legend(loc='lower right')
plt.show()
In [43]:
twosub['prediction']=pandas2ri.ri2py(r_model2.rx2('fitted.values'))
In [45]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
# r'\usepackage{siunitx}',
# r'\sisetup{detect-all}',
# r'\usepackage{helvet}',
# r'\usepackage{sansmath}',
# r'\sansmath'
#] })
# Observed per-capita dropoffs by borough with the model's fitted per-capita
# values overlaid in black.
# (Removed unused locals comd/buff/border/boro and the unused XX linspace;
# error_kwargs is kept because the borough scatter actually uses it.)
matplotlib.rc('legend', fontsize=10)

scatter_kwargs = {"zorder": 100}        # predictions drawn on top
error_kwargs = {"lw": .5, "zorder": 0}  # observed points drawn underneath

fig_h = 6
golden = (1 + np.sqrt(5)) / 2
plt.figure(figsize=[fig_h * golden, fig_h])
plt.autoscale(tight=True)
plt.xlabel('Per-capita income')
plt.ylabel('2013 yellow cab dropoffs per-capita')
plt.title("Yellow cab dropoffs vs income of census tract")
colorlist = plt.cm.gist_rainbow(np.linspace(0, 2.8/4, 4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int'] < 5]['int'].values:
    isub = twosub[twosub.borocode == i]
    plt.scatter(isub['pc_income'], isub['count_pc'], alpha=0.9,
                label=borocods[borocods['int'] == i]['name'].values[0],
                c=colorlist[i - 1], **error_kwargs)
# Overlay the model's fitted per-capita dropoffs.
plt.scatter(twosub['pc_income'], twosub['prediction'] / twosub['population'],
            alpha=0.3, color='black', label='Predicted dropoffs', **scatter_kwargs)
plt.yscale('log')
plt.xscale('log')
plt.legend(loc='lower right')
plt.show()
In [46]:
# Anscombe residuals for the two-covariate model (same formula as before).
twosub['anscombe_resid']=(3/2)*(twosub['count']**(2/3)-twosub['prediction']**(2/3))/(twosub['prediction']**(1/6))
In [47]:
twosub['anscombe_resid'].hist(bins=50)
plt.show()
In [48]:
# QQ-plot of Anscombe residuals against the normal reference.
sm.qqplot(twosub['anscombe_resid'],line='s')
plt.ylabel('Anscombe residual quantiles')
plt.show()
In [49]:
# Residuals vs income.
plt.scatter(twosub['pc_income'],twosub['anscombe_resid'])
plt.xscale('log')
plt.show()
In [50]:
# Standardized deviance residuals from R.
twosub['standardized_resid']=pandas2ri.ri2py(R.rstandard(r_model2, type='dev'))
In [51]:
# Studentized (leave-one-out) deviance residuals from R.
twosub['studentized_resid']=pandas2ri.ri2py(R.rstudent(r_model2, type='dev'))
In [52]:
sm.qqplot(twosub['studentized_resid'],line='s')
plt.ylabel('Studentized deviance residual quantiles')
plt.show()
In [53]:
# Residual structure against population size.
plt.scatter(twosub['population'],twosub['standardized_resid'])
plt.xscale('log')
plt.show()
In [54]:
# Count of tracts with |studentized residual| > 3.
(np.abs(twosub['studentized_resid'])>3).sum()
Out[54]:
In [55]:
# Outlier tracts under the two-covariate model.
outliers=twosub[(np.abs(twosub['studentized_resid'])>3)]
In [56]:
psub['old_s_resid']=pandas2ri.ri2py(R.rstudent(r_model, type='dev'))
In [57]:
oldouts=outliers=psub[(np.abs(psub['old_s_resid'])>3)]
In [58]:
# Raw income/dropoff scatter with the income-only model's outliers in red.
plt.scatter(psub['pc_income'],psub['count_pc'])
plt.scatter(oldouts['pc_income'],oldouts['count_pc'],color='red')
plt.xscale('log')
plt.yscale('log')
In [60]:
#s=sqla.text('ALTER TABLE lotsofdata ADD COLUMN driver_income_anscombe_resid double precision')
#conn=engine.connect()
#conn.execute(s)
#s=sqla.text('ALTER TABLE lotsofdata ADD COLUMN driver_income_standard_dev_resid double precision')
#conn.execute(s)
In [ ]:
#metadata=sqla.MetaData()
#borotest=sqla.Table('lotsofdata',metadata,autoload=True,autoload_with=engine)
In [ ]:
#(getattr(borotest.c,'driver_income_anscombe_resid'))
In [ ]:
#(getattr(borotest.c,'driver_income_standard_dev_resid'))
In [ ]:
#bindparams={'driver_income_anscombe_resid':sqla.bindparam('anscombe_resid'),\
# 'driver_income_standard_dev_resid':sqla.bindparam('standardized_resid')}
In [ ]:
#smt=borotest.update().\
#where(borotest.c.fipscodes==sqla.bindparam('a_code')).\
#values(bindparams)
In [ ]:
#twosub['a_code']=twosub.index.to_series()
In [ ]:
#dlist=twosub[["a_code",'anscombe_resid','standardized_resid']].to_dict(orient='records')
#conn.execute(smt,dlist)
In [ ]: