This performs the quasi-Poisson regression and generates the plots used to illustrate it.


In [1]:
import pandas as pd
import glob
from os import path
import numpy as np
from __future__ import division 
import sqlalchemy as sqla
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels
from sklearn import linear_model
import matplotlib
import rpy2.robjects as robjects
#import rpy2.robjects.packages as rpackages
from rpy2.robjects import pandas2ri

In [2]:
#%matplotlib qt
%matplotlib inline

In [3]:
engine = sqla.create_engine('postgresql://postgres:postgres@localhost:5432/TaxiData',echo=False)

sqlmycols={'MOGE001':'total_commuters',\
           'MOGE011':'car_commuters',\
          'MRUE001':'pc_income',\
          'totalpopulation':'population',\
          'MOGM001':'me_total_commuters',\
          'MOGM011':'me_car_commuters',\
          'MRUM001':'me_pc_income',\
          'time_dif_derived_approxcount':'approxcount',\
          'time_dif_derived_approxcount_percerror':'perc_error_approxcount',\
           'time_dif_derived_approxcount_error':'error_approxcount',\
           'twentythirteen_full_count':'count',\
           'fipscodes':'fipscodes',\
           'perc_white':'white_ratio'
          }
columnstring=', '.join(['"%s" AS %s' % (i,j) for i,j in sqlmycols.iteritems()])

In [4]:
full=pd.read_sql_query('SELECT %s FROM lotsofdata WHERE totalpopulation>=1000' % (columnstring) ,engine).set_index('fipscodes')

In [5]:
full['driver_ratio']=full['car_commuters']/full['total_commuters']
full['approxcount_pc']=full['approxcount']/full['population']
full['count_pc']=full['count']/full['population']

In [6]:
borocods=pd.read_sql_table('testborocodedict',engine)
borocods.drop('county_name_test',inplace=True,axis=1)
borocods.set_index('code',inplace=True)
borocods


Out[6]:
int name county_name
code
005 2 Bronx Bronx
047 3 Brooklyn Kings
061 1 Manhattan New York
081 4 Queens Queens
085 5 Staten Island Richmond

Using this table, we can identify the NYC borough of each census tract from the FIPS code that is used for the index, as part of the FIPS code consists of a county code that corresponds to the borough.

int is an integer code that NYC assigns to boroughs.


In [7]:
full['borocode']=borocods.loc[full.index.to_series().str[2:5].values]['int'].values

I'm eliminating Staten Island from my data.

No python module has quasi-Poisson regression, but it will be convenient to have the Poisson estimates, as the behavior of the mean is identical.


In [8]:
psub=full[(full['borocode']!=5)]
#psub.drop(outliers,axis=0,inplace=True)
model=statsmodels.discrete.discrete_model.Poisson((psub['count']),\
                                                  sm.add_constant(np.log(psub['pc_income']),prepend=False),\
                                                 exposure=psub['population'].values)
fitp=model.fit(start_params=[2.2948,-21.5754])
fitp.summary()


Optimization terminated successfully.
         Current function value: 50432.844615
         Iterations 4
Out[8]:
Poisson Regression Results
Dep. Variable: count No. Observations: 1979
Model: Poisson Df Residuals: 1977
Method: MLE Df Model: 1
Date: Fri, 04 Mar 2016 Pseudo R-squ.: 0.6460
Time: 17:21:35 Log-Likelihood: -9.9807e+07
converged: True LL-Null: -2.8196e+08
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
pc_income 2.1338 0.000 1.75e+04 0.000 2.134 2.134
const -19.7852 0.001 -1.45e+04 0.000 -19.788 -19.783

We'll be relying on R for the quasi-Poisson modeling.


In [9]:
R=robjects.r
pandas2ri.activate()

In [10]:
r_model=R.glm('count~1+log(pc_income)', family=R.quasipoisson(),data=psub,offset=np.log(psub.population))

In [11]:
deviance=R.summary(r_model).rx2('deviance')[0]
'%.4g' % deviance


Out[11]:
'1.996e+08'

In [12]:
dispersion=R.summary(r_model).rx2('dispersion')[0]

In [13]:
'%.4g' % dispersion


Out[13]:
'5.176e+05'

In [14]:
deviance/dispersion


Out[14]:
385.57723365839183

In [15]:
SIsub=full[(full['borocode']==5)]

In [16]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
#       r'\usepackage{siunitx}',   
#       r'\sisetup{detect-all}',   
#       r'\usepackage{helvet}',    
#       r'\usepackage{sansmath}',  
#       r'\sansmath'               
#]  })
comd=0
buff=100

matplotlib.rc('legend', fontsize=10)

border=0.2
boro=1

#asub=full[(full.drivercommuterratio<=border)]
#bsub=full[(full.drivercommuterratio>border)]
XX=np.linspace(psub.pc_income.min(),psub.pc_income.max())

scatter_kwargs = {"zorder":100}
error_kwargs = {"lw":.5, "zorder":0}

a=6
gr=(1+np.sqrt(5))/2
plt.figure(figsize=[a*gr,a])
plt.autoscale(tight=True)
plt.xlabel('Per-capita income')
plt.ylabel('2013 yellow cab dropoffs per-capita')
plt.title("Yellow cab dropoffs vs income of census tract")
colorlist=plt.cm.gist_rainbow(np.linspace(0,2.8/4,4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int']<5]['int'].values:
    #print(i)
    isub=psub[psub.borocode==i]
    a=plt.scatter(isub.pc_income,isub.count_pc,alpha=0.5,label=borocods[borocods['int']==i]['name'].values[0],c=colorlist[i-1],**scatter_kwargs)
    #plt.errorbar(isub.pc_income,isub.approxcount_pc,yerr=nintypercentile*isub.error_approxcount/isub.population,xerr=isub.me_pc_income,fmt=None,marker=None,mew=0,ecolor='black',alpha=0.5,label='',**error_kwargs)
#plt.scatter(bsub['pc_income'],bsub['approxcount_pc'],facecolors='none',edgecolors='black',alpha=0.1,label='driver commuter ratio $>$ '+str(round(border,1)))
#plt.scatter(full.loc[outliers|highleverage]['pc_income'],full.loc[outliers|highleverage]['approxcount_pc'],alpha=0.25,label=r'Outliers & high leverage',color='black')
plt.scatter(SIsub.pc_income,SIsub.count_pc,facecolors='none',edgecolors='black',alpha=0.1,label=borocods[borocods['int']==5]['name'].values[0])
#plt.scatter(outliers.pc_income,outliers.count_pc, facecolors='black',edgecolors='black',alpha=0.2, label='Outliers')
#plt.scatter(subset.loc[highleverage]['pc_income'],subset.loc[highleverage]['count_pc'],alpha=0.3,label='High leverage',color='orange')
plt.plot(XX,(XX**fitp.params['pc_income'])*np.exp(fitp.params['const']),color='black',alpha=0.8,lw=2, label='Fit, pseudo-$\mathsf{R}^\mathsf{2}=$ '+str(round(fitp.prsquared,3)) )
#cbar=plt.colorbar(a, ticks=borocods[borocods['int']<5]['int'].tolist())
#cbar.set_label("boro")


plt.yscale('log')
plt.xscale('log')
plt.legend(loc='lower right')
plt.savefig('2013count_dropoffs_vs_income_low_car_poisson.png',dpi=500)
plt.show()


Distribution of Anscombe residuals


In [17]:
psub['prediction']=pandas2ri.ri2py(r_model.rx2('fitted.values'))

psub['anscombe_resid']=(3/2)*(psub['count']**(2/3)-psub['prediction']**(2/3))/(psub['prediction']**(1/6))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [18]:
sm.qqplot(psub['anscombe_resid'],line='s')
plt.ylabel('Anscombe residual quantiles')
plt.savefig('2013quasi-poisson_QQ.png',dpi=500)
plt.show()



In [19]:
plt.scatter(psub['pc_income'],psub['anscombe_resid'])
plt.xscale('log')
plt.show()


Standardized residuals


In [20]:
psub['standardized_resid']=pandas2ri.ri2py(R.rstandard(r_model))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [21]:
psub['standardized_resid'].hist(bins=50)


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14e74c6150>

In [22]:
plt.scatter(psub['pc_income'],psub['standardized_resid'])
plt.xscale('log')
#plt.yscale('log')



In [23]:
sm.qqplot(psub['standardized_resid'],line='s')
#sm.qqplot(psub['anscombe_resid'],line='s')
plt.show()


Studentized Residuals


In [24]:
psub['studentized_resid']=pandas2ri.ri2py(R.rstudent(r_model,type='dev'))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [25]:
psub['studentized_resid'].hist(bins=50)
plt.show()



In [26]:
outliers=psub[np.abs(psub['studentized_resid'])>3]

In [27]:
outliers.count()


Out[27]:
me_car_commuters          37
me_total_commuters        37
population                37
pc_income                 37
approxcount               37
car_commuters             37
total_commuters           37
count                     37
perc_error_approxcount    37
white_ratio               37
error_approxcount         37
me_pc_income              37
driver_ratio              37
approxcount_pc            37
count_pc                  37
borocode                  37
prediction                37
anscombe_resid            37
standardized_resid        37
studentized_resid         37
dtype: int64

Probably not enough to be significant in Poisson regression.

Exploring other variables

Note that none of this has yet made it into documentation


In [28]:
psub['manhattan_brooklyn']=0
psub['manhattan_brooklyn'][(psub['borocode']==1)|(psub['borocode']==3)]=1
psub['manhattan']=0
psub['manhattan'][(psub['borocode']==1)]=1


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py:4059: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py:3066: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [29]:
psub[['manhattan','manhattan_brooklyn']].hist()


Out[29]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f14eae2d490>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f14eaeb13d0>]], dtype=object)

In [30]:
candidates=['driver_ratio','white_ratio','borocode','manhattan','manhattan_brooklyn','population']

In [31]:
psub[psub['manhattan']==1].shape


Out[31]:
(270, 22)
Plot them

In [32]:
for i in candidates:
    a=6
    gr=(1+np.sqrt(5))/2
    plt.figure(figsize=[a*gr,a])
    plt.autoscale(tight=True)
    plt.xlabel(i)
    plt.ylabel('2013 yellow cab dropoffs per-capita')
    plt.scatter(psub[i],psub['count_pc'],alpha=0.5)
    plt.yscale('log')
    plt.show()


None of them look that great on their own, let's see how they compare in the regression.

Note that the slight pattern in the population plot may indicate that the assumption that per-capita drop-offs is the right dependent variable isn't perfect.


In [33]:
for i in candidates:
    r_tempmodel=R.glm('count~1+log(pc_income)+%s' % (i), family=R.quasipoisson(),data=psub,offset=np.log(psub.population))
    devdiff=deviance - R.summary(r_tempmodel).rx2('deviance')[0]
    dispratio=R.summary(r_tempmodel).rx2('dispersion')[0]/ dispersion
    print('Adding %s lowers deviance by %.2g and lowers dispersion by a factor of %.2f' % (i,devdiff,dispratio))


Adding driver_ratio lowers deviance by 7.4e+07 and lowers dispersion by a factor of 0.36
Adding white_ratio lowers deviance by 1.1e+06 and lowers dispersion by a factor of 1.01
Adding borocode lowers deviance by 5.5e+07 and lowers dispersion by a factor of 0.45
Adding manhattan lowers deviance by 6.5e+07 and lowers dispersion by a factor of 0.39
Adding manhattan_brooklyn lowers deviance by 2.8e+07 and lowers dispersion by a factor of 0.68
Adding population lowers deviance by 4.5e+06 and lowers dispersion by a factor of 0.82

Using the ratio of commuters that drive seems to be obviously superior, though the Manhattan catigorical is pretty good too. That could be because the driver ratio is incredibly low in Manhattan.


In [34]:
twosub=psub[['count_pc','count','driver_ratio','pc_income','population','borocode']]
twosub.dropna(inplace=True)
twosub['log_pc_income']=np.log(twosub['pc_income'])


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [35]:
model2=statsmodels.discrete.discrete_model.Poisson((twosub['count']),\
                                                  sm.add_constant(twosub[['log_pc_income','driver_ratio']],prepend=False),\
                                                 exposure=twosub['population'].values)
fit2=model2.fit(start_params=[2.2948,-21.5754,0])
fit2.summary()


Optimization terminated successfully.
         Current function value: 31767.194997
         Iterations 25
Out[35]:
Poisson Regression Results
Dep. Variable: count No. Observations: 1977
Model: Poisson Df Residuals: 1974
Method: MLE Df Model: 2
Date: Fri, 04 Mar 2016 Pseudo R-squ.: 0.7771
Time: 17:21:53 Log-Likelihood: -6.2804e+07
converged: True LL-Null: -2.8173e+08
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
log_pc_income 1.4955 0.000 1.06e+04 0.000 1.495 1.496
driver_ratio -10.1842 0.002 -6268.049 0.000 -10.187 -10.181
const -11.4356 0.002 -6945.485 0.000 -11.439 -11.432

In [36]:
r_model2=R.glm('count~1+log(pc_income)+driver_ratio', family=R.quasipoisson(),data=psub,offset=np.log(psub.population))

In [37]:
print(R.summary(r_model2).rx2('coefficients'))


                 Estimate Std. Error   t value      Pr(>|t|)
(Intercept)    -11.435575 0.71071660 -16.09020  7.854395e-55
log(pc_income)   1.495488 0.06097012  24.52821 3.422860e-116
driver_ratio   -10.184185 0.70135018 -14.52083  1.809339e-45


In [38]:
twodeviance=R.summary(r_model2).rx2('deviance')[0]
print('%.4g'% twodeviance)


1.256e+08

In [39]:
print( '%.4g' % R.summary(r_model2).rx2('dispersion')[0])


1.863e+05

In [40]:
twosub['combined_var']=fit2.params['log_pc_income']*twosub['log_pc_income']+fit2.params['driver_ratio']*twosub['driver_ratio']+np.log(twosub['population'])


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [41]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
#       r'\usepackage{siunitx}',   
#       r'\sisetup{detect-all}',   
#       r'\usepackage{helvet}',    
#       r'\usepackage{sansmath}',  
#       r'\sansmath'               
#]  })
comd=0
buff=100

matplotlib.rc('legend', fontsize=10)

border=0.2
boro=1

#asub=full[(full.drivercommuterratio<=border)]
#bsub=full[(full.drivercommuterratio>border)]
XX=np.linspace(twosub.combined_var.min(),twosub.combined_var.max())

scatter_kwargs = {"zorder":100}
error_kwargs = {"lw":.5, "zorder":0}

a=6
gr=(1+np.sqrt(5))/2
plt.figure(figsize=[a*gr,a])
plt.autoscale(tight=True)
plt.xlabel('Plane of fit')
plt.ylabel('2013 yellow cab dropoffs')
plt.title("Yellow cab dropoffs vs income of census tract")
colorlist=plt.cm.gist_rainbow(np.linspace(0,2.8/4,4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int']<5]['int'].values:
    isub=twosub[twosub.borocode==i]
    a=plt.scatter(isub['combined_var'],isub['count'],alpha=0.5,label=borocods[borocods['int']==i]['name'].values[0],c=colorlist[i-1],**scatter_kwargs)
plt.plot(XX,np.exp(XX+fit2.params['const']),color='black',alpha=0.8,lw=2, label='Fit, pseudo-$\mathsf{R}^\mathsf{2}=$ '+str(round(fit2.prsquared,3)) )
plt.yscale('log')
plt.legend(loc='lower right')
plt.show()



In [43]:
twosub['prediction']=pandas2ri.ri2py(r_model2.rx2('fitted.values'))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [45]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
#       r'\usepackage{siunitx}',   
#       r'\sisetup{detect-all}',   
#       r'\usepackage{helvet}',    
#       r'\usepackage{sansmath}',  
#       r'\sansmath'               
#]  })
comd=0
buff=100

matplotlib.rc('legend', fontsize=10)

border=0.2
boro=1

#asub=full[(full.drivercommuterratio<=border)]
#bsub=full[(full.drivercommuterratio>border)]
XX=np.linspace(twosub.combined_var.min(),twosub.combined_var.max())

scatter_kwargs = {"zorder":100}
error_kwargs = {"lw":.5, "zorder":0}

a=6
gr=(1+np.sqrt(5))/2
plt.figure(figsize=[a*gr,a])
plt.autoscale(tight=True)
plt.xlabel('Per-capita income')
plt.ylabel('2013 yellow cab dropoffs per-capita')
plt.title("Yellow cab dropoffs vs income of census tract")
colorlist=plt.cm.gist_rainbow(np.linspace(0,2.8/4,4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int']<5]['int'].values:
    isub=twosub[twosub.borocode==i]
    a=plt.scatter(isub['pc_income'],isub['count_pc'],alpha=0.9,label=borocods[borocods['int']==i]['name'].values[0],c=colorlist[i-1],**error_kwargs)
#plt.plot(XX,np.exp(XX+fit2.params['const']),color='black',alpha=0.8,lw=2, label='Fit, pseudo-$\mathsf{R}^\mathsf{2}=$ '+str(round(fit2.prsquared,3)) )
plt.scatter(twosub['pc_income'],twosub['prediction']/twosub['population'],alpha=0.3,color='black',label='Predicted dropoffs',**scatter_kwargs)
plt.yscale('log')
plt.xscale('log')
plt.legend(loc='lower right')
plt.show()



In [46]:
twosub['anscombe_resid']=(3/2)*(twosub['count']**(2/3)-twosub['prediction']**(2/3))/(twosub['prediction']**(1/6))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [47]:
twosub['anscombe_resid'].hist(bins=50)
plt.show()



In [48]:
sm.qqplot(twosub['anscombe_resid'],line='s')
plt.ylabel('Anscombe residual quantiles')
plt.show()



In [49]:
plt.scatter(twosub['pc_income'],twosub['anscombe_resid'])
plt.xscale('log')
plt.show()



In [50]:
twosub['standardized_resid']=pandas2ri.ri2py(R.rstandard(r_model2, type='dev'))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [51]:
twosub['studentized_resid']=pandas2ri.ri2py(R.rstudent(r_model2, type='dev'))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [52]:
sm.qqplot(twosub['studentized_resid'],line='s')
plt.ylabel('Studentized deviance residual quantiles')
plt.show()



In [53]:
plt.scatter(twosub['population'],twosub['standardized_resid'])
plt.xscale('log')
plt.show()



In [54]:
(np.abs(twosub['studentized_resid'])>3).sum()


Out[54]:
36

In [55]:
outliers=twosub[(np.abs(twosub['studentized_resid'])>3)]

In [56]:
psub['old_s_resid']=pandas2ri.ri2py(R.rstudent(r_model, type='dev'))


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [57]:
oldouts=outliers=psub[(np.abs(psub['old_s_resid'])>3)]

In [58]:
plt.scatter(psub['pc_income'],psub['count_pc'])
plt.scatter(oldouts['pc_income'],oldouts['count_pc'],color='red')
plt.xscale('log')
plt.yscale('log')


Writing residuals to SQL database


In [60]:
#s=sqla.text('ALTER TABLE lotsofdata ADD COLUMN driver_income_anscombe_resid double precision')
#conn=engine.connect()
#conn.execute(s)
#s=sqla.text('ALTER TABLE lotsofdata ADD COLUMN driver_income_standard_dev_resid double precision')
#conn.execute(s)

In [ ]:
#metadata=sqla.MetaData()
#borotest=sqla.Table('lotsofdata',metadata,autoload=True,autoload_with=engine)

In [ ]:
#(getattr(borotest.c,'driver_income_anscombe_resid'))

In [ ]:
#(getattr(borotest.c,'driver_income_standard_dev_resid'))

In [ ]:
#bindparams={'driver_income_anscombe_resid':sqla.bindparam('anscombe_resid'),\
#            'driver_income_standard_dev_resid':sqla.bindparam('standardized_resid')}

In [ ]:
#smt=borotest.update().\
#where(borotest.c.fipscodes==sqla.bindparam('a_code')).\
#values(bindparams)

In [ ]:
#twosub['a_code']=twosub.index.to_series()

In [ ]:
#dlist=twosub[["a_code",'anscombe_resid','standardized_resid']].to_dict(orient='records')
#conn.execute(smt,dlist)

In [ ]: