This performs the quasi-Poisson regression and generates the plots used to illustrate it.
In [1]:
from __future__ import division  # Py2: make / mean true division (future imports must come first)

import glob
from os import path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2.robjects as robjects
#import rpy2.robjects.packages as rpackages
from rpy2.robjects import pandas2ri
import sqlalchemy as sqla
import statsmodels
import statsmodels.api as sm
from sklearn import linear_model
In [2]:
#%matplotlib qt
# Render matplotlib figures inline in the notebook (qt backend kept above as an alternative).
%matplotlib inline
In [3]:
engine = sqla.create_engine('postgresql://postgres:postgres@localhost:5432/TaxiData',echo=False)
# Maps raw database column names (census/NHGIS codes plus derived taxi-count
# columns) to the readable aliases used throughout the analysis.
# NOTE(review): the 'me_' columns are presumably margins of error for the
# corresponding estimates -- confirm against the NHGIS codebook.
sqlmycols = {
    'MOGE001': 'total_commuters',
    'MOGE011': 'car_commuters',
    'MRUE001': 'pc_income',
    'totalpopulation': 'population',
    'MOGM001': 'me_total_commuters',
    'MOGM011': 'me_car_commuters',
    'MRUM001': 'me_pc_income',
    'time_dif_derived_approxcount': 'approxcount',
    'time_dif_derived_approxcount_percerror': 'perc_error_approxcount',
    'time_dif_derived_approxcount_error': 'error_approxcount',
    'twentythirteen_full_count': 'count',
    'fipscodes': 'fipscodes',
    'perc_white': 'white_ratio',
}
# Build the '"RAW" AS alias' SELECT list.  Uses .items() rather than the
# Python-2-only .iteritems() so this runs under both Python 2 and 3.
columnstring = ', '.join('"%s" AS %s' % (raw, alias) for raw, alias in sqlmycols.items())
In [4]:
full=pd.read_sql_query('SELECT %s FROM lotsofdata WHERE totalpopulation>=1000' % (columnstring) ,engine).set_index('fipscodes')
In [5]:
# Derived per-tract quantities: share of commuters who drive, and dropoff
# counts per resident (approximate count and exact 2013 count).
full['driver_ratio']=full['car_commuters']/full['total_commuters']
full['approxcount_pc']=full['approxcount']/full['population']
full['count_pc']=full['count']/full['population']
In [6]:
# Borough lookup table: keyed by the 3-digit county code embedded in each
# FIPS code, carrying the borough name and NYC's integer borough code.
# Built as a single chain rather than step-wise in-place mutation.
borocods = (
    pd.read_sql_table('testborocodedict', engine)
      .drop('county_name_test', axis=1)
      .set_index('code')
)
borocods
Out[6]:
Using this table, we can identify the NYC borough of each census tract from the FIPS code that is used for the index, as part of the FIPS code consists of a county code that corresponds to the borough.
The `int` column holds the integer code that NYC assigns to each borough.
In [7]:
full['borocode']=borocods.loc[full.index.to_series().str[2:5].values]['int'].values
I'm eliminating Staten Island from my data.
No Python module implements quasi-Poisson regression, but it is still convenient to compute the plain Poisson estimates, since the two models share the same mean behavior.
In [8]:
# Modeling sample: everything except Staten Island (borocode 5).
psub=full[(full['borocode']!=5)]
#psub.drop(outliers,axis=0,inplace=True)
# Poisson regression of dropoff counts on log per-capita income, with tract
# population as the exposure, so the mean model is effectively per-capita.
model=statsmodels.discrete.discrete_model.Poisson((psub['count']),\
sm.add_constant(np.log(psub['pc_income']),prepend=False),\
exposure=psub['population'].values)
# start_params seed the optimizer (slope on log income, then intercept) --
# presumably taken from an earlier converged run; TODO confirm.
fitp=model.fit(start_params=[2.2948,-21.5754])
fitp.summary()
Out[8]:
We'll be relying on R for the quasi-Poisson modeling.
In [9]:
# Handle to the embedded R session; activate automatic pandas <-> R
# data.frame conversion for subsequent rpy2 calls.
R=robjects.r
pandas2ri.activate()
In [10]:
r_model=R.glm('count~1+log(pc_income)', family=R.quasipoisson(),data=psub,offset=np.log(psub.population))
In [11]:
# Residual deviance of the quasi-Poisson fit (rx2 extracts a named element
# from the R summary object).
deviance=R.summary(r_model).rx2('deviance')[0]
'%.4g' % deviance
Out[11]:
In [12]:
# Dispersion estimate; quasi-Poisson assumes Var(y) = dispersion * mean.
dispersion=R.summary(r_model).rx2('dispersion')[0]
In [13]:
'%.4g' % dispersion
Out[13]:
In [14]:
# Deviance rescaled by the dispersion estimate.
deviance/dispersion
Out[14]:
In [15]:
SIsub=full[(full['borocode']==5)]
In [16]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
# r'\usepackage{siunitx}',
# r'\sisetup{detect-all}',
# r'\usepackage{helvet}',
# r'\usepackage{sansmath}',
# r'\sansmath'
#] })
# Scatter of per-capita dropoffs vs per-capita income, coloured by borough,
# with the Poisson fit overlaid on log-log axes.
# (Removed unused locals comd/buff/border/boro/error_kwargs, and stopped
# reusing `a` for both the figure height and the scatter artist handle.)
matplotlib.rc('legend', fontsize=10)

# Evaluate the fit curve across the observed income range.
XX = np.linspace(psub.pc_income.min(), psub.pc_income.max())
scatter_kwargs = {"zorder": 100}   # draw the data points above everything else

# Figure sized to the golden ratio.
fig_h = 6
golden = (1 + np.sqrt(5)) / 2
plt.figure(figsize=[fig_h * golden, fig_h])
plt.autoscale(tight=True)
plt.xlabel('Per-capita income')
plt.ylabel('2013 yellow cab dropoffs per-capita')
plt.title("Yellow cab dropoffs vs income of census tract")

# One colour per modeled borough (codes 1-4); reversed rainbow segment.
colorlist = plt.cm.gist_rainbow(np.linspace(0, 2.8/4, 4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int'] < 5]['int'].values:
    isub = psub[psub.borocode == i]
    plt.scatter(isub.pc_income, isub.count_pc, alpha=0.5,
                label=borocods[borocods['int'] == i]['name'].values[0],
                c=colorlist[i - 1], **scatter_kwargs)
# Staten Island: context only, not part of the regression sample.
plt.scatter(SIsub.pc_income, SIsub.count_pc, facecolors='none', edgecolors='black',
            alpha=0.1, label=borocods[borocods['int'] == 5]['name'].values[0])
# On these axes the fitted mean is count_pc = exp(const) * income**beta.
plt.plot(XX, (XX**fitp.params['pc_income']) * np.exp(fitp.params['const']),
         color='black', alpha=0.8, lw=2,
         label=r'Fit, pseudo-$\mathsf{R}^\mathsf{2}=$ ' + str(round(fitp.prsquared, 3)))
plt.yscale('log')
plt.xscale('log')
plt.legend(loc='lower right')
plt.savefig('2013count_dropoffs_vs_income_low_car_poisson.png', dpi=500)
plt.show()
In [17]:
# Fitted means pulled back from the R quasi-Poisson model.
psub['prediction']=pandas2ri.ri2py(r_model.rx2('fitted.values'))
# Anscombe residuals for a Poisson-family model:
# (3/2)*(y^(2/3) - mu^(2/3)) / mu^(1/6); approximately normal if the model fits.
psub['anscombe_resid']=(3/2)*(psub['count']**(2/3)-psub['prediction']**(2/3))/(psub['prediction']**(1/6))
In [18]:
# QQ-plot of the Anscombe residuals against a standardized normal line.
sm.qqplot(psub['anscombe_resid'],line='s')
plt.ylabel('Anscombe residual quantiles')
plt.savefig('2013quasi-poisson_QQ.png',dpi=500)
plt.show()
In [19]:
# Residuals vs income: look for structure the model leaves unexplained.
plt.scatter(psub['pc_income'],psub['anscombe_resid'])
plt.xscale('log')
plt.show()
In [20]:
# Standardized residuals computed by R for the income-only model.
psub['standardized_resid']=pandas2ri.ri2py(R.rstandard(r_model))
In [21]:
psub['standardized_resid'].hist(bins=50)
Out[21]:
In [22]:
# Standardized residuals vs income.
plt.scatter(psub['pc_income'],psub['standardized_resid'])
plt.xscale('log')
#plt.yscale('log')
In [23]:
# QQ-plot of the standardized residuals.
sm.qqplot(psub['standardized_resid'],line='s')
#sm.qqplot(psub['anscombe_resid'],line='s')
plt.show()
In [24]:
# Studentized (leave-one-out) deviance residuals.
psub['studentized_resid']=pandas2ri.ri2py(R.rstudent(r_model,type='dev'))
In [25]:
psub['studentized_resid'].hist(bins=50)
plt.show()
In [26]:
# Tracts whose studentized residual exceeds 3 in magnitude.
outliers=psub[np.abs(psub['studentized_resid'])>3]
In [27]:
outliers.count()
Out[27]:
This is probably too few outliers to be of concern for the Poisson regression.
In [28]:
# Indicator columns for borough membership.  Uses the .loc setter instead of
# the original chained indexing (df['col'][mask] = 1), which triggers
# SettingWithCopyWarning and can silently fail to write through.
psub['manhattan_brooklyn'] = 0
psub.loc[(psub['borocode'] == 1) | (psub['borocode'] == 3), 'manhattan_brooklyn'] = 1
psub['manhattan'] = 0
psub.loc[psub['borocode'] == 1, 'manhattan'] = 1
In [29]:
psub[['manhattan','manhattan_brooklyn']].hist()
Out[29]:
In [30]:
# Candidate covariates to try adding to the income-only model.
candidates=['driver_ratio','white_ratio','borocode','manhattan','manhattan_brooklyn','population']
In [31]:
# How many Manhattan tracts are in the sample.
psub[psub['manhattan']==1].shape
Out[31]:
In [32]:
# One scatter of per-capita dropoffs against each candidate covariate.
# The figure-size constants are loop-invariant, so compute them once.
fig_h = 6
golden = (1 + np.sqrt(5)) / 2
for cand in candidates:
    plt.figure(figsize=[fig_h * golden, fig_h])
    plt.autoscale(tight=True)
    plt.xlabel(cand)
    plt.ylabel('2013 yellow cab dropoffs per-capita')
    plt.scatter(psub[cand], psub['count_pc'], alpha=0.5)
    plt.yscale('log')
    plt.show()
None of them look that great on their own, let's see how they compare in the regression.
Note that the slight pattern in the population plot may indicate that the assumption that per-capita drop-offs is the right dependent variable isn't perfect.
In [33]:
# Refit the quasi-Poisson model with each candidate covariate added, then
# compare deviance and dispersion against the income-only baseline.
# Note: dispratio is new/old, so values below 1 mean the dispersion dropped.
for i in candidates:
r_tempmodel=R.glm('count~1+log(pc_income)+%s' % (i), family=R.quasipoisson(),data=psub,offset=np.log(psub.population))
devdiff=deviance - R.summary(r_tempmodel).rx2('deviance')[0]
dispratio=R.summary(r_tempmodel).rx2('dispersion')[0]/ dispersion
print('Adding %s lowers deviance by %.2g and lowers dispersion by a factor of %.2f' % (i,devdiff,dispratio))
Using the ratio of commuters who drive seems clearly superior, though the Manhattan categorical variable is pretty good too. That could be because the driver ratio is extremely low in Manhattan.
In [34]:
# Working frame for the two-covariate model.  Chain .dropna() onto the column
# selection instead of the original dropna(inplace=True) on a derived slice,
# which risks SettingWithCopy issues; the result is identical.
twosub = psub[['count_pc','count','driver_ratio','pc_income','population','borocode']].dropna()
twosub['log_pc_income'] = np.log(twosub['pc_income'])
In [35]:
# Plain Poisson fit with driver_ratio added; used for its coefficient
# estimates (the quasi-Poisson mean model is identical).
model2=statsmodels.discrete.discrete_model.Poisson((twosub['count']),\
sm.add_constant(twosub[['log_pc_income','driver_ratio']],prepend=False),\
exposure=twosub['population'].values)
# NOTE(review): with prepend=False the columns are [log_pc_income,
# driver_ratio, const], so this seed maps -21.5754 to driver_ratio and 0 to
# the constant -- possibly transposed from the one-covariate fit, but it is
# only a starting point for the optimizer.
fit2=model2.fit(start_params=[2.2948,-21.5754,0])
fit2.summary()
Out[35]:
In [36]:
# The same two-covariate model in R for quasi-Poisson inference.
# NOTE(review): this is fit on psub, not twosub -- presumably R's default
# na.action drops the same NA rows that twosub.dropna() removed; confirm.
r_model2=R.glm('count~1+log(pc_income)+driver_ratio', family=R.quasipoisson(),data=psub,offset=np.log(psub.population))
In [37]:
print(R.summary(r_model2).rx2('coefficients'))
In [38]:
# Residual deviance of the two-covariate model.
twodeviance=R.summary(r_model2).rx2('deviance')[0]
print('%.4g'% twodeviance)
In [39]:
# Dispersion estimate of the two-covariate model.
print( '%.4g' % R.summary(r_model2).rx2('dispersion')[0])
In [40]:
twosub['combined_var']=fit2.params['log_pc_income']*twosub['log_pc_income']+fit2.params['driver_ratio']*twosub['driver_ratio']+np.log(twosub['population'])
In [41]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
# r'\usepackage{siunitx}',
# r'\sisetup{detect-all}',
# r'\usepackage{helvet}',
# r'\usepackage{sansmath}',
# r'\sansmath'
#] })
# Dropoff counts against the combined linear predictor, coloured by borough,
# with the fitted exponential curve overlaid.
# (Removed unused locals comd/buff/border/boro/error_kwargs and the unused
# scatter-handle assignment to `a`.)
matplotlib.rc('legend', fontsize=10)

# Evaluate the fit across the range of the combined predictor.
XX = np.linspace(twosub.combined_var.min(), twosub.combined_var.max())
scatter_kwargs = {"zorder": 100}  # data points on top

fig_h = 6
golden = (1 + np.sqrt(5)) / 2
plt.figure(figsize=[fig_h * golden, fig_h])
plt.autoscale(tight=True)
plt.xlabel('Plane of fit')
plt.ylabel('2013 yellow cab dropoffs')
plt.title("Yellow cab dropoffs vs income of census tract")
colorlist = plt.cm.gist_rainbow(np.linspace(0, 2.8/4, 4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int'] < 5]['int'].values:
    isub = twosub[twosub.borocode == i]
    plt.scatter(isub['combined_var'], isub['count'], alpha=0.5,
                label=borocods[borocods['int'] == i]['name'].values[0],
                c=colorlist[i - 1], **scatter_kwargs)
# Along the combined predictor the fitted counts are exp(x + const).
plt.plot(XX, np.exp(XX + fit2.params['const']), color='black', alpha=0.8, lw=2,
         label=r'Fit, pseudo-$\mathsf{R}^\mathsf{2}=$ ' + str(round(fit2.prsquared, 3)))
plt.yscale('log')
plt.legend(loc='lower right')
plt.show()
In [43]:
twosub['prediction']=pandas2ri.ri2py(r_model2.rx2('fitted.values'))
In [45]:
# Plotting the data:
#plt.rc('text', **{'usetex':True,'latex.preamble':[
# r'\usepackage{siunitx}',
# r'\sisetup{detect-all}',
# r'\usepackage{helvet}',
# r'\usepackage{sansmath}',
# r'\sansmath'
#] })
# Observed per-capita dropoffs by borough with the model's fitted per-capita
# values overlaid in black.
# (Removed unused locals comd/buff/border/boro and the unused XX linspace;
# error_kwargs is kept because the borough scatter actually uses it.)
matplotlib.rc('legend', fontsize=10)

scatter_kwargs = {"zorder": 100}        # predictions drawn on top
error_kwargs = {"lw": .5, "zorder": 0}  # observed points drawn underneath

fig_h = 6
golden = (1 + np.sqrt(5)) / 2
plt.figure(figsize=[fig_h * golden, fig_h])
plt.autoscale(tight=True)
plt.xlabel('Per-capita income')
plt.ylabel('2013 yellow cab dropoffs per-capita')
plt.title("Yellow cab dropoffs vs income of census tract")
colorlist = plt.cm.gist_rainbow(np.linspace(0, 2.8/4, 4)).tolist()
colorlist.reverse()
for i in borocods[borocods['int'] < 5]['int'].values:
    isub = twosub[twosub.borocode == i]
    plt.scatter(isub['pc_income'], isub['count_pc'], alpha=0.9,
                label=borocods[borocods['int'] == i]['name'].values[0],
                c=colorlist[i - 1], **error_kwargs)
# Overlay the model's fitted per-capita dropoffs.
plt.scatter(twosub['pc_income'], twosub['prediction'] / twosub['population'],
            alpha=0.3, color='black', label='Predicted dropoffs', **scatter_kwargs)
plt.yscale('log')
plt.xscale('log')
plt.legend(loc='lower right')
plt.show()
In [46]:
# Anscombe residuals for the two-covariate model (same formula as before).
twosub['anscombe_resid']=(3/2)*(twosub['count']**(2/3)-twosub['prediction']**(2/3))/(twosub['prediction']**(1/6))
In [47]:
twosub['anscombe_resid'].hist(bins=50)
plt.show()
In [48]:
# QQ-plot of Anscombe residuals against the normal reference.
sm.qqplot(twosub['anscombe_resid'],line='s')
plt.ylabel('Anscombe residual quantiles')
plt.show()
In [49]:
# Residuals vs income.
plt.scatter(twosub['pc_income'],twosub['anscombe_resid'])
plt.xscale('log')
plt.show()
In [50]:
# Standardized deviance residuals from R.
twosub['standardized_resid']=pandas2ri.ri2py(R.rstandard(r_model2, type='dev'))
In [51]:
# Studentized (leave-one-out) deviance residuals from R.
twosub['studentized_resid']=pandas2ri.ri2py(R.rstudent(r_model2, type='dev'))
In [52]:
sm.qqplot(twosub['studentized_resid'],line='s')
plt.ylabel('Studentized deviance residual quantiles')
plt.show()
In [53]:
# Residual structure against population size.
plt.scatter(twosub['population'],twosub['standardized_resid'])
plt.xscale('log')
plt.show()
In [54]:
# Count of tracts with |studentized residual| > 3.
(np.abs(twosub['studentized_resid'])>3).sum()
Out[54]:
In [55]:
# Outlier tracts under the two-covariate model.
outliers=twosub[(np.abs(twosub['studentized_resid'])>3)]
In [56]:
psub['old_s_resid']=pandas2ri.ri2py(R.rstudent(r_model, type='dev'))
In [57]:
oldouts=outliers=psub[(np.abs(psub['old_s_resid'])>3)]
In [58]:
# Raw income/dropoff scatter with the income-only model's outliers in red.
plt.scatter(psub['pc_income'],psub['count_pc'])
plt.scatter(oldouts['pc_income'],oldouts['count_pc'],color='red')
plt.xscale('log')
plt.yscale('log')
In [60]:
#s=sqla.text('ALTER TABLE lotsofdata ADD COLUMN driver_income_anscombe_resid double precision')
#conn=engine.connect()
#conn.execute(s)
#s=sqla.text('ALTER TABLE lotsofdata ADD COLUMN driver_income_standard_dev_resid double precision')
#conn.execute(s)
In [ ]:
#metadata=sqla.MetaData()
#borotest=sqla.Table('lotsofdata',metadata,autoload=True,autoload_with=engine)
In [ ]:
#(getattr(borotest.c,'driver_income_anscombe_resid'))
In [ ]:
#(getattr(borotest.c,'driver_income_standard_dev_resid'))
In [ ]:
#bindparams={'driver_income_anscombe_resid':sqla.bindparam('anscombe_resid'),\
# 'driver_income_standard_dev_resid':sqla.bindparam('standardized_resid')}
In [ ]:
#smt=borotest.update().\
#where(borotest.c.fipscodes==sqla.bindparam('a_code')).\
#values(bindparams)
In [ ]:
#twosub['a_code']=twosub.index.to_series()
In [ ]:
#dlist=twosub[["a_code",'anscombe_resid','standardized_resid']].to_dict(orient='records')
#conn.execute(smt,dlist)
In [ ]: