In [1]:
import pandas as pd
import sqlalchemy as sqla
import numpy as np
#import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
In [2]:
#%matplotlib qt
%matplotlib inline
In [3]:
engine = sqla.create_engine('postgresql://postgres:postgres@localhost:5432/TaxiData',echo=False)
In [4]:
columntypelist=pd.read_sql_query("SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'lotsofdata';", engine)
columntypelist.set_index('column_name',inplace=True)
columntypelist.groupby(['data_type'])['data_type'].count()
Out[4]:
In [5]:
# Keep numeric columns (plus the fipscodes index column), then drop any
# column names with embedded spaces.
numbercolumns = columntypelist[
    columntypelist['data_type'].isin(['double precision', 'bigint', 'integer'])
    | (columntypelist.index == 'fipscodes')
]['data_type']
numbercolumns.index = numbercolumns.index.str.strip()
numbercolumns = numbercolumns[numbercolumns.index.str.find(' ') == -1].index.tolist()
len(numbercolumns)
Out[5]:
In [6]:
# Postgres identifiers need double quotes; single quotes would make them string literals
columnstring = '"' + '", "'.join(numbercolumns) + '"'
In [7]:
full = pd.read_sql_query('SELECT ' + columnstring + ' FROM lotsofdata', engine).set_index('fipscodes')
# Restrict to areas with at least 1,000 residents
full = full[full['totalpopulation'] >= 1000]
In [8]:
Yname='twentythirteen_full_count_pc'
# Drop the other drop-off/count-derived columns, keeping only the target itself
dropoffitems = ['abridged2013ycdrpoffpc', 'counts', 'abridged2013ycdrpoff',
                'driver_income_standard_dev_resid', 'twentythirteen_full_count_pc',
                'time_dif_derived_approxcount_error', 'time_dif_derived_approxcount',
                'twentythirteen_full_count', 'driver_income_anscombe_resid']
full.drop([i for i in dropoffitems if i != Yname], axis=1, inplace=True)
In [9]:
# Treat infinities of either sign as missing so the dropna() calls remove them
full.replace([np.inf, -np.inf], np.nan, inplace=True)
full.dropna(axis='columns',how='all',inplace=True)
full.dropna(inplace=True)
In [10]:
# Drop duplicate rows, then use a transpose trick to drop duplicate columns
full.drop_duplicates(inplace=True)
full = full.T.drop_duplicates(keep='last').T
pcfull = full
#pcfull.replace({'totalpopulation':{0:np.nan}},inplace=True)
#pcfull.dropna().shape
# Grab the target before normalizing, then put everything on a per-capita basis
Y = pcfull[Yname]
pcfull = pcfull.divide(full['totalpopulation'], axis='index')
#These were already per-capita type data columns, or shouldn't be per-capita
#pcfull['nondrivercommuterrat']=((pcfull['MOGE001']-pcfull['MOGE011'])/pcfull['MOGE001'])
pcfull['MRUE001']=full['MRUE001']
pcfull['MRUM001']=full['MRUM001']
pcfull['totalpopulation']=full['totalpopulation']
pcfull['boro_int_code']=full['boro_int_code']
pcfull['nondrivercomrat']=full['nondrivercomrat']
Each census/ACS data column is identified by a seven-character letter/number code. The data comes with "codebooks", text files with descriptions of the data and what the codes mean. The function below grabs the one-line description that goes with each code, giving a nice, brief summary of what each code means.
In [11]:
import os.path
import subprocess

codebookpath = os.path.expanduser('~/Documents/TaxiTripData/TIGERFiles/nhgis_codebooks')

def searchcodebook(code, path=codebookpath):
    # grep the codebook text files for the first line mentioning the code
    command = 'grep -r -h -m 1 ' + code + ' ' + path + '*'
    try:
        grepstring = subprocess.check_output(command, shell=True).decode()
        # The description follows the code on the same line; the codebooks use
        # Windows line endings, so trim at the carriage return.
        return grepstring[grepstring.find(code) + len(code) + 1:grepstring.find('\r')].strip()
    except subprocess.CalledProcessError:
        # No match found: fall back to the raw code
        return code

codebookdict = pcfull.columns.to_series().apply(searchcodebook)
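As a quick spot check, calling the function on a single code should return its one-line description (the exact text depends on the NHGIS codebook files on disk; MRUE001 is the per-capita income code used later):

searchcodebook('MRUE001')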
In [12]:
codebookdict[codebookdict.str.find('capita') != -1]
Out[12]:
That check makes sure I didn't miss any per-capita data columns I didn't know about.
In [13]:
from sklearn import feature_selection

# Univariate F-test of each feature against the target
featureselect = feature_selection.SelectKBest(feature_selection.f_regression)
featureselect.fit(pcfull.drop(Yname, axis=1), Y)
pcfeaturescores = pd.Series(featureselect.pvalues_)
pcfeaturescores.index = pcfull.drop(Yname, axis=1).columns
pcfeaturescores.sort_values(ascending=True, inplace=True)
#codebookdict[pcfeaturescores.iloc[:10].index].tolist()
# Descriptions of the ten features with the smallest p-values
codebookdict[pcfeaturescores.iloc[:10].index]
Out[13]:
The fourth letter in each code indicates whether the code is an estimate of the value (E) or a margin of error for that estimate (M). Apparently the margin of error for per-capita income, MRUM001, correlates better with drop-offs per capita than the estimate itself does. Since drop-offs seem to have a pretty significant power relationship to income rather than a linear one, I'm not about to look into that too deeply. Also, the number of commuters who walk to work is apparently pretty predictive too.
However, it does look like the feature that I first had the instinct to look at, per-capita income, is relatively predictive.
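Since that argument leans on drop-offs scaling as a power of income, a straight-line fit in log-log space is a quick sanity check: a power law y = a·x^b becomes log y = log a + b·log x, so the fitted slope is the exponent. A minimal sketch using the statsmodels OLS API already imported above (the variable names here are my own; MRUE001 is the per-capita income estimate as before):

# Hedged sketch: regress log drop-offs per capita on log per-capita income.
# Under an assumed power law y = a * x**b, the fitted slope estimates b.
loglog = pd.DataFrame({'logIncome': np.log(pcfull['MRUE001']),
                       'logDropOffs': np.log(Y)})
loglog = loglog.replace([np.inf, -np.inf], np.nan).dropna()
powerfit = sm.OLS(loglog['logDropOffs'], sm.add_constant(loglog['logIncome'])).fit()
powerfit.params  # the 'logIncome' entry is the estimated exponent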
In [14]:
# Copy to avoid a SettingWithCopyWarning when adding the log columns below
correlm = pcfull[pcfeaturescores.iloc[:3].index.tolist() + [Yname, 'MOJE016']].copy()
#correlm['logWalked']=np.log(correlm['MOGE101'])
correlm['logIncome'] = np.log(correlm['MRUE001'])
correlm['logDropOffs'] = np.log(correlm[Yname])
corr_matrix = np.corrcoef(correlm.T)
sm.graphics.plot_corr(corr_matrix, xnames=correlm.columns.tolist(), cmap=plt.cm.get_cmap('viridis'))
plt.show()