In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
In [12]:
# Load the four input tables:
#  - orig/dest: ArcGIS spatial-join output assigning each trip origin /
#    destination point to a census block
#  - od: the raw car2go origin-destination trip records
#  - block: the selected census-block polygons' attribute table
orig= pd.read_csv('OlonOlatIntersect_329547.csv')
dest= pd.read_csv('DlonDlatIntersect_329547.csv')
#orig= pd.read_csv('ForGengGIS/OwithBG.csv')
#dest= pd.read_csv('ForGengGIS/DwithBG.csv')
od = pd.read_csv('Car2goOD.csv')
block=pd.read_csv('SelectedBlock.csv')
# sanity check: orig/dest should have ~one row per od trip
print(orig.shape, dest.shape, od.shape, block.shape)
In [ ]:
In [ ]:
In [36]:
# Quick look at the OD trip table: column names and the last few rows.
print(od.columns)
od.tail()
Out[36]:
In [5]:
# Candidate key for od is id+otime or id+dtime; list any rows where
# (id, dtime) repeats to confirm uniqueness (an empty result means unique).
od[od[['id', 'dtime']].duplicated()]
Out[5]:
In [ ]:
In [ ]:
In [48]:
# Quick look at the census-block table: column names and the last few rows.
print(block.columns)
block.tail()
Out[48]:
In [ ]:
# block has 16773 rows
# the key is TRACTCE10+BLOCKCE10, or OBJECTID
In [54]:
# OBJECTID should be unique per block row (candidate primary key).
len(block.OBJECTID.unique())
Out[54]:
In [34]:
# Count of distinct (tract, block) pairs — should match the row count if
# TRACTCE10+BLOCKCE10 is also a valid composite key.
block.groupby(['TRACTCE10','BLOCKCE10']).count().OBJECTID.shape
Out[34]:
In [32]:
# Left panel: number of blocks per tract (each tract can have multiple blocks).
plt.figure(figsize=(15, 3))
plt.subplot(121)
plt.plot(block.groupby(['TRACTCE10']).count().OBJECTID, 'o-')
plt.xlim(0,20000)
# Right panel: how often each block number is reused across tracts.
plt.subplot(122)
plt.plot(block.groupby(['BLOCKCE10']).count().OBJECTID, 'o-')
plt.xlim(0,10000)
Out[32]:
In [ ]:
In [ ]:
In [11]:
# Inspect the origin spatial-join output (columns up to BLOCKCE10).
print(orig.columns)
orig.loc[:, :'BLOCKCE10'].tail()
In [ ]:
# OBJECTID is the column carried over from the block table.
# FID_OlonOl is the row index carried over from the od table.
# NOTE(review): unclear how OBJECTID_1 was generated — presumably added by
# ArcGIS during the merge; confirm before relying on it.
In [20]:
# How many distinct blocks do origin points fall into?
len(orig.OBJECTID.unique())
Out[20]:
In [70]:
orig.OBJECTID.describe()
Out[70]:
In [74]:
orig.FID_OlonOl.describe()
Out[74]:
In [ ]:
In [ ]:
In [ ]:
In [13]:
# Clean duplicates in orig and dest.
# We verified that ArcGIS can place a point twice in 2 regions if the point
# falls exactly on a boundary, so the spatial join can emit a source row twice.
# Note: we end with 329473 pts vs 329478 from od, because 5 out-of-bound
# points were thrown out.
print(orig.shape, dest.shape)
# keep='first' reproduces the original ~duplicated() filter exactly:
# keep the first occurrence of each source-row id, drop later boundary dupes.
orig = orig.drop_duplicates(subset='FID_OlonOl', keep='first')
dest = dest.drop_duplicates(subset='FID_DlonDl', keep='first')
print(orig.shape, dest.shape)
In [4]:
# add fid explicitly in od
od['fid']=od.index
od.head()
Out[4]:
In [5]:
keepcol = ['id', 'otime', 'dtime', 'fid' ]
od = od.drop([col for col in od.columns if col not in keepcol], axis=1)
od.head()
Out[5]:
In [ ]:
In [ ]:
In [6]:
orig.columns
Out[6]:
In [7]:
keepcol = ['id', 'otime', 'dtime', 'OBJECTID', 'TRACTCE10', 'BLOCKCE10']
# orig = orig.drop([col for col in orig.columns if col not in keepcol], axis=1)
# orig.head()
o=pd.DataFrame()
for col in keepcol:
o['o_'+ col] = orig[col]
o['fid'] = orig.FID_OlonOl
o.head()
Out[7]:
In [ ]:
In [ ]:
In [8]:
dest.columns
Out[8]:
In [9]:
keepcol = ['id', 'otime', 'dtime', 'OBJECTID', 'TRACTCE10', 'BLOCKCE10']
#dest = dest.drop([col for col in dest.columns if col not in keepcol], axis=1)
#dest.head()
d=pd.DataFrame()
for col in keepcol:
d['d_'+ col] = dest[col]
d['fid'] = dest.FID_DlonDl
d.head()
Out[9]:
In [ ]:
In [ ]:
In [10]:
# Spot-check the three tables side by side, aligned on fid.
d.sort_values('fid').head()
Out[10]:
In [11]:
o.sort_values('fid').head()
Out[11]:
In [12]:
od.head()
Out[12]:
In [ ]:
In [ ]:
In [36]:
# Now we have od, o and d; join them all on fid into one trip table.
ood = pd.merge(o, od, on='fid')
oodd =pd.merge(ood, d, on='fid')
oodd.head()
Out[36]:
In [37]:
oodd.columns
Out[37]:
In [38]:
# Sanity check: the id column carried through o must match od's id.
oodd.o_id.equals(oodd.id)
Out[38]:
In [39]:
# The o_/d_ copies of id/otime/dtime are redundant after the check above.
oodd = oodd.drop(['o_id', 'o_otime', 'o_dtime', 'd_id', 'd_otime','d_dtime'],axis=1)
In [40]:
oodd.head()
Out[40]:
In [41]:
len(oodd.d_OBJECTID.unique())
Out[41]:
In [43]:
# NOTE(review): this cell reloads a pickle written in an earlier session
# (the to_pickle line is commented out) — on a fresh run the file must
# already exist on disk, otherwise run the save line first.
#oodd.to_pickle('oodd.df')
oodd = pd.read_pickle('oodd.df')
In [53]:
oodd['ohour']= pd.DatetimeIndex(oodd.otime).hour
oodd['dhour']= pd.DatetimeIndex(oodd.dtime).hour
In [54]:
oodd.head()
Out[54]:
In [70]:
dmap=oodd.groupby(['dhour','d_OBJECTID']).fid.count()
omap=oodd.groupby(['ohour','o_OBJECTID']).fid.count()
In [93]:
dmap.unstack(level=0, fill_value=0).shape
Out[93]:
In [106]:
influxmap = dmap.unstack(level=0,fill_value=0)-omap.unstack(level=0, fill_value=0)
influxmap
Out[106]:
In [98]:
os=set(oodd.o_OBJECTID.unique())
ds=set(oodd.d_OBJECTID.unique())
In [105]:
len(ds |os )
Out[105]:
In [124]:
plt.plot(influxmap.ix[:,8])
Out[124]:
In [126]:
influxmap.ix[:,8].to_pickle('influxtest.df')
In [130]:
influxmap.to_pickle('influxmap.df')
In [ ]: