In [2]:
# Display the SparkContext. `sc` is never created in this notebook, so it is
# presumably injected by the PySpark shell/kernel -- TODO confirm.
sc
Out[2]:
In [3]:
import pyproj
import csv
import shapely.geometry as geom
import fiona
import fiona.crs
import shapely
import rtree
import geopandas as gpd
import numpy as np
import operator
# just for display, not for calculation
import pandas as pd
In [4]:
# Load the small taxi sample and round-trip it through CSV: the CSV copy is
# what the csv.reader loop and sc.textFile() read later in this notebook.
taxi = pd.read_pickle('../why_yellow_taxi/Data/df_shuffle.pkl')
# Drop the 'Unnamed: 0' column (presumably a leftover saved index -- TODO confirm).
del taxi['Unnamed: 0']
taxi.to_csv('df_shuffle.csv', index=False)
# Re-read so the in-memory frame matches exactly what was written to disk.
taxi = pd.read_csv('./df_shuffle.csv')
# Print the header so the positional column indices used later can be checked.
print taxi.columns
taxi.head(2)
# Coordinate columns of interest: pickup_longitude, pickup_latitude,
# dropoff_longitude, dropoff_latitude.
Out[4]:
In [5]:
# Load the entrance buffer polygons. The file name suggests 100-foot buffers
# stored in EPSG:4269 (NAD83) -- the CRS displayed below should confirm that.
shapefile = '../why_yellow_taxi/Buffer/entr_buffer_100_feet_epsg4269_nad83/entr_buffer_100_feet_epsg4269_nad83.shp'
entr_buf = gpd.read_file(shapefile)
entr_buf.crs
Out[5]:
In [6]:
# Preview the first two entrance-buffer rows.
entr_buf.head(2)
Out[6]:
In [37]:
# Reproject the buffers to EPSG:2263 so they share a coordinate system with
# the projected taxi points, then attach to every entrance the list of lines
# found in its Route_1 .. Route_11 columns.
entr_buf = entr_buf.to_crs(fiona.crs.from_epsg(2263))
routes = ['Route_' + str(n) for n in range(1, 12)]


def _normalize_line(value):
    """Return a route value as a line label.

    Values that convert to an integer (e.g. 7.0) become their integer string
    ('7'); anything for which int() raises ValueError (e.g. 'A') is returned
    unchanged.
    """
    try:
        return str(int(value))
    except ValueError:
        return value


# Slice the route columns once instead of once per row, and read rows by
# position (.iloc) rather than through the deprecated .ix accessor.
route_table = entr_buf[routes]
entr2line = [
    [_normalize_line(value) for value in route_table.iloc[i].dropna().values]
    for i in range(len(route_table))
]
entr_buf['entr2line'] = entr2line
entr_buf.head(2)
Out[37]:
In [8]:
# Number of trips in the taxi sample.
len(taxi)
Out[8]:
In [9]:
# Number of entrance buffers loaded from the shapefile.
len(entr_buf)
Out[9]:
In [11]:
# Spatial index over the bounding boxes of the entrance buffers. Each box is
# registered under the buffer's position within entr_buf.geometry, which is
# the id later returned by index.intersection().
index = rtree.Rtree()
for position, buffer_shape in enumerate(entr_buf.geometry):
    index.insert(position, buffer_shape.bounds)
In [44]:
# Match each trip's pickup and drop-off point to an entrance buffer and tally:
#   pick_entr / drop_entr : matches per entrance id (key None = no buffer hit)
#   entr_pair             : trips per unordered pair of distinct entrances
#   entr_lines            : trips per set of lines shared by both entrances
#
# preserve_units=True keeps the CRS's own units instead of forcing metres, so
# the projected points use the same coordinates as the reprojected buffers.
proj = pyproj.Proj(init='epsg:2263', preserve_units=True)

# The R-tree ids were assigned with enumerate(), i.e. by position, so take
# positional snapshots once instead of doing a pandas label lookup per row.
entr_geoms = list(entr_buf.geometry)
entr_routes = [set(lines) for lines in entr_buf['entr2line']]


def _first_containing(point):
    """Return the id of the first candidate buffer that contains `point`.

    Candidates come from the R-tree bounding-box query; the first one whose
    polygon really contains the point wins (not necessarily the closest).
    Returns None when no buffer contains the point.
    """
    for candidate in index.intersection((point.x, point.y, point.x, point.y)):
        if entr_geoms[candidate].contains(point):
            return candidate
    return None


entr_pair = {}
pick_entr = {}
drop_entr = {}
entr_lines = {}
with open('./df_shuffle.csv', 'rb') as fi:
    reader = csv.reader(fi)
    # Echo the header row so the positional indices below can be checked.
    print(next(reader))
    for row in reader:
        # Columns 5/6 and 9/10 are presumably pickup and drop-off lon/lat --
        # TODO confirm against the header printed above.
        pickup_lon = float(row[5])
        dropoff_lon = float(row[9])
        # Skip trips whose pickup or drop-off longitude is exactly zero.
        if pickup_lon == 0 or dropoff_lon == 0:
            continue

        p = geom.Point(proj(pickup_lon, float(row[6])))
        d = geom.Point(proj(dropoff_lon, float(row[10])))

        p_match = _first_containing(p)
        d_match = _first_containing(d)
        pick_entr[p_match] = pick_entr.get(p_match, 0) + 1
        drop_entr[d_match] = drop_entr.get(d_match, 0) + 1

        # Compare against None explicitly: entrance id 0 is a valid match.
        if p_match is None or d_match is None or p_match == d_match:
            continue

        # Sort so that the same set of shared lines always yields the same key.
        dirct_lines = tuple(sorted(entr_routes[p_match] & entr_routes[d_match]))
        if dirct_lines:
            entr_lines[dirct_lines] = entr_lines.get(dirct_lines, 0) + 1

        # Unordered pair: smaller entrance id first.
        pair = (min(p_match, d_match), max(p_match, d_match))
        entr_pair[pair] = entr_pair.get(pair, 0) + 1
In [45]:
# Sanity checks on the matching tallies.
# Total number of trips in the sample.
print len(taxi)
# Distinct pickup keys (entrance ids, plus None if any pickup was unmatched).
print np.unique(pick_entr.keys()).shape
# Number of rows that were tallied for pickups.
print sum(pick_entr.values())
# Same two figures for drop-offs.
print np.unique(drop_entr.keys()).shape
print sum(drop_entr.values())
In [46]:
# Total number of trips counted in entr_pair.
print sum(entr_pair.values())
# Ten most frequent entrance pairs, highest count first.
sorted(entr_pair.items(), key=operator.itemgetter(1), reverse=True)[:10]
Out[46]:
In [47]:
# List the column names of the entrance-buffer table.
entr_buf.columns.values
Out[47]:
In [48]:
# Trip counts keyed by the tuple of lines shared by pickup and drop-off entrances.
entr_lines
Out[48]:
In [ ]:
In [50]:
def countLine(partID, records):
    """Tally, for one partition of the taxi CSV, trips whose pickup and
    drop-off entrances share at least one line.

    Written for ``RDD.mapPartitionsWithIndex``: the function is self-contained
    (it loads the entrance buffers and builds its own spatial index) and does
    not rely on notebook globals.

    Parameters
    ----------
    partID : int
        Index of the partition. Partition 0 is assumed to start with the CSV
        header line, which is skipped.
    records : iterator of str
        The raw CSV lines of the partition.

    Returns
    -------
    list of (tuple, int)
        ``(shared_lines, trip_count)`` pairs, where ``shared_lines`` is the
        sorted tuple of lines common to both matched entrances. Only trips
        matched to two different entrances with a non-empty intersection are
        counted.
    """
    # Imports are function-local, presumably so that they are resolved in the
    # process that actually runs the partition -- TODO confirm.
    import csv

    import fiona.crs
    import geopandas as gpd
    import pyproj
    import rtree
    import shapely.geometry as geom

    def _normalize_line(value):
        """Integer-like route values become e.g. '7'; others are kept as is."""
        try:
            return str(int(value))
        except ValueError:
            return value

    shapefile = '../why_yellow_taxi/Buffer/entr_buffer_100_feet_epsg4269_nad83/entr_buffer_100_feet_epsg4269_nad83.shp'
    entr_buf = gpd.read_file(shapefile).to_crs(fiona.crs.from_epsg(2263))

    # Lines found in the Route_1 .. Route_11 columns, one set per entrance.
    # The column slice is taken once and rows are read by position.
    routes = ['Route_' + str(n) for n in range(1, 12)]
    route_table = entr_buf[routes]
    entr_routes = [
        set(_normalize_line(value)
            for value in route_table.iloc[i].dropna().values)
        for i in range(len(route_table))
    ]

    # R-tree over the buffers' bounding boxes; ids are positions in entr_geoms.
    entr_geoms = list(entr_buf.geometry)
    index = rtree.Rtree()
    for position, buffer_shape in enumerate(entr_geoms):
        index.insert(position, buffer_shape.bounds)

    def _first_containing(point):
        """Id of the first candidate buffer containing `point`, else None."""
        box = (point.x, point.y, point.x, point.y)
        for candidate in index.intersection(box):
            if entr_geoms[candidate].contains(point):
                return candidate
        return None

    # preserve_units=True keeps the CRS's own units instead of forcing metres,
    # matching the coordinates of the reprojected buffers.
    proj = pyproj.Proj(init='epsg:2263', preserve_units=True)

    if partID == 0:
        # Skip the header; the default keeps an empty partition from raising.
        next(records, None)

    # Only the shared-line tally is returned, so nothing else is tracked here.
    entr_lines = {}
    for row in csv.reader(records):
        # Columns 5/6 and 9/10 are presumably pickup and drop-off lon/lat --
        # TODO confirm against the CSV header.
        pickup_lon = float(row[5])
        dropoff_lon = float(row[9])
        # Skip trips whose pickup or drop-off longitude is exactly zero.
        if pickup_lon == 0 or dropoff_lon == 0:
            continue

        p_match = _first_containing(
            geom.Point(proj(pickup_lon, float(row[6]))))
        d_match = _first_containing(
            geom.Point(proj(dropoff_lon, float(row[10]))))

        # Compare against None explicitly: entrance id 0 is a valid match.
        if p_match is None or d_match is None or p_match == d_match:
            continue

        # Sort so that the same set of shared lines always yields the same key.
        dirct_lines = tuple(sorted(entr_routes[p_match] & entr_routes[d_match]))
        if dirct_lines:
            entr_lines[dirct_lines] = entr_lines.get(dirct_lines, 0) + 1

    return entr_lines.items()
In [70]:
def mapper(record):
    """Fan a ``(lines, count)`` record out into one ``(line, count)`` pair per line.

    ``record[0]`` is an iterable of line labels and ``record[1]`` the count
    attached to the whole group; every label is emitted with that same count.
    """
    for line_label in iter(record[0]):
        emitted = (line_label, record[1])
        yield emitted
# Distributed version of the tally: each partition is processed by countLine,
# every (lines, count) result is split into per-line pairs by mapper, and the
# counts are summed per line before being collected to the driver.
rdd = sc.textFile('./df_shuffle.csv')
counts = rdd.mapPartitionsWithIndex(countLine).flatMap(mapper).reduceByKey(lambda x,y: x+y).collect()
In [71]:
counts
Out[71]: