In [58]:
import findspark
import numpy as np
import os
import re
import subprocess
import shapefile
findspark.init()
import pyspark
sc = pyspark.SparkContext()  # local Spark context for this session
sqlContext = pyspark.sql.SQLContext(sc)
In [59]:
output_shapefile = "../CagisOpenDataQuarterly/neighborhood.shp"
if not os.path.exists(output_shapefile):
    # Re-project the raw CAGIS neighborhood boundaries to WGS84 (EPSG:4326)
    sys_command = ('ogr2ogr ' + output_shapefile + ' ' +
                   '"../CagisOpenDataQuarterly/cc neighbndy.shp" -t_srs EPSG:4326')
    process = subprocess.Popen(sys_command,
                               shell=True,
                               stdout=subprocess.PIPE)
    process.wait()
    print(process.returncode)
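If the conversion warrants a sanity check, the .prj sidecar written by ogr2ogr should describe a WGS84 geographic coordinate system. This is a minimal sketch; the exact WKT text varies by GDAL version, so it only looks for telltale substrings.
In [ ]:
# Optional sanity check on the re-projected shapefile: the .prj written by
# ogr2ogr should mention WGS84 / EPSG:4326. Substring test only, since the
# full WKT differs across GDAL versions.
prj_file = output_shapefile.replace('.shp', '.prj')
if os.path.exists(prj_file):
    with open(prj_file) as f:
        wkt = f.read()
    print('WGS' in wkt or '4326' in wkt)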
In [60]:
def init_neighborhoods(readerobj):
    """ Initializes a dictionary that stores a description of
        City of Cincinnati neighborhoods
    Args:
        readerobj: shapefile module Reader class object handle
    Returns:
        neighborhood: Dictionary that stores a description of
            City of Cincinnati neighborhoods"""
    shapes = readerobj.shapes()
    # Field names, lowercased with underscores removed (fields[0] is the
    # DeletionFlag field, so skip it)
    fieldnames = [re.sub('_', '', elem[0].lower())
                  for elem in readerobj.fields[1:]]
    neighborhood = {}
    for idx in range(readerobj.numRecords):
        row_dict = dict(zip(fieldnames, readerobj.record(idx)))
        # bbox is stored as [xmin, ymin, xmax, ymax]; the centroid is
        # approximated as the center of the bounding box
        row_dict['boundingbox'] = np.array(shapes[idx].bbox)
        row_dict['centroid'] = [np.mean(row_dict['boundingbox'][0:4:2]),
                                np.mean(row_dict['boundingbox'][1:4:2])]
        # Normalize the neighborhood name: lowercase, drop hyphens/whitespace
        cur_neighborhood = row_dict.pop('neigh').lower()
        cur_neighborhood = re.sub(r'[-\s]+', '', cur_neighborhood)
        neighborhood[cur_neighborhood] = row_dict
    return neighborhood
In [61]:
readerobj = shapefile.Reader(output_shapefile)
neighborhood = init_neighborhoods(readerobj)
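To confirm the structure, list a few of the normalized keys and print the bounding box and centroid stored for one of them:
In [ ]:
# Peek at the dictionary: keys are lowercased names with hyphens and
# whitespace removed; each value holds the record fields plus the
# bounding box and centroid computed above.
print(sorted(neighborhood.keys())[:5])
sample_key = sorted(neighborhood.keys())[0]
print(neighborhood[sample_key]['boundingbox'])
print(neighborhood[sample_key]['centroid'])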
In [98]:
from pyspark.mllib.linalg import Vectors

neighborhood_centroid = []
for key in neighborhood:
    neighborhood_centroid.append(Vectors.dense(neighborhood[key]['centroid']))
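These dense vectors are presumably destined for an MLlib routine. As a minimal sketch of that hand-off (the two-cluster KMeans below is an illustrative assumption, not part of the original analysis), they can be parallelized into an RDD and clustered:
In [ ]:
# Minimal sketch: ship the centroid vectors to Spark and cluster them.
# KMeans with k=2 is an assumed placeholder for whatever MLlib routine
# actually consumes these vectors.
from pyspark.mllib.clustering import KMeans

centroid_rdd = sc.parallelize(neighborhood_centroid)
model = KMeans.train(centroid_rdd, k=2, maxIterations=10)
print(model.clusterCenters)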