Initialize software environment

Initialize Spark Environment for Jupyter Notebook



In [58]:

    
import findspark
import numpy as np
import os
import re
import subprocess
import shapefile

# findspark.init() must run before `import pyspark`: it adds the local Spark
# installation's Python package to sys.path so the import below can succeed.
findspark.init()

import pyspark

# Reuse the already-active SparkContext when this cell is re-run; constructing
# a second SparkContext in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate()

sqlContext = pyspark.sql.SQLContext(sc)

Reproject the Cincinnati Area Geographic Information System (CAGIS) neighborhood shapefile (cc neighbndy.shp) to EPSG:4326



In [59]:

    
output_shapefile = "../CagisOpenDataQuarterly/neighborhood.shp"

# Skip the reprojection when the output already exists so the cell can be
# re-run without invoking ogr2ogr again.
if not os.path.exists(output_shapefile):
    # Pass the arguments as a list (no shell involved) so the space in the
    # input file name needs no manual quoting.
    sys_command = ['ogr2ogr',
                   output_shapefile,
                   '../CagisOpenDataQuarterly/cc neighbndy.shp',
                   '-t_srs', 'EPSG:4326']

    process = subprocess.Popen(sys_command,
                               stdout=subprocess.PIPE)
    # communicate() drains the stdout pipe while waiting for the child to
    # exit; wait() alone can deadlock once the child fills the pipe buffer.
    process.communicate()
    # Exit status of ogr2ogr (0 means success).
    print(process.returncode)

Compute Cincinnati neighborhood centroids

Read shapefile



In [60]:

    
def init_neighborhoods(readerobj):
    """Build a dictionary describing City of Cincinnati neighborhoods.

    Args:
        readerobj: shapefile module Reader object for the neighborhood
                   shapefile. Its attribute table must contain a field
                   whose lowercased name is 'neigh'.

    Returns:
        neighborhood: Dictionary keyed by normalized neighborhood name
                      (lowercased, with '-', '+' and whitespace removed).
                      Each value is a dictionary mapping the remaining
                      field names (lowercased, underscores removed) to
                      that record's values, plus two computed entries:
                        'boundingbox': numpy array built from the shape's
                                       bbox
                        'centroid': two-element list holding the mean of
                                    bbox elements 0 and 2 and the mean of
                                    bbox elements 1 and 3, i.e. the
                                    bounding-box midpoint for an
                                    [xmin, ymin, xmax, ymax] bbox

    Raises:
        KeyError: if the records have no 'neigh' field.
    """
    shapes = readerobj.shapes()

    # fields[0] is skipped so the names line up with the record values
    # (in pyshp it is the DeletionFlag entry, which records do not carry).
    fieldnames = [elem[0].lower().replace('_', '')
                  for elem in readerobj.fields[1:]]

    neighborhood = {}

    # Use the reader that was passed in rather than a notebook-level global,
    # so the function works on a fresh kernel and with any Reader object.
    for idx in range(readerobj.numRecords):
        row_dict = dict(zip(fieldnames, readerobj.record(idx)))

        bbox = np.array(shapes[idx].bbox)
        row_dict['boundingbox'] = bbox

        # Even-indexed entries are averaged for the first coordinate and
        # odd-indexed entries for the second.
        row_dict['centroid'] = [np.mean(bbox[0:4:2]),
                                np.mean(bbox[1:4:2])]

        # The neighborhood name becomes the dictionary key, so it is removed
        # from the per-neighborhood attributes and normalized.
        cur_neighborhood = row_dict.pop('neigh').lower()
        cur_neighborhood = re.sub(r'[-\s+]', '', cur_neighborhood)

        neighborhood[cur_neighborhood] = row_dict

    return neighborhood



In [61]:

    
# Open the shapefile at output_shapefile with the shapefile module's Reader.
readerobj = shapefile.Reader(output_shapefile)

# Build the per-neighborhood dictionary (field values, bounding box, centroid).
neighborhood = init_neighborhoods(readerobj)



In [98]:

    
# Vectors is defined in pyspark.mllib.linalg; import it from there instead of
# relying on pyspark.mllib.feature happening to re-import it.
from pyspark.mllib.linalg import DenseVector, Vectors

# One dense vector per neighborhood centroid, in the dictionary's iteration
# order (the neighborhood names are not kept alongside the vectors).
neighborhood_centroid = [Vectors.dense(neighborhood[key]['centroid'])
                         for key in neighborhood]



In [ ]: