In [ ]:
import pandas as pd
import numpy as np
In [ ]:
# url is http://www.foodandwaterwatch.org/sites/default/files/water_rate_survey-location_0.pdf
# downloaded pdf, converted to csv, cleaned up
In [ ]:
# create directory
path = 'data/pricing/pricing.csv'
df = pd.read_csv(path)
df
In [ ]:
def get_lat_lng(address):
# 1) Set-up service url:
import urllib
import json
# Google URL
serviceurl = "https://maps.googleapis.com/maps/api/geocode/json?"
api_key = 'your_api_key'
# 2) Encode location data into 'url-speak' to be added to the end of the service url
url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address}) +'&key=' + api_key
# 3) Open the socket to the url
socket = urllib.urlopen(url)
# 4) Read th data in as a string.
data_string = socket.read()
# 5) Convert it from a JSON string to a dictionary
data_dict = json.loads(data_string)
return data_dict["results"][0]["geometry"]["location"]
In [ ]:
def df_to_list(dataframe, col1, col2):
# extract columns from dataframe
l_extract = dataframe[[col1, col2]].values.tolist()
l_extract2 = []
# concatenate each nested element into a comma-delimited string, append to new list
for elements in l_extract:
text = elements[0] + ', ' + elements[1]
l_extract2.append(text)
return l_extract2
In [ ]:
def geo_list(place_state_list):
return_list = []
for place_state in place_state_list:
try:
geo_dict = get_lat_lng(place_state)
split = place_state.rpartition(',')
loc = split[0]
state = split[2].strip()
tuple_output = (loc, state, geo_dict['lng'], geo_dict['lat'])
return_list.append(tuple_output)
print place_state, geo_dict['lng'], geo_dict['lat']
except:
tuple_output = (place_state, 'none', 'none')
return_list.append(tuple_output)
print 'no geocode'
return return_list
In [ ]:
utility_locations = df_to_list(df, 'System', 'State')
utility_locations[:10]
In [ ]:
geocoded = geo_list(utility_locations)
In [ ]:
df_geo = pd.DataFrame(geocoded)
df_geo.head()
In [ ]:
# drop columns, rename remaining columns
df_geo.drop([0,1], axis=1, inplace=True)
df_geo.rename(columns={2:'lng', 3:'lat'}, inplace=True)
df_geo.head(3)
In [ ]:
# merge, rename cols, drop other cols
df_joined = df.join(df_geo, how='inner')
df_joined.rename(columns={'System':'system', 'Annual Bill':'bill', 'State':'state', 'Owner':'owner'}, inplace=True)
df_joined.drop(['Utility','System_Old'], axis=1, inplace=True)
df_joined.head(3)
In [ ]:
# filter out none lons and lats and take non-nans
df_joined['lng'] = pd.to_numeric(df_joined['lng'], errors='coerce')
df_joined['lat'] = pd.to_numeric(df_joined['lat'], errors='coerce')
df_joined = df_joined[np.isfinite(df_joined['lng'])]
print df_joined.dtypes, df_joined.shape
In [ ]:
df_joined.to_csv('data/pricing/geocoded.csv')
import pickle
with open('data/pricing/geocoded.pickle', 'wb') as f:
pickle.dump(df_joined, f)