In [ ]:
import pandas as pd
import numpy as np

In [ ]:
# url is http://www.foodandwaterwatch.org/sites/default/files/water_rate_survey-location_0.pdf
# downloaded pdf, converted to csv, cleaned up

In [ ]:
# create directory
path = 'data/pricing/pricing.csv'
df = pd.read_csv(path)
df

In [ ]:
def get_lat_lng(address):
    # 1) Set-up service url:
    import urllib
    import json

    # Google URL
    serviceurl = "https://maps.googleapis.com/maps/api/geocode/json?"
    api_key = 'your_api_key'

    # 2) Encode location data into 'url-speak' to be added to the end of the service url
    url = serviceurl + urllib.urlencode({'sensor':'false', 'address': address}) +'&key=' + api_key

    # 3) Open the socket to the url
    socket = urllib.urlopen(url)

    # 4) Read th data in as a string.
    data_string = socket.read()

    # 5) Convert it from a JSON string to a dictionary
    data_dict = json.loads(data_string)

    return data_dict["results"][0]["geometry"]["location"]

In [ ]:
def df_to_list(dataframe, col1, col2):
    # extract columns from dataframe
    l_extract = dataframe[[col1, col2]].values.tolist()
    l_extract2 = []
    
    # concatenate each nested element into a comma-delimited string, append to new list
    for elements in l_extract:
        text = elements[0] + ', ' + elements[1]
        l_extract2.append(text)
    return l_extract2

In [ ]:
def geo_list(place_state_list):
    return_list = []
    for place_state in place_state_list:
        try:
            geo_dict = get_lat_lng(place_state)
            split = place_state.rpartition(',')
            loc = split[0]
            state = split[2].strip()
            tuple_output = (loc, state, geo_dict['lng'], geo_dict['lat'])
            return_list.append(tuple_output)
            print place_state, geo_dict['lng'], geo_dict['lat']
        except:
            tuple_output = (place_state, 'none', 'none')
            return_list.append(tuple_output)
            print 'no geocode'
    return return_list

In [ ]:
utility_locations = df_to_list(df, 'System', 'State')
utility_locations[:10]

In [ ]:
geocoded = geo_list(utility_locations)

In [ ]:
df_geo = pd.DataFrame(geocoded)
df_geo.head()

In [ ]:
# drop columns, rename remaining columns
df_geo.drop([0,1], axis=1, inplace=True)
df_geo.rename(columns={2:'lng', 3:'lat'}, inplace=True)
df_geo.head(3)

In [ ]:
# merge, rename cols, drop other cols
df_joined = df.join(df_geo, how='inner')
df_joined.rename(columns={'System':'system', 'Annual Bill':'bill', 'State':'state', 'Owner':'owner'}, inplace=True)
df_joined.drop(['Utility','System_Old'], axis=1, inplace=True)
df_joined.head(3)

In [ ]:
# filter out none lons and lats and take non-nans

df_joined['lng'] = pd.to_numeric(df_joined['lng'], errors='coerce')
df_joined['lat'] = pd.to_numeric(df_joined['lat'], errors='coerce')
df_joined = df_joined[np.isfinite(df_joined['lng'])]
print df_joined.dtypes, df_joined.shape

In [ ]:
df_joined.to_csv('data/pricing/geocoded.csv')
import pickle
with open('data/pricing/geocoded.pickle', 'wb') as f:
    pickle.dump(df_joined, f)