In [1]:
import pandas as pd, requests, json
First download data from the city of Berkeley's API. You can use Socrata's $limit parameter to specify how many rows to grab (otherwise the default is 1,000 rows of data): https://dev.socrata.com/docs/paging.html
Example request: https://data.cityofberkeley.info/resource/k489-uv4i.json?$limit=5
In [2]:
# City of Berkeley 311 service-request data on the Socrata open-data portal.
# Socrata's $limit parameter controls how many rows the API returns (default is 1,000).
row_limit = 2000
endpoint_url = 'https://data.cityofberkeley.info/resource/k489-uv4i.json?$limit={}'.format(row_limit)
In [3]:
# fetch the URL and load the data
# timeout keeps an unresponsive server from hanging the notebook indefinitely;
# raise_for_status surfaces HTTP errors (404/500/...) instead of silently
# parsing an error payload as if it were data
response = requests.get(endpoint_url, timeout=30)
response.raise_for_status()
data = response.json()
Next, turn the JSON data into a dataframe and clean it up a bit: drop unnecessary columns and any rows that lack lat-long data. We want to make our JSON file as small as possible (preferably under 5 MB) so that it can be loaded over the Internet by anyone viewing your map, without taking forever to download a huge file.
In [4]:
# load the JSON records into a dataframe, then inspect its size and columns
df = pd.DataFrame(data)
print(f'We have {len(df)} rows')
str(list(df.columns))
Out[4]:
In [5]:
# preview the first five rows of the raw data
df.head(5)
Out[5]:
In [6]:
# convert lat-long to floats and change address from ALL CAPS to Regular Capitalization
# pd.to_numeric with errors='coerce' turns any malformed coordinate strings into
# NaN instead of raising, so bad values are removed by the later lat/long dropna
# step rather than crashing the notebook
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
df['street_address'] = df['street_address'].str.title()
In [7]:
# we don't need all those columns - only keep useful ones
useful_cols = ['issue_description', 'issue_type', 'latitude', 'longitude', 'street_address', 'ticket_status']
# .copy() makes df_subset an independent frame (not a view of df), so any
# later modification cannot trigger pandas' SettingWithCopyWarning
df_subset = df[useful_cols].copy()
In [8]:
# keep only the rows that have both latitude and longitude values
# (axis=0 and inplace=False are the defaults, so they are omitted)
df_geo = df_subset.dropna(subset=['latitude', 'longitude'])
print(f'We have {len(df_geo)} geotagged rows')
df_geo.tail()
Out[8]:
In [9]:
# how many calls of each issue type are in the geotagged data?
df_geo.issue_type.value_counts()
Out[9]:
Finally, convert each row in the dataframe to a GeoJSON-formatted feature and save the result as a file. The format is pretty simple and you can see it here: https://geojson.org/
In [10]:
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    """
    Convert a dataframe of point data into a GeoJSON-formatted python dictionary.

    df : the dataframe to convert to geojson
    properties : a list of columns in the dataframe to turn into geojson feature properties
    lat : the name of the column in the dataframe that contains latitude data
    lon : the name of the column in the dataframe that contains longitude data
    """
    features = []
    # each dataframe row becomes one GeoJSON Point feature
    for _, row in df.iterrows():
        # GeoJSON orders coordinates as [longitude, latitude]
        point = {'type': 'Point',
                 'coordinates': [row[lon], row[lat]]}
        # copy the requested columns into the feature's properties
        props = {column: row[column] for column in properties}
        features.append({'type': 'Feature',
                         'properties': props,
                         'geometry': point})
    # wrap all features in a single FeatureCollection
    return {'type': 'FeatureCollection', 'features': features}
In [11]:
useful_columns = ['street_address', 'issue_description', 'issue_type', 'ticket_status']
geojson_dict = df_to_geojson(df_geo, properties=useful_columns)
# compact separators (no indentation, no spaces after delimiters) minimize the
# serialized size -- the goal is a file small enough to load quickly over the web
geojson_str = json.dumps(geojson_dict, separators=(',', ':'))
In [12]:
# save the geojson result to a file, wrapped in a JS variable assignment
# so a web page can load it with a plain <script> tag
output_filename = 'dataset.js'
with open(output_filename, 'w') as output_file:
    output_file.write(f'var dataset = {geojson_str};')
# how many features did we save to the geojson file?
print(f"{len(geojson_dict['features'])} geotagged features saved to file")
In [ ]: