In [1]:
import requests
import json
from pprint import pprint
from collections import defaultdict
from datastreams import *
from dictstreams import *
# Read the Socrata app token from disk. The context manager closes the file
# handle as soon as the token is read instead of leaving it open.
with open('/home/stuart/Projects/datastream/res/spd_socrata.key') as keyfile:
    apptoken = keyfile.read().strip()
# URL template: the two `{}` slots are filled, in order, with the page size
# ($limit) and the starting row ($offset).
api_query = 'https://data.seattle.gov/resource/3k2p-39jp.json?$limit={}&$offset={}&$$app_token=' + apptoken
# Number of rows requested per API call.
batchsize = 1000
def fetch_police_stats():
    """Yield every record returned by the `api_query` endpoint, page by page.

    Pages of `batchsize` rows are requested at increasing offsets until a
    page comes back with fewer than `batchsize` rows, which marks the end of
    the data.

    Yields:
        One decoded JSON record at a time, in the order the API returns them.

    Raises:
        requests.HTTPError: if the API answers with an error status, so an
            error body is never parsed and yielded as if it were data.
    """
    offset = 0
    while True:
        response = requests.get(api_query.format(batchsize, offset))
        response.raise_for_status()
        results = json.loads(response.content)
        for result in results:
            yield result
        # Compare against batchsize (not a hard-coded 999) so pagination keeps
        # working when the page size is changed.
        if len(results) < batchsize:
            break
        offset += batchsize
In [2]:
# Wrap the generator returned by fetch_police_stats() in a DictStream.
spddata = DictStream(fetch_police_stats())
In [3]:
# Take the first 5000 records from the stream and collect them into `data`
# (presumably materialising them in memory -- confirm against datastreams).
data = spddata.take(5000).collect()
In [4]:
# Peek at a single record to inspect the fields each row carries.
data.take(1)
Out[4]:
In [5]:
from datetime import datetime
class Location(object):
    """Coordinates of an incident, tagged with its general offense number.

    All three constructor arguments are stored unchanged as attributes of the
    same name.
    """

    def __init__(self, general_offense_number, latitude, longitude):
        self.general_offense_number = general_offense_number
        self.latitude, self.longitude = latitude, longitude

    def __repr__(self):
        # vars(self) is the instance attribute dict, shown in full for debugging.
        attributes = vars(self)
        return '<Location {}>'.format(attributes)
class Event(object):
    """A police event clearance record.

    Args:
        general_offense_number: Identifier of the incident, stored unchanged.
        clearance_group: Clearance group label, stored unchanged.
        clearance_description: Clearance description, stored unchanged and
            returned by getdesc().
        clearance_time: Timestamp string shaped like ``2015-03-04T05:06:07``,
            optionally followed by fractional seconds (``.000``). It is
            parsed into a ``datetime`` and stored as ``clearance_time``.

    Raises:
        ValueError: if ``clearance_time`` matches neither timestamp layout.
    """

    def __init__(self, general_offense_number, clearance_group, clearance_description, clearance_time):
        self.general_offense_number = general_offense_number
        self.clearance_group = clearance_group
        self.clearance_description = clearance_description
        try:
            self.clearance_time = datetime.strptime(clearance_time, "%Y-%m-%dT%H:%M:%S")
        except ValueError:
            # Fall back to the layout with fractional seconds so timestamps
            # such as 2015-03-04T05:06:07.000 are accepted too.
            self.clearance_time = datetime.strptime(clearance_time, "%Y-%m-%dT%H:%M:%S.%f")

    def getdesc(self):
        """Return the clearance description given at construction."""
        return self.clearance_description

    def __repr__(self):
        return '<Event {}>'.format(self.__dict__)
# Map each raw row to a Location holding only its offense number and coordinates.
locations = DataSet(data.map(lambda row: Location(row['general_offense_number'], row['latitude'], row['longitude'])))
# Map each raw row to an Event; the row's `event_clearance_*` fields feed the
# constructor's `clearance_*` arguments, and `event_clearance_date` is parsed
# into a datetime by Event.
events = DataSet(data.map(lambda row: Event(row['general_offense_number'],
row['event_clearance_group'],
row['event_clearance_description'],
row['event_clearance_date'])))
# Peek at a single Event to check the mapping.
events.take(1)
Out[5]:
In [16]:
# Join events to locations on their shared general_offense_number.
eventlocs = events.join(locations, 'general_offense_number')
# Show the first joined record; iter() is the idiomatic way to obtain the
# iterator rather than calling __iter__ directly.
next(iter(eventlocs))
Out[16]:
In [8]:
# First five joined records whose clearance group mentions "disturbance"
# (case-insensitive match).
# NOTE(review): Event stores this field as `clearance_group`; confirm the
# joined object really exposes `event_clearance_group`, otherwise this lookup
# raises AttributeError.
eventlocs.filter(lambda eventloc: 'disturbance' in eventloc.event_clearance_group.lower()).take(5)
Out[8]:
In [6]:
# Tally records per event_clearance_group, restricted to rows whose
# hundred_block_location mentions '17th'. The reducer adds one per record,
# starting from 0.
(
    DictStream(data.filter(lambda row: '17th' in row['hundred_block_location'].lower()))
    .groupby('event_clearance_group', lambda count, event: count + 1, 0)
)
Out[6]:
In [19]:
# Back-of-the-envelope size of the collected data in characters: the number of
# rows times the string length of the first row (assumes rows are of similar
# length).
len(data) * len(str(data[0]))
Out[19]:
In [20]:
Out[20]:
In [ ]: