In [1]:
import requests
import json
from pprint import pprint
from collections import defaultdict
from datastreams import *
from dictstreams import *

# Read the Socrata app token from the local key file.  The context manager
# closes the file handle as soon as the token has been read.
with open('/home/stuart/Projects/datastream/res/spd_socrata.key') as keyfile:
    apptoken = keyfile.read().strip()

# URL template for the API: the two `{}` slots receive the page size ($limit)
# and the starting row ($offset); the app token is appended once, up front.
api_query = 'https://data.seattle.gov/resource/3k2p-39jp.json?$limit={}&$offset={}&$$app_token=' + apptoken
# Number of rows requested per API call.
batchsize = 1000

def fetch_police_stats():
    """Yield API records one at a time, requesting them page by page.

    Each request formats `api_query` with `batchsize` and the current offset;
    the offset advances by `batchsize` after every full page.  Iteration ends
    after the first page that holds fewer than `batchsize` rows.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    offset = 0
    while True:
        response = requests.get(api_query.format(batchsize, offset))
        # Fail loudly on an HTTP error rather than iterating an error payload.
        response.raise_for_status()
        results = json.loads(response.content)
        for result in results:
            yield result
        # A short page means there is nothing left to fetch.  Compare against
        # `batchsize` (not a literal 999) so the page size can be changed in
        # one place without breaking the paging logic.
        if len(results) < batchsize:
            break
        offset += batchsize

In [2]:
# Wrap the generator of API records in a DictStream for the stream-style
# calls used in the cells below.
spddata = DictStream(fetch_police_stats())

In [3]:
# Take 5000 records from the stream and collect them into `data`, which the
# following cells operate on.
# NOTE(review): assumes take(n) caps the stream at n items - confirm in datastreams.
data = spddata.take(5000).collect()

In [4]:
# Inspect a single record to see which fields each row carries.
data.take(1)


Out[4]:
[{u'hundred_block_location': u'3XX BLOCK OF PINE ST', u'district_sector': u'M', u'event_clearance_code': u'242', u'cad_cdw_id': u'\ufeff15736', u'event_clearance_date': u'2010-07-17T20:49:00', u'event_clearance_description': u'FIGHT DISTURBANCE', u'zone_beat': u'M2', u'event_clearance_subgroup': u'DISTURBANCES', u'longitude': u'-122.338146748', u'cad_event_number': u'10000246357', u'incident_location': {u'latitude': u'47.610975163', u'needs_recoding': False, u'longitude': u'-122.338146748'}, u'census_tract': u'8100.2001', u'latitude': u'47.610975163', u'general_offense_number': u'2010246357', u'event_clearance_group': u'DISTURBANCES'}]

In [5]:
from datetime import datetime

class Location(object):
    """Latitude/longitude pair tagged with a general offense number."""

    def __init__(self, general_offense_number, latitude, longitude):
        # All three values are stored exactly as given; nothing is converted.
        self.general_offense_number = general_offense_number
        self.latitude = latitude
        self.longitude = longitude

    def __repr__(self):
        """Return a debug string that lists every instance attribute."""
        attributes = vars(self)
        return '<Location {}>'.format(attributes)
        
class Event(object):
    """An event record whose clearance timestamp is parsed at construction.

    `clearance_time` must be a string of the form ``YYYY-MM-DDTHH:MM:SS``;
    `datetime.strptime` raises ValueError for anything else.
    """

    def __init__(self, general_offense_number, clearance_group, clearance_description, clearance_time):
        # Parse first, then assign; the attribute order matches the parameters.
        parsed_time = datetime.strptime(clearance_time, "%Y-%m-%dT%H:%M:%S")
        self.general_offense_number = general_offense_number
        self.clearance_group = clearance_group
        self.clearance_description = clearance_description
        self.clearance_time = parsed_time

    def getdesc(self):
        """Return the stored clearance description."""
        description = self.clearance_description
        return description

    def __repr__(self):
        """Return a debug string that lists every instance attribute."""
        attributes = vars(self)
        return '<Event {}>'.format(attributes)

# Project every raw row onto the two record types defined above.  Both keep
# the row's general offense number.
locations = DataSet(data.map(
    lambda row: Location(general_offense_number=row['general_offense_number'],
                         latitude=row['latitude'],
                         longitude=row['longitude'])))
events = DataSet(data.map(
    lambda row: Event(general_offense_number=row['general_offense_number'],
                      clearance_group=row['event_clearance_group'],
                      clearance_description=row['event_clearance_description'],
                      clearance_time=row['event_clearance_date'])))
# Peek at one converted event.
events.take(1)


Out[5]:
[<Event {'clearance_description': u'FIGHT DISTURBANCE', 'clearance_group': u'DISTURBANCES', 'general_offense_number': u'2010246357', 'clearance_time': datetime.datetime(2010, 7, 17, 20, 49)}>]

In [16]:
# Join events to locations on the 'general_offense_number' attribute.
eventlocs = events.join(locations, 'general_offense_number')
# Display only the first joined record.
next(iter(eventlocs))


Out[16]:
<EventLocation {'right': <Location {'latitude': u'47.610975163', 'general_offense_number': u'2010246357', 'longitude': u'-122.338146748'}>, 'left': <Event {'clearance_description': u'FIGHT DISTURBANCE', 'clearance_group': u'DISTURBANCES', 'general_offense_number': u'2010246357', 'clearance_time': datetime.datetime(2010, 7, 17, 20, 49)}>}>

In [8]:
# Keep joined records whose clearance group contains "disturbance"
# (compared in lower case) and take five of them.
# NOTE(review): Event in this notebook stores the group as `clearance_group`;
# confirm that `event_clearance_group` actually resolves on the joined object.
eventlocs.filter(lambda eventloc: 'disturbance' in eventloc.event_clearance_group.lower()).take(5)


Out[8]:
[<datastreams.JoinedOjbect object at 0x7fd89f549990>, <datastreams.JoinedOjbect object at 0x7fd89f549c50>, <datastreams.JoinedOjbect object at 0x7fd89f549e50>, <datastreams.JoinedOjbect object at 0x7fd89f549e90>, <datastreams.JoinedOjbect object at 0x7fd89f549ed0>]

In [6]:
# Tally records per 'event_clearance_group', restricted to rows whose
# 'hundred_block_location' contains "17th" (compared in lower case).
# The reducer starts each group at 0 and adds 1 per record.
DictStream(
    data.filter(lambda row: '17th' in row['hundred_block_location'].lower())
).groupby('event_clearance_group', lambda count, event: count + 1, 0)


Out[6]:
{u'ACCIDENT INVESTIGATION': 28,
 u'ANIMAL COMPLAINTS': 1,
 u'ARREST': 7,
 u'ASSAULTS': 15,
 u'AUTO THEFTS': 11,
 u'BIKE': 1,
 u'BURGLARY': 30,
 u'CAR PROWL': 24,
 u'DISTURBANCES': 115,
 u'FAILURE TO REGISTER (SEX OFFENDER)': 1,
 u'FALSE ALARMS': 18,
 u'FRAUD CALLS': 17,
 u'HAZARDS': 2,
 u'LEWD CONDUCT': 2,
 u'LIQUOR VIOLATIONS': 16,
 u'MENTAL HEALTH': 22,
 u'NARCOTICS COMPLAINTS': 4,
 u'NUISANCE, MISCHIEF ': 6,
 u'OTHER PROPERTY': 20,
 u'PERSON DOWN/INJURY': 7,
 u'PERSONS - LOST, FOUND, MISSING': 6,
 u'PROPERTY - MISSING, FOUND': 4,
 u'PROPERTY DAMAGE': 16,
 u'PROSTITUTION': 1,
 u'PROWLER': 1,
 u'ROBBERY': 2,
 u'SHOPLIFTING': 1,
 u'SUSPICIOUS CIRCUMSTANCES': 105,
 u'THREATS, HARASSMENT': 11,
 u'TRAFFIC RELATED CALLS': 104,
 u'TRESPASS': 7,
 u'WEAPONS CALLS': 2}

In [19]:
# Multiply the record count by the character length of the first record's
# string form - presumably a rough estimate of the data's total text size.
len(data) * len(str(data[0]))


Out[19]:
64100000

In [20]:



Out[20]:
100000

In [ ]: