Database


In [1]:
%matplotlib inline
from IPython.display import display

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sqlalchemy import create_engine

SQL_ENGINE = create_engine('sqlite:///streetlight_cases.db')

In [3]:
DATA_STREETLIGHT_CASES_URL = 'https://data.sfgov.org/api/views/c53t-rr3f/rows.json?accessType=DOWNLOAD'
DATA_STREETLIGHT_CASES_LOCAL = 'DATA_STREETLIGHT_CASES.json'

# Prefer a previously downloaded copy if one exists, otherwise fetch from data.sfgov.org.
data_path = DATA_STREETLIGHT_CASES_URL
if os.path.isfile(DATA_STREETLIGHT_CASES_LOCAL):
    data_path = DATA_STREETLIGHT_CASES_LOCAL

In [4]:
import urllib, json

def _load_data(url):
    """Load a Socrata JSON export and return it as a DataFrame."""
    response = urllib.urlopen(url)
    raw_data = json.loads(response.read())
    # Column names are listed in the export's metadata; rows are positional lists.
    columns = [col['name'] for col in raw_data['meta']['view']['columns']]
    rows = raw_data['data']
    return pd.DataFrame(data=rows, columns=columns)
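
As a quick optional check, the raw export that _load_data parses can be inspected directly; the column metadata sits under meta.view.columns and each data row is a positional list. This cell is not part of the original pipeline and only uses names defined above.

In [ ]:
# Optional: peek at the raw Socrata export.
raw = json.loads(urllib.urlopen(data_path).read())
print [col['name'] for col in raw['meta']['view']['columns']][:5]
print raw['data'][0][:5]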

In [5]:
df = _load_data(data_path)

In [6]:
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

In [7]:
# Parse the open/close timestamps and derive simple calendar features from each.
df['opened'] = pd.to_datetime(df.opened)
df['opened_dayofweek'] = df.opened.dt.dayofweek
df['opened_month'] = df.opened.dt.month
df['opened_year'] = df.opened.dt.year
df['opened_dayofmonth'] = df.opened.dt.day
df['opened_weekend'] = df.opened_dayofweek >= 5

df['closed'] = pd.to_datetime(df.closed)
df['closed_dayofweek'] = df.closed.dt.dayofweek
df['closed_month'] = df.closed.dt.month
df['closed_year'] = df.closed.dt.year
df['closed_dayofmonth'] = df.closed.dt.day
df['closed_weekend'] = df.closed_dayofweek >= 5

In [8]:
# delta: days from open to close; target: whether the case was closed within two days.
df['delta'] = (df.closed - df.opened).dt.days
df['is_open'] = pd.isnull(df.closed)
df['target'] = df.delta <= 2
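
Before the cleanup below, a quick optional look at the label balance; this uses only the columns just defined and prints counts without changing anything.

In [ ]:
# Optional sanity check on the label.
print df.target.value_counts()
print df.is_open.sum(), 'cases have no close date yet'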

In [9]:
from geopy.distance import vincenty

# The Socrata 'point' field stores latitude and longitude as strings at positions 1 and 2.
df['latitude'] = df.point.apply(lambda e: float(e[1]))
df['longitude'] = df.point.apply(lambda e: float(e[2]))

In [10]:
min_lat, max_lat = min(df.latitude), max(df.latitude)
min_lng, max_lng = min(df.longitude), max(df.longitude)

def grid(lat, lng):
    """Project a point onto an x/y grid of miles measured from the
    south-west corner of the bounding box (Vincenty distances)."""
    x = vincenty((lat, min_lng), (lat, lng)).miles  # east-west offset
    y = vincenty((min_lat, lng), (lat, lng)).miles  # north-south offset
    return x, y

xy = [grid(lat, lng) for lat, lng in zip(df.latitude.values, df.longitude.values)]

df['loc_x'] = np.array(xy)[:, 0]
df['loc_y'] = np.array(xy)[:, 1]
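
Since grid measures each point's offset in miles from the south-west corner of the bounding box, that corner should project to (0, 0) and every offset should be non-negative. An optional check, using only names defined above:

In [ ]:
# Optional sanity check of the grid projection.
print grid(min_lat, min_lng)
print df.loc_x.min(), df.loc_x.max()
print df.loc_y.min(), df.loc_y.max()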

In [12]:
# One-hot encode each categorical column and drop the original.
dummies = pd.get_dummies(df.neighborhood.str.replace(' ', '_').str.lower(), prefix='neigh_', drop_first=False)
df[dummies.columns] = dummies
del df['neighborhood']

In [13]:
dummies = pd.get_dummies(df.category.str.replace(' ', '_').str.lower(), prefix='cat_', drop_first=False)
df[dummies.columns] = dummies
del df['category']

In [14]:
dummies = pd.get_dummies(df.source.str.replace(' ', '_').str.lower(), prefix='source_', drop_first=False)
df[dummies.columns] = dummies
del df['source']
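
One-hot encoding adds a column per neighborhood, category and source value, so an optional shape check makes the growth visible:

In [ ]:
# Optional: see how wide the frame is after one-hot encoding.
print df.shape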

In [15]:
df['status'] = df.status == 'Closed'

In [16]:
# Drop Socrata bookkeeping fields and raw columns that will not be used as features.
df = df.drop(['sid', 'id', 'position', 'created_at', 'created_meta', 'updated_at',
              'updated_meta', 'meta', 'caseid', 'address', 'responsible_agency',
              'request_details', 'request_type', 'status', 'updated',
              'supervisor_district', 'point'], axis=1)

In [17]:
df = df.sort_values(by='opened', ascending=True)

# Drop the raw timestamps and every column derived from the close date;
# the label is already captured in 'target'.
df = df.drop(['opened', 'closed', 'closed_dayofweek', 'closed_month', 'closed_year',
              'closed_dayofmonth', 'closed_weekend', 'delta', 'is_open'], axis=1)

# Drop opened_year as well: only 2012 and 2013 appear, which is of no use for
# classifying future cases.
df = df.drop('opened_year', axis=1)

df = df.dropna()

In [18]:
# Reorder so the target is the last column; everything before it is a feature.
columns = list(df.columns)
columns.remove('target')
columns.append('target')
df = df[columns]
feature_columns = columns[:-1]
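
A final optional check that the reordering worked: the target should now be the last column and everything else a feature.

In [ ]:
# Optional: confirm the column layout before writing to the database.
print df.columns[-1]
print len(feature_columns), 'feature columns'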

In [19]:
df.to_sql('new_data', SQL_ENGINE, if_exists='replace', index=False)

Database


In [22]:
r1 = pd.read_sql_query('SELECT * FROM new_data', SQL_ENGINE)

In [23]:
print r1.shape


(7643, 61)

In [24]:
r2 = pd.read_sql_query('SELECT latitude, longitude FROM new_data', SQL_ENGINE)

In [26]:
r2.columns, r2.shape


Out[26]:
(Index([u'latitude', u'longitude'], dtype='object'), (7643, 2))
