In [1]:
%matplotlib inline
import os

from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine
In [2]:
# Local SQLite database used to persist the cleaned feature table.
# NOTE(review): `create_engine` is used here but never imported in any
# visible cell — needs `from sqlalchemy import create_engine` up top.
SQL_ENGINE = create_engine('sqlite:///streetlight_cases.db')
In [3]:
# Remote Socrata export of SF streetlight maintenance cases; a cached
# local copy is preferred whenever it already exists on disk.
DATA_STREETLIGHT_CASES_URL = 'https://data.sfgov.org/api/views/c53t-rr3f/rows.json?accessType=DOWNLOAD'
DATA_STREETLIGHT_CASES_LOCAL = 'DATA_STREETLIGHT_CASES.json'
data_path = (DATA_STREETLIGHT_CASES_LOCAL
             if os.path.isfile(DATA_STREETLIGHT_CASES_LOCAL)
             else DATA_STREETLIGHT_CASES_URL)
In [4]:
import urllib, json
def _load_data(url):
response = urllib.urlopen(url)
raw_data = json.loads(response.read())
columns = [col['name'] for col in raw_data['meta']['view']['columns']]
rows = raw_data['data']
return pd.DataFrame(data=rows, columns=columns)
In [5]:
# Load the raw cases (from the cached file when present, else the API).
df = _load_data(data_path)
In [6]:
# Normalise the Socrata column headers to snake_case identifiers.
df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))
In [7]:
def _add_date_parts(frame, col):
    """Parse `frame[col]` to datetime in place and add calendar features.

    Adds <col>_dayofweek, <col>_month, <col>_year, <col>_dayofmonth and a
    boolean <col>_weekend (dayofweek 5/6 = Saturday/Sunday).
    """
    frame[col] = pd.to_datetime(frame[col])
    frame[col + '_dayofweek'] = frame[col].dt.dayofweek
    frame[col + '_month'] = frame[col].dt.month
    frame[col + '_year'] = frame[col].dt.year
    frame[col + '_dayofmonth'] = frame[col].dt.day
    frame[col + '_weekend'] = frame[col + '_dayofweek'] >= 5

# Same feature set for the open and close timestamps (was 12 copy-pasted lines).
_add_date_parts(df, 'opened')
_add_date_parts(df, 'closed')
In [8]:
# Days from open to close; NaN while the case is still open.
df['delta'] = (df['closed'] - df['opened']).dt.days
df['is_open'] = df['closed'].isnull()
# Prediction target: was the case resolved within two days?
df['target'] = df['delta'] <= 2
In [9]:
from geopy.distance import vincenty
# Extract coordinates from the Socrata "point" cells.
# NOTE(review): assumes each `point` value is a list whose indices 1 and 2
# are latitude and longitude (as strings) — TODO confirm against the raw
# export; index 0 is unused here.
df['latitude'] = df.point.apply(lambda e: float(e[1]))
df['longitude'] = df.point.apply(lambda e: float(e[2]))
In [10]:
min_lat, max_lat = min(df.latitude), max(df.latitude)
min_lng, max_lng = min(df.longitude), max(df.longitude)

def grid(lat, lng):
    """Project (lat, lng) onto miles east/north of the data's SW corner.

    x is the east-west vincenty distance measured at the point's latitude;
    y is the north-south distance measured at the point's longitude.
    NOTE(review): geopy has deprecated/removed `vincenty` in favour of
    `geodesic` — worth migrating when the environment allows.
    """
    x = vincenty((lat, min_lng), (lat, lng)).miles
    y = vincenty((min_lat, lng), (lat, lng)).miles
    return x, y

# Materialise the coordinate array once (the original built np.array(xy) twice).
xy = np.array([grid(lat, lng) for lat, lng in zip(df.latitude.values, df.longitude.values)])
df['loc_x'] = xy[:, 0]
df['loc_y'] = xy[:, 1]
In [12]:
# One-hot encode neighbourhood (note: the prefix already ends in '_', so
# the default prefix_sep produces columns like 'neigh__mission').
neigh_labels = df.neighborhood.str.lower().str.replace(' ', '_')
dummies = pd.get_dummies(neigh_labels, prefix='neigh_', drop_first=False)
df[dummies.columns] = dummies
del df['neighborhood']
In [13]:
# One-hot encode the case category (prefix ends in '_', so columns come
# out as 'cat__...').
cat_labels = df.category.str.lower().str.replace(' ', '_')
dummies = pd.get_dummies(cat_labels, prefix='cat_', drop_first=False)
df[dummies.columns] = dummies
del df['category']
In [14]:
# One-hot encode the reporting source (prefix ends in '_', so columns come
# out as 'source__...').
source_labels = df.source.str.lower().str.replace(' ', '_')
dummies = pd.get_dummies(source_labels, prefix='source_', drop_first=False)
df[dummies.columns] = dummies
del df['source']
In [15]:
# Collapse status to a boolean closed-flag (True == 'Closed').
# NOTE(review): this column is deleted again in the drop cell below, so
# the conversion appears to be dead work.
df['status'] = df.status == 'Closed'
In [16]:
# Drop Socrata bookkeeping fields and free-text columns that are not
# usable as model features.
for _col in ['sid', 'id', 'position', 'created_at', 'created_meta',
             'updated_at', 'updated_meta', 'meta', 'caseid', 'address',
             'responsible_agency', 'request_details', 'request_type',
             'status', 'updated', 'supervisor_district', 'point']:
    del df[_col]
In [17]:
df = df.sort_values(by='opened', ascending=True)
# Drop the raw timestamps and everything derived from the close date:
# those would leak the answer, since `target` is defined from closed - opened.
for _col in ['opened', 'closed', 'closed_dayofweek', 'closed_month',
             'closed_year', 'closed_dayofmonth', 'closed_weekend',
             'delta', 'is_open']:
    del df[_col]
# Deleting opened_year because there is only 2012 and 2013, which are not
# relevant for future classifications.
del df['opened_year']
df = df.dropna()
In [18]:
# Move `target` to the last position so the features are columns[:-1].
columns = [col for col in df.columns if col != 'target'] + ['target']
df = df[columns]
feature_columns = columns[:-1]
In [19]:
# Persist the finished feature table to SQLite, replacing any previous run.
df.to_sql('new_data', SQL_ENGINE, if_exists='replace', index=False)
In [22]:
# Round-trip check: read the full table back out of SQLite.
r1 = pd.read_sql_query('SELECT * FROM new_data', SQL_ENGINE)
In [23]:
# Parenthesized print: identical output under Python 2 (prints the tuple)
# and valid syntax under Python 3, unlike the original statement form.
print(r1.shape)
In [24]:
# Spot-check a column subset via SQL projection.
r2 = pd.read_sql_query('SELECT latitude, longitude FROM new_data', SQL_ENGINE)
In [26]:
# Bare final expression -> rich display of the selected columns and shape.
r2.columns, r2.shape
Out[26]:
In [ ]: