In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import import_data
%matplotlib inline
In [2]:
data_311 = pd.read_csv('../data/detroit-311.csv')
In [3]:
data_311.info()
In [4]:
fig, ax = plt.subplots(1)
ax.scatter(data_311['lng'],data_311['lat'],s=5,alpha=0.5)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
plt.show()
In [5]:
# To remove outliers
data_311 = data_311[(data_311['lng']>-84.0) & (data_311['lat']>42.0)]
In [6]:
fig, ax = plt.subplots(1)
ax.scatter(data_311['lng'],data_311['lat'],s=3,color='blue',alpha=0.3)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_xlim(min(data_311['lng'])-0.01, max(data_311['lng'])+0.01)
ax.set_ylim(min(data_311['lat'])-0.01, max(data_311['lat'])+0.01)
plt.show()
In [7]:
data_311['addr'] = data_311['address'].astype(str).apply(lambda x: x.lower())
In [8]:
data_311.drop(['city','location','image'],axis=1,inplace=True)
In [9]:
import re
In [10]:
def clean_addr(addr):
'''remove city, state, country, or zip from an address. Make street, avenue, and drive
in short forms (st, ave, dr)'''
addr = re.sub('4\d{4}$','',addr)
addr = re.sub('4\d{4}, usa$','',addr)
addr = re.sub('detroit, mi.?\s?$','',addr)
addr = re.sub('detroit mi.?\s?$','',addr)
addr = re.sub('detroit, michigan\s?$','',addr)
addr = re.sub('detroit michigan\s?$','',addr)
addr = re.sub('michigan\s?$','',addr)
addr = re.sub(' street ',' st ', addr)
addr = re.sub(' st. ',' st ', addr)
addr = re.sub(' drive ', ' dr ', addr)
addr = re.sub(' dr. ', ' dr ', addr)
addr = re.sub(' ave.',' ave', addr)
addr = re.sub('\s?,', '', addr)
addr = re.sub('\.', '', addr)
addr = addr.rstrip()
return addr
In [11]:
data_311['addr'] = data_311['addr'].apply(lambda x: clean_addr(x))
In [12]:
event_beg = 0
event_end = data_311.shape[0]
data_311['event_id'] = np.arange(event_beg,event_end)
data_311['type'] = 1
In [13]:
data_311.to_csv('../data/data_311.csv', index=False)
In [14]:
data_crime = pd.read_csv("../data/detroit-crime.csv")
In [15]:
data_crime.columns
Out[15]:
In [16]:
fig, ax = plt.subplots(1)
ax.scatter(data_crime['LON'],data_crime['LAT'],s=5,alpha=0.5)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
plt.show()
In [17]:
# To remove outliers
data_crime = data_crime[(data_crime['LON']>-83.3) & (data_crime['LON']<-82.7) &
(data_crime['LAT']>42.25) & (data_crime['LAT']<42.46)]
In [18]:
fig, ax = plt.subplots(1)
ax.scatter(data_crime['LON'],data_crime['LAT'],s=3,color='red',alpha=0.2)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_xlim(min(data_crime['LON'])-0.01,max(data_crime['LON']+0.01))
ax.set_ylim(min(data_crime['LAT'])-0.01,max(data_crime['LAT']+0.01))
plt.show()
In [19]:
def clean_crime_addr(addr):
'''lower case'''
addr = addr.lower()
return addr
In [20]:
data_crime['addr'] = data_crime['ADDRESS'].astype(str).apply(lambda x: clean_crime_addr(x))
In [21]:
data_crime.drop(['ADDRESS'],axis=1,inplace=True)
In [22]:
event_beg = event_end
event_end += data_crime.shape[0]
data_crime['event_id'] = np.arange(event_beg,event_end)
data_crime['type'] = 2
In [23]:
data_crime.to_csv('../data/data_crime.csv', index=False)
In [24]:
data_bv = pd.read_csv("../data/detroit-blight-violations.csv")
In [25]:
data_bv.columns
Out[25]:
In [26]:
data_bv['ViolationAddress'][0]
Out[26]:
In [27]:
vlocs = data_bv['ViolationAddress'].astype(str)
vcoords = vlocs.apply(lambda x : x.split('\n')[-1])
vaddr = vlocs.apply(lambda x : x.split('\n')[0].lower())
vcoords = vcoords.apply(lambda x : x.rstrip(')').lstrip('('))
data_bv['coord'] = vcoords.astype(str)
data_bv['addr'] = vaddr.astype(str)
data_bv = data_bv[data_bv['coord']!='']
data_bv['lon'] = data_bv['coord'].apply(lambda x: x.split(', ')[-1])
data_bv['lat'] = data_bv['coord'].apply(lambda x: x.split(', ')[0])
data_bv['lon'] = data_bv['lon'].astype(float)
data_bv['lat'] = data_bv['lat'].astype(float)
In [28]:
fig, ax = plt.subplots(1)
ax.scatter(data_bv['lon'],data_bv['lat'],s=3,color='green',alpha=0.3)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_xlim(min(data_bv['lon'])-0.01, max(data_bv['lon'])+0.01)
ax.set_ylim(min(data_bv['lat'])-0.01, max(data_bv['lat'])+0.01)
plt.show()
In [29]:
event_beg = event_end
event_end += data_bv.shape[0]
data_bv['event_id'] = np.arange(event_beg,event_end)
data_bv['type'] = 3
In [30]:
data_bv.to_csv('../data/data_bv.csv',index=False)
In [31]:
permits = pd.read_csv('../data/detroit-demolition-permits.tsv',delimiter="\t")
In [32]:
plocs = permits['site_location'].astype(str)
pcoords = plocs.apply(lambda x : x.split('\n')[-1])
paddr = plocs.apply(lambda x : x.split('\n')[0].lower())
pcoords = pcoords.apply(lambda x : x.rstrip(')').lstrip('('))
permits['coord'] = pcoords.astype(str)
permits['addr'] = paddr.astype(str)
permits = permits[permits['coord']!='']
permits['lon'] = permits['coord'].apply(lambda x: x.split(', ')[-1])
permits['lat'] = permits['coord'].apply(lambda x: x.split(', ')[0])
permits['lon'] = permits['lon'].astype(float)
permits['lat'] = permits['lat'].astype(float)
In [33]:
permits.drop(['coord','site_location'],axis=1,inplace=True)
In [34]:
fig, ax = plt.subplots(1)
ax.scatter(permits['lon'],permits['lat'],s=1)
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
plt.show()
Detroit coordinates are centered at lower left corner in the above figure. We can get rid of outliers with way different coordiantes.
In [35]:
permits = permits[(permits['lon']<-70) & (permits['lat'] < 45)]
In [36]:
fig, ax = plt.subplots(1)
ax.scatter(permits['lon'],permits['lat'],s=3,color='purple',alpha=0.4)
ax.set_xlabel('longitude')
ax.set_xlim(min(permits['lon'])-0.01,max(permits['lon']+0.01))
ax.set_ylabel('latitude')
ax.set_ylim(min(permits['lat'])-0.01,max(permits['lat']+0.01))
plt.show()
In [37]:
event_beg = event_end
event_end += permits.shape[0]
permits['event_id'] = np.arange(event_beg,event_end)
permits['type'] = 4
In [38]:
permits['PARCEL_NO'] = permits['PARCEL_NO'].apply(lambda x: x.replace(" ",""))
In [39]:
permits.to_csv('../data/permits.csv', index=False)
In [40]:
events_collection = [data_311.as_matrix(['event_id','lng','lat','addr','type']),
data_crime.as_matrix(['event_id', 'LON', 'LAT', 'addr','type']),
data_bv.as_matrix(['event_id','lon','lat','addr','type']),
permits.as_matrix(['event_id','lon','lat','addr','type'])]
events = pd.DataFrame(data=np.concatenate(events_collection),
columns=['event_id','lon','lat','addr','type'])
In [41]:
events.shape
Out[41]:
In [42]:
events.head(3)
Out[42]:
In [43]:
events.to_csv('../data/events.csv',index=False)
In [ ]:
In [ ]: