In [1]:
import numpy as np
import pandas as pd
Data already cleaned by this notebook. The collection of data was saved at ../data/events.csv.
In [2]:
# Load the cleaned event records produced by the previous notebook.
data_events = pd.read_csv('../data/events.csv')
In [3]:
# Preview the first rows to check the columns that were loaded.
data_events.head(10)
Out[3]:
In [4]:
# (rows, columns) of the full event table.
data_events.shape
Out[4]:
In [5]:
# To get rid of duplicates with same coordinates and possibly different address names.
# drop_duplicates keeps only the first row for each (lon, lat) pair; every other row
# sharing those coordinates is discarded here, together with its event_id and type.
# NOTE(review): gen_buildings is later run on building_pool, so events dropped here never
# reach event_id_list / blighted -- confirm that they are attached in a later step.
building_pool = data_events.drop_duplicates(subset=['lon','lat'])
In [6]:
# Number of distinct coordinate pairs that remain.
building_pool.shape
Out[6]:
In [18]:
# 1. sort data according to longitude
#    init new_data
# 2. for each record:
#        if record[lon] - prev[lon] > length:
#            add new record into new_data
#        else:
#            find previous coords that are close
#            if no coords in bbox:
#                add new record into new_data
#            else:
#                for each of these coords:
#                    if record in bbox:
#                        append event_id
#
# At the same time, if a building is assigned one or more demolition permits, its `blighted` flag is set to 1.
#
def gen_buildings(data, length=4.11e-4, width=2.04e-4):
    '''Generate buildings from event coordinates.

    Records are visited in order of increasing longitude. Each record is either
    merged into every already-created building whose centre lies within half a
    bounding box of it (``length / 2`` in longitude and ``width / 2`` in
    latitude), or it becomes the centre of a new building.

    Parameters
    ----------
    data : pandas.DataFrame
        Event records. The columns ``'lon'``, ``'lat'``, ``'type'``, ``'addr'``
        and ``'event_id'`` are read.
    length : float, optional
        Bounding-box size along longitude (default ``4.11e-4``).
    width : float, optional
        Bounding-box size along latitude (default ``2.04e-4``).

    Returns
    -------
    pandas.DataFrame
        One row per building with the columns ``'addr'``, ``'lon'``, ``'lat'``
        (taken from the record that created the building), ``'event_id_list'``
        (list of the event ids merged into it) and ``'blighted'`` (1 if any
        merged record has ``type == 4``, i.e. a demolition permit, else 0).

    Notes
    -----
    ``nearest_pos`` comes from the project module ``assign_bbox``. It is used
    here as if it returns an array of indices into the candidate coordinate
    lists -- TODO confirm against assign_bbox.py.
    '''
    from assign_bbox import nearest_pos  # defined in assign_bbox.py in current dir

    demolition_permit = 4  # value of 'type' that marks a demolition permit

    new_data = {'addr': [], 'lon': [], 'lat': [], 'event_id_list': [], 'blighted': []}
    max_distX = abs(length / 2)
    max_distY = abs(width / 2)

    def _add_building(entry, is_demolition):
        # Start a new building centred on this record.
        new_data['addr'].append(entry['addr'])
        new_data['lon'].append(entry['lon'])
        new_data['lat'].append(entry['lat'])
        new_data['event_id_list'].append([entry['event_id']])
        new_data['blighted'].append(1 if is_demolition else 0)

    # Longitude of the most recently created building (not of the last record seen).
    prev_lon = 0

    for _, entry in data.sort_values(by='lon').iterrows():
        lon = entry['lon']
        lat = entry['lat']
        is_demolition = entry['type'] == demolition_permit

        # More than one box length east of the last created building:
        # skip the neighbour search and create a new building directly.
        far_from_prev = abs(lon - prev_lon) > length

        if not far_from_prev:
            candidates = nearest_pos(
                (lon, lat),
                np.array(new_data['lon']),
                np.array(new_data['lat']),
                length,
                width,
            )
            merged = False
            # No early exit: the event is attached to every building whose box contains it.
            for pos in candidates:
                within_lon = abs(new_data['lon'][pos] - lon) < max_distX
                within_lat = abs(new_data['lat'][pos] - lat) < max_distY
                if within_lon and within_lat:
                    new_data['event_id_list'][pos].append(entry['event_id'])
                    if is_demolition:
                        new_data['blighted'][pos] = 1
                    merged = True
            if merged:
                continue

        _add_building(entry, is_demolition)
        prev_lon = lon

    return pd.DataFrame(new_data)
In [22]:
buildings_concise = gen_buildings(building_pool)
In [23]:
buildings_concise.shape# shorter than before
Out[23]:
In [24]:
buildings_concise.tail()
Out[24]:
In [25]:
buildings = buildings_concise
In [26]:
buildings = buildings[(buildings['lat']>42.25) & (buildings['lat']<42.5) & (buildings['lon']>-83.3) & (buildings['lon']<-82.9)]
In [27]:
buildings.shape
Out[27]:
In [28]:
buildings['blighted'].value_counts()
Out[28]:
In [2]:
data_dir = '../data/'
In [3]:
buildings_step_0 = pd.read_csv(data_dir+'buildings_step_0.csv')
permits = pd.read_csv(data_dir+'permits.csv')
In [4]:
permits = permits[['PARCEL_NO', 'BLD_PERMIT_TYPE', 'addr', 'lon', 'lat']]
In [5]:
permits['BLD_PERMIT_TYPE'].unique()
Out[5]:
In [6]:
demo01 = permits.loc[0,['PARCEL_NO','addr','lon','lat']]
print(demo01)
In [7]:
c = buildings_step_0['addr'].apply(lambda x: x == permits.loc[0,'addr'])
In [8]:
buildings_step_0[c][['PARCELNO','lon','lat','addr']]
Out[8]:
The coordinates of this building from data.detroitmi.gov are slightly different from the data given in our course material.
In [13]:
length = 0.000411
width = 0.000204 # These results come from step 0.
In [14]:
buildings.loc[:,'llcrnrlon'] = buildings.loc[:,'lon'] - length/2
buildings.loc[:,'llcrnrlat'] = buildings.loc[:,'lat'] - width/2
buildings.loc[:,'urcrnrlon'] = buildings.loc[:,'lon'] + length/2
buildings.loc[:,'urcrnrlat'] = buildings.loc[:,'lat'] + width/2
buildings.loc[:,'building_id'] = np.arange(0,buildings.shape[0])
buildings = buildings.reindex()
In [15]:
buildings.tail()
Out[15]:
In [39]:
buildings.to_csv('../data/buildings.csv', index=False)
In [11]:
from bbox import draw_screen_bbox
from matplotlib import pyplot as plt
%matplotlib inline
In [12]:
buildings = pd.read_csv('../data/buildings.csv')
bboxes = buildings.loc[:,['llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat']]
bboxes = bboxes.as_matrix()
In [42]:
fig = plt.figure(figsize=(8,6), dpi=2000)
for box in bboxes:
draw_screen_bbox(box, fig)
plt.xlim(-83.3,-82.9)
plt.ylim(42.25,42.45)
plt.savefig('../data/buildings_distribution.png')
plt.show()
In [16]:
blighted_buildings = buildings[buildings.loc[:,'blighted'] == 1]
In [17]:
blighted_bboxes = blighted_buildings.loc[:,['llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat']]
blighted_bboxes = blighted_bboxes.as_matrix()
In [19]:
fig = plt.figure(figsize=(8,6), dpi=2000)
for box in blighted_bboxes:
draw_screen_bbox(box, fig)
plt.xlim(-83.3,-82.9)
plt.ylim(42.25,42.46)
plt.title("Distribution of Blighted Buildings in Detroit")
plt.savefig('../data/blighted_buildings_distribution.png')
plt.show()
In [ ]: