In [1]:

    
import numpy as np
import pandas as pd

Step 1: Building List and Labels

Collecting instances from 311 calls, crimes, blight violations, and demolition permits.

Data already cleaned by this notebook The collection of data was saved at ../data/events.csv



In [2]:

    
data_events = pd.read_csv('../data/events.csv')



In [3]:

    
data_events.head(10)









    Out[3]:






  
    
      
      event_id
      lon
      lat
      addr
      type
    
  
  
    
      0
      0
      -83.161039
      42.383998
      13120-13130 ilene st
      1
    
    
      1
      1
      -83.080919
      42.440471
      1485 e outer dr
      1
    
    
      2
      2
      -82.962038
      42.445244
      15460 eastburn
      1
    
    
      3
      3
      -83.166194
      42.421043
      17541 mendota st
      1
    
    
      4
      4
      -83.162874
      42.402033
      griggs
      1
    
    
      5
      5
      -83.158100
      42.399431
      14902 kentucky
      1
    
    
      6
      6
      -83.240740
      42.439669
      20089 vaughan
      1
    
    
      7
      7
      -83.053367
      42.430693
      18663 fenelon st
      1
    
    
      8
      8
      -83.161803
      42.410764
      16170 ilene st
      1
    
    
      9
      9
      -83.152779
      42.437114
      19530 roselawn st
      1



In [4]:

    
data_events.shape









    Out[4]:





(453192, 5)



In [5]:

    
# To get rid of duplicates with same coordinates and possibly different address names
building_pool = data_events.drop_duplicates(subset=['lon','lat'])



In [6]:

    
building_pool.shape









    Out[6]:





(219984, 5)



In [18]:

    
# 1. sort data according to longitude
#    init new_data
# 2. for each record:
#        if record[lon] - prev[lon] > length:
#            add new record into new_data
#        else:
#            find previous coords that are close
#            if no coords in bbox:
#                add new record into new_data
#            else:
#                for each of these coords:
#                    if record in bbox:
#                        append event_id
#  
# At the same time, if building is assigned one permit or more for demolition, blighted will be assigned to one.
#

def gen_buildings(data):
    '''generate buildings from coordinates'''
    from assign_bbox import nearest_pos, is_in_bbox, raw_dist     # defined in assign_bbox.py in current dir
    new_data = {'addr': [], 'lon': [], 'lat': [], 'event_id_list': [], 'blighted': []}
    data_sorted = data.sort_values(by='lon', inplace=False)
    length = 4.11e-4  # longitude
    width = 2.04e-4   # latitude
    prev_lon = 0
    prev_lat = 0
    max_distX = abs(length/2)
    max_distY = abs(width/2)
    
    for i, entry in data_sorted.iterrows():
        lon = entry['lon']
        lat = entry['lat']
        b = entry['type']
        if abs(lon - prev_lon) > length:
            new_data['addr'].append(entry['addr'])
            new_data['lon'].append(lon)
            new_data['lat'].append(lat)
            # below line is different from the loop for events_part2
            new_data['event_id_list'].append([entry['event_id']])
            if b == 4:  # if demolition permit
                new_data['blighted'].append(1)
            else:
                new_data['blighted'].append(0)
            
            prev_lon = lon
            prev_lat = lat
        else:
            listX = np.array(new_data['lon'])
            listY = np.array(new_data['lat'])
            poses = nearest_pos((lon,lat), listX, listY, length, width)
            
            # if already in new_data
            if poses.size > 0:
                has_pos = False
                for pos in poses:
                    temp_lon = new_data['lon'][pos]
                    temp_lat = new_data['lat'][pos]
                    if (abs(temp_lon - lon) < max_distX) & (abs(temp_lat - lat) < max_distY):
                        new_data['event_id_list'][pos] += [entry['event_id']]
                        if b == 4:
                            new_data['blighted'][pos] = 1
                        has_pos = True
                if has_pos:
                    continue
            
            new_data['addr'].append(entry['addr'])
            new_data['lon'].append(lon)
            new_data['lat'].append(lat)
            # below line is different from the loop for events_part2
            new_data['event_id_list'].append([entry['event_id']])
            if b == 4:
                new_data['blighted'].append(1)
            else:
                new_data['blighted'].append(0)
            prev_lon = lon
            prev_lat = lat
                

    return pd.DataFrame(new_data)



In [22]:

    
buildings_concise = gen_buildings(building_pool)



In [23]:

    
buildings_concise.shape# shorter than before









    Out[23]:





(130728, 5)



In [24]:

    
buildings_concise.tail()









    Out[24]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
    
  
  
    
      130723
      00000 kerby, grosse point farms
      0
      [127565]
      42.4139
      -82.9090
    
    
      130724
      00400 calvin ave
      0
      [104873]
      42.4109
      -82.9087
    
    
      130725
      00 mack and renald
      0
      [118910]
      42.4352
      -82.9082
    
    
      130726
      20000 mack plaza, grosse pointe woods police d...
      0
      [26965, 69819]
      42.4314
      -82.9053
    
    
      130727
      20000 ballatyne court grosse pointe 48236
      0
      [29575]
      42.4219
      -82.8986



In [25]:

    
buildings = buildings_concise

Get rid of void coordinates



In [26]:

    
buildings = buildings[(buildings['lat']>42.25) & (buildings['lat']<42.5) & (buildings['lon']>-83.3) & (buildings['lon']<-82.9)]



In [27]:

    
buildings.shape









    Out[27]:





(130727, 5)



In [28]:

    
buildings['blighted'].value_counts()









    Out[28]:





0    128112
1      2615
Name: blighted, dtype: int64

Recap of step 0

Adopting building coordinates

It turns out that there is a slight mismatch between real world building coordinates w.r.t given data. So that only median building dimension info is reserved from the building info we got from online open data at data.detroitmi.gov.



In [2]:

    
data_dir = '../data/'



In [3]:

    
buildings_step_0 = pd.read_csv(data_dir+'buildings_step_0.csv')
permits = pd.read_csv(data_dir+'permits.csv')



In [4]:

    
permits = permits[['PARCEL_NO', 'BLD_PERMIT_TYPE', 'addr', 'lon', 'lat']]



In [5]:

    
permits['BLD_PERMIT_TYPE'].unique()









    Out[5]:





array(['Dismantle', 'DISM'], dtype=object)

For example: the very first entry of permit has coordinate:



In [6]:

    
demo01 = permits.loc[0,['PARCEL_NO','addr','lon','lat']]
print(demo01)









    



PARCEL_NO      2165525-6
addr         4331 barham
lon             -82.9474
lat              42.3941
Name: 0, dtype: object

In real world data, this corresponds to:



In [7]:

    
c = buildings_step_0['addr'].apply(lambda x: x == permits.loc[0,'addr'])



In [8]:

    
buildings_step_0[c][['PARCELNO','lon','lat','addr']]









    Out[8]:






  
    
      
      PARCELNO
      lon
      lat
      addr
    
  
  
    
      261994
      21065525-6
      -82.947708
      42.393997
      4331 barham

The coordinate of this building from data.detroitmi.gov is slightly different from data given in our course material.

Only building dimension info is adopted for our analysis.



In [13]:

    
length = 0.000411
width = 0.000204  # These results come from step 0.



In [14]:

    
buildings.loc[:,'llcrnrlon'] = buildings.loc[:,'lon'] - length/2
buildings.loc[:,'llcrnrlat'] = buildings.loc[:,'lat'] - width/2
buildings.loc[:,'urcrnrlon'] = buildings.loc[:,'lon'] + length/2
buildings.loc[:,'urcrnrlat'] = buildings.loc[:,'lat'] + width/2

buildings.loc[:,'building_id'] = np.arange(0,buildings.shape[0])
buildings = buildings.reindex()



In [15]:

    
buildings.tail()









    Out[15]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
    
  
  
    
      130722
      00300 neff
      0
      [96671]
      42.3845
      -82.9100
      -82.910206
      42.384398
      -82.909794
      42.384602
      130722
    
    
      130723
      00000 kerby, grosse point farms
      0
      [127565]
      42.4139
      -82.9090
      -82.909205
      42.413798
      -82.908794
      42.414002
      130723
    
    
      130724
      00400 calvin ave
      0
      [104873]
      42.4109
      -82.9087
      -82.908906
      42.410798
      -82.908494
      42.411002
      130724
    
    
      130725
      00 mack and renald
      0
      [118910]
      42.4352
      -82.9082
      -82.908406
      42.435098
      -82.907994
      42.435302
      130725
    
    
      130726
      20000 mack plaza, grosse pointe woods police d...
      0
      [26965, 69819]
      42.4314
      -82.9053
      -82.905506
      42.431298
      -82.905094
      42.431502
      130726



In [39]:

    
buildings.to_csv('../data/buildings.csv', index=False)

Visualization



In [11]:

    
from bbox import draw_screen_bbox
from matplotlib import pyplot as plt
%matplotlib inline



In [12]:

    
buildings = pd.read_csv('../data/buildings.csv')
bboxes = buildings.loc[:,['llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat']]
bboxes = bboxes.as_matrix()



In [42]:

    
fig = plt.figure(figsize=(8,6), dpi=2000)
for box in bboxes:     
    draw_screen_bbox(box, fig)
    
plt.xlim(-83.3,-82.9)
plt.ylim(42.25,42.45)
plt.savefig('../data/buildings_distribution.png')
plt.show()

Distribution of blighted buildings



In [16]:

    
blighted_buildings = buildings[buildings.loc[:,'blighted'] == 1]



In [17]:

    
blighted_bboxes = blighted_buildings.loc[:,['llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat']]
blighted_bboxes = blighted_bboxes.as_matrix()



In [19]:

    
fig = plt.figure(figsize=(8,6), dpi=2000)
for box in blighted_bboxes:     
    draw_screen_bbox(box, fig)
    
plt.xlim(-83.3,-82.9)
plt.ylim(42.25,42.46)
plt.title("Distribution of Blighted Buildings in Detroit")
plt.savefig('../data/blighted_buildings_distribution.png')
plt.show()



In [ ]:

	event_id	lon	lat	addr	type
0	0	-83.161039	42.383998	13120-13130 ilene st	1
1	1	-83.080919	42.440471	1485 e outer dr	1
2	2	-82.962038	42.445244	15460 eastburn	1
3	3	-83.166194	42.421043	17541 mendota st	1
4	4	-83.162874	42.402033	griggs	1
5	5	-83.158100	42.399431	14902 kentucky	1
6	6	-83.240740	42.439669	20089 vaughan	1
7	7	-83.053367	42.430693	18663 fenelon st	1
8	8	-83.161803	42.410764	16170 ilene st	1
9	9	-83.152779	42.437114	19530 roselawn st	1

	addr	event_id_list	lat	lon
130723	00000 kerby, grosse point farms	[127565]	42.4139	-82.9090
130724	00400 calvin ave	[104873]	42.4109	-82.9087
130725	00 mack and renald	[118910]	42.4352	-82.9082
130726	20000 mack plaza, grosse pointe woods police d...	[26965, 69819]	42.4314	-82.9053
130727	20000 ballatyne court grosse pointe 48236	[29575]	42.4219	-82.8986

	addr	event_id_list	lat	lon	llcrnrlon	llcrnrlat	urcrnrlon	urcrnrlat	building_id
130722	00300 neff	[96671]	42.3845	-82.9100	-82.910206	42.384398	-82.909794	42.384602	130722
130723	00000 kerby, grosse point farms	[127565]	42.4139	-82.9090	-82.909205	42.413798	-82.908794	42.414002	130723
130724	00400 calvin ave	[104873]	42.4109	-82.9087	-82.908906	42.410798	-82.908494	42.411002	130724
130725	00 mack and renald	[118910]	42.4352	-82.9082	-82.908406	42.435098	-82.907994	42.435302	130725
130726	20000 mack plaza, grosse pointe woods police d...	[26965, 69819]	42.4314	-82.9053	-82.905506	42.431298	-82.905094	42.431502	130726