Build a balanced sample containing equal numbers of blighted and non-blighted buildings


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the building table (one row per building; includes a binary `blighted`
# label, coordinates/bounding box, and a `building_id` key).
buildings = pd.read_csv('../data/buildings.csv')

In [3]:
# Preview the first rows to confirm the expected columns are present.
buildings.head(5)


Out[3]:
addr blighted event_id_list lat lon llcrnrlon llcrnrlat urcrnrlon urcrnrlat building_id
0 13500 norborne 0 [138721] 42.3842 -83.2975 -83.297706 42.384098 -83.297294 42.384302 0
1 26200 mich ave inkster mi 48141 0 [116216] 42.2954 -83.2968 -83.297006 42.295298 -83.296594 42.295502 1
2 25800 elsinore 0 [70770] 42.3986 -83.2966 -83.296806 42.398498 -83.296394 42.398702 2
3 00000 mile/ beach daily 0 [20244] 42.3993 -83.2959 -83.296106 42.399198 -83.295694 42.399402 3
4 00 fellrath apartments 0 [109241] 42.2654 -83.2931 -83.293306 42.265298 -83.292894 42.265502 4

In [4]:
# Class balance check: heavily imbalanced (~128k non-blighted vs ~2.6k blighted),
# which motivates the undersampling below.
buildings['blighted'].value_counts()


Out[4]:
0    128112
1      2615
Name: blighted, dtype: int64

In [5]:
# Split the table by label so the majority (non-blighted) class can be
# undersampled to match the minority class.
blighted_buildings = buildings.query('blighted == 1')
not_blighted_buildings = buildings.query('blighted == 0')

In [6]:
# Reproducibility fix: the draws below are random, so seed the generator —
# otherwise the saved balanced datasets cannot be regenerated exactly.
np.random.seed(42)

n_blighted = blighted_buildings.shape[0]

# Batch 1: keep every blighted building (in shuffled order) and undersample
# the non-blighted class down to the same size, without replacement.
index_b = np.random.choice(blighted_buildings.index, n_blighted, replace=False)      # blighted
index_nb = np.random.choice(not_blighted_buildings.index, n_blighted, replace=False)  # non-blighted

# Batch 2: an independent draw of the same sizes, for bagging.
index_b_2 = np.random.choice(blighted_buildings.index, n_blighted, replace=False)      # blighted
index_nb_2 = np.random.choice(not_blighted_buildings.index, n_blighted, replace=False)  # non-blighted

In [9]:
# Batch 1: all blighted buildings, in the shuffled order drawn above.
blighted = blighted_buildings.loc[index_b,:]

In [10]:
# Batch 1: the undersampled non-blighted buildings (same count as blighted).
not_blighted = not_blighted_buildings.loc[index_nb,:]

In [11]:
# Stack the two equally-sized classes into one balanced frame.
# pd.concat already returns a new DataFrame, so the defensive .copy()
# calls on the inputs were redundant and are dropped.
balanced_data = pd.concat([blighted, not_blighted])

In [12]:
# Shuffle the rows so the two classes are interleaved, and rebuild a clean
# 0..n-1 index. random_state pins the shuffle for reproducibility
# (replace=False is the default for sample, so each row appears exactly once).
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Keep the labels in a separate frame, keyed by building_id, so the feature
# file written below carries no target column.
balanced_keys = balanced_data.loc[:,['building_id','blighted']].copy()

In [14]:
# Remove the target column from the feature frame. Reassignment instead of
# inplace=True: same result, but avoids the hidden-state/in-place-mutation
# anti-pattern and keeps the statement chainable.
balanced_data = balanced_data.drop(columns='blighted')

In [19]:
# Persist batch 1: labels and features as separate CSVs, joinable on building_id.
balanced_keys.to_csv('../data/balanced_keys.csv', index=False)
balanced_data.to_csv('../data/balanced_data.csv', index=False)

In [16]:
# Sanity check: expect 2 * 2615 = 5230 rows and 9 feature columns.
balanced_data.shape


Out[16]:
(5230, 9)

In [17]:
# Spot-check the shuffled feature frame (the `blighted` column is gone).
balanced_data.head(5)


Out[17]:
addr event_id_list lat lon llcrnrlon llcrnrlat urcrnrlon urcrnrlat building_id
0 15360 tracey [448971, 118987] 42.403010 -83.180173 -83.180379 42.402908 -83.179968 42.403112 38754
1 4817 chopin [143959] 42.332660 -83.134467 -83.134672 42.332558 -83.134261 42.332762 60569
2 5300 wayburn st [13405] 42.400448 -82.955224 -82.955430 42.400346 -82.955019 42.400550 121589
3 5960 tarnow [448703, 127281] 42.340029 -83.140914 -83.141119 42.339927 -83.140708 42.340131 57068
4 19622 runyon [450264] 42.438970 -83.006893 -83.007099 42.438868 -83.006688 42.439072 104186

In [18]:
# Confirm the saved labels are perfectly balanced (2615 of each class).
balanced_keys['blighted'].value_counts()


Out[18]:
1    2615
0    2615
Name: blighted, dtype: int64

Second batch: an independent balanced draw of the same data, for bagging


In [7]:
blighted = blighted_buildings.loc[index_b_2,:]
not_blighted = not_blighted_buildings.loc[index_nb_2,:]
balanced_data = pd.concat([blighted.copy(), not_blighted.copy()])
balanced_data = balanced_data.sample(frac=1, replace=False).reset_index(drop=True)
balanced_keys = balanced_data.loc[:,['building_id','blighted']].copy()
balanced_data.drop('blighted',axis=1,inplace=True)
# save files
balanced_keys.to_csv('../data/balanced_keys_2.csv', index=False)
balanced_data.to_csv('../data/balanced_data_2.csv', index=False)
balanced_data.head(5)


Out[7]:
addr event_id_list lat lon llcrnrlon llcrnrlat urcrnrlon urcrnrlat building_id
0 9394 winthrop [391361, 447704] 42.36488 -83.198203 -83.198408 42.364778 -83.197997 42.364982 31267
1 09700 harper (o.m.g wear) [90532, 96642, 79253, 71882] 42.39290 -83.006000 -83.006206 42.392798 -83.005794 42.393002 104509
2 00 maddelein/chalmers [115723, 170910, 306873] 42.43620 -82.966400 -82.966606 42.436098 -82.966194 42.436302 117821
3 16800 schoolcraft [78830, 144619, 447350] 42.38280 -83.240700 -83.240906 42.382698 -83.240494 42.382902 10482
4 9743 chenlot [246672, 452322, 150818] 42.36884 -83.135226 -83.135432 42.368738 -83.135021 42.368942 60160

In [ ]: