In [1]:
import pandas as pd
import numpy as np
In [2]:
# Location of the raw buildings table, relative to the notebook's working directory.
BUILDINGS_CSV = '../data/buildings.csv'
# Read the whole table into memory; every later cell derives from this frame.
buildings = pd.read_csv(BUILDINGS_CSV)
In [3]:
# Preview the first five rows of the loaded table.
buildings.head(5)
Out[3]:
In [4]:
# Count how many rows carry each distinct value of the `blighted` column
# (shows how imbalanced the two classes are before resampling).
buildings['blighted'].value_counts()
Out[4]:
In [5]:
# Split the table by label value: rows with blighted == 1 and rows with blighted == 0.
# Rows holding any other value (or NaN) end up in neither subset.
blighted_mask = buildings['blighted'].eq(1)
not_blighted_mask = buildings['blighted'].eq(0)
blighted_buildings = buildings.loc[blighted_mask]
not_blighted_buildings = buildings.loc[not_blighted_mask]
In [6]:
# Draw the row labels for two balanced samples (one per class, same size each).
# A locally seeded generator makes the draws identical on every
# "Restart Kernel -> Run All", so the CSVs written later are reproducible.
SEED = 42
sampler = np.random.RandomState(SEED)

# Every draw uses the number of blighted rows as its size, so the non-blighted
# class is sampled down to the blighted class's size.
n_per_class = blighted_buildings.shape[0]

# NOTE: drawing n_per_class labels without replacement from the blighted subset
# selects *all* blighted rows (only their order is permuted); only the
# non-blighted draws are genuine subsamples.
index_b = sampler.choice(blighted_buildings.index, n_per_class, replace=False)  # blighted
index_nb = sampler.choice(not_blighted_buildings.index, n_per_class, replace=False)  # non-blighted

# for bagging purpose: a second, independent draw from the same generator
index_b_2 = sampler.choice(blighted_buildings.index, n_per_class, replace=False)  # blighted
index_nb_2 = sampler.choice(not_blighted_buildings.index, n_per_class, replace=False)  # non-blighted
In [9]:
# Pick the blighted rows of the first sample, in the order of the drawn labels.
blighted = blighted_buildings.loc[index_b]
In [10]:
# Pick the non-blighted rows of the first sample, in the order of the drawn labels.
not_blighted = not_blighted_buildings.loc[index_nb]
In [11]:
# Stack the two class samples vertically (blighted rows first) into one frame.
class_frames = [blighted.copy(), not_blighted.copy()]
balanced_data = pd.concat(class_frames, axis=0)
In [12]:
# Shuffle all rows so the two classes are interleaved, then renumber the index 0..n-1.
# A fixed random_state keeps the row order (and therefore the saved CSV) the same
# on every fresh run; without it pandas draws from the unseeded global NumPy state.
SHUFFLE_SEED = 42
balanced_data = (
    balanced_data
    .sample(frac=1, replace=False, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
In [13]:
# Keep an independent copy of the id/label pairs, row-aligned with balanced_data.
key_columns = ['building_id', 'blighted']
balanced_keys = balanced_data[key_columns].copy()
In [14]:
# Remove the label column from the feature table (the labels live in balanced_keys).
# Rebinding to the returned frame instead of using inplace=True avoids mutating
# an object that earlier cells may already have displayed.
balanced_data = balanced_data.drop('blighted', axis=1)
In [19]:
# save files: keys first, then the label-free feature table; the index is not written
csv_outputs = (
    (balanced_keys, '../data/balanced_keys.csv'),
    (balanced_data, '../data/balanced_data.csv'),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)
In [16]:
# (rows, columns) of the balanced table.
balanced_data.shape
Out[16]:
In [17]:
# Preview the first five rows of the shuffled balanced table.
balanced_data.head(5)
Out[17]:
In [18]:
# Sanity check: count rows per label value in the sampled keys.
# Both classes were drawn with the same sample size, so the counts should match.
balanced_keys['blighted'].value_counts()
Out[18]:
In [7]:
# Build and save the second balanced sample (drawn for bagging) from the
# second set of row labels, index_b_2 / index_nb_2.
# NOTE: this cell rebinds `blighted`, `not_blighted`, `balanced_data` and
# `balanced_keys`; after it runs they refer to the second sample.
SHUFFLE_SEED_2 = 43  # fixed so the row order written to disk is the same on every run

blighted = blighted_buildings.loc[index_b_2, :]
not_blighted = not_blighted_buildings.loc[index_nb_2, :]

# Stack both classes, shuffle the rows reproducibly, and renumber the index 0..n-1.
balanced_data = (
    pd.concat([blighted.copy(), not_blighted.copy()])
    .sample(frac=1, replace=False, random_state=SHUFFLE_SEED_2)
    .reset_index(drop=True)
)

# Copy the id/label pairs before the label column is removed from the features.
balanced_keys = balanced_data.loc[:, ['building_id', 'blighted']].copy()
# Rebind instead of inplace=True so no already-referenced frame is mutated.
balanced_data = balanced_data.drop('blighted', axis=1)

# save files
balanced_keys.to_csv('../data/balanced_keys_2.csv', index=False)
balanced_data.to_csv('../data/balanced_data_2.csv', index=False)

# Preview the first five rows of the second label-free feature table.
balanced_data.head(5)
Out[7]:
In [ ]: