In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Loading data
In [2]:
buildings = pd.read_csv("../data/buildings.csv")
In [3]:
events = pd.read_csv("../data/events.csv")
In [4]:
buildings.head(2)
Out[4]:
In [5]:
events.head(2)
Out[5]:
In [6]:
events['type'].value_counts() # types: 1: 311 calls, 2: crimes, 3: blight violations, 4: permits
Out[6]:
Add per-type event counts to buildings
In [7]:
def str_to_list(events_str):
    '''parse a serialized list string "[id, id, ...]" into a list of id strings'''
    events_list = events_str.rstrip(']').lstrip('[').split(', ')
    return events_list
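Note that `str_to_list` only splits the serialized string; the elements keep any surrounding quotes, which is why they are stripped again before the `int()` conversions further below:
In [ ]:
str_to_list('[123, 456]')      # -> ['123', '456']
str_to_list("['123', '456']")  # -> ["'123'", "'456'"]: the quotes survive the split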
In [8]:
buildings['event_id_list'] = buildings['event_id_list'].apply(str_to_list)
In [9]:
buildings['311-calls'] = np.zeros(buildings.shape[0])
buildings['crimes'] = np.zeros(buildings.shape[0])
buildings['blight_violations'] = np.zeros(buildings.shape[0])
buildings['permit_cnt'] = np.zeros(buildings.shape[0])
for i in range(buildings.shape[0]):
    for event in buildings.loc[i, 'event_id_list']:
        event = int(event)
        event_type = events.loc[event, 'type']
        if event_type == 1:
            buildings.loc[i, '311-calls'] += 1
        elif event_type == 2:
            buildings.loc[i, 'crimes'] += 1
        elif event_type == 3:
            buildings.loc[i, 'blight_violations'] += 1
        elif event_type == 4:
            buildings.loc[i, 'permit_cnt'] += 1
        else:
            print("unexpected event_type: %d in row %d" % (event_type, i))
In [10]:
buildings[['311-calls','crimes','blight_violations','permit_cnt']].describe()
Out[10]:
In [11]:
buildings['norm_lon'] = (buildings['lon'] - np.mean(buildings['lon'].values)) / np.std(buildings['lon'].values)
buildings['norm_lat'] = (buildings['lat'] - np.mean(buildings['lat'].values)) / np.std(buildings['lat'].values)
In [12]:
buildings.head(2)
Out[12]:
In [13]:
buildings.to_csv('../data/buildings_with_features.csv', index=False)
In [14]:
import xgboost as xgb
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in sklearn 0.20
In [321]:
buildings = pd.read_csv('../data/buildings_with_features.csv')
balanced_data = pd.read_csv('../data/balanced_data.csv')
balanced_keys = pd.read_csv('../data/balanced_keys.csv')
In [322]:
buildings.head(2)
Out[322]:
In [323]:
balanced_data.head(3)
Out[323]:
In [324]:
balanced_buildings = buildings.loc[buildings['building_id'].isin(balanced_data['building_id'].values)].copy()
In [325]:
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'crimes', 'blight_violations']
feature_types = ['float', 'float', 'int', 'int', 'int']  # one entry per feature name
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt']
In [326]:
labels = balanced_buildings['blighted']
data = balanced_buildings.drop(names_to_drop, axis=1, inplace=False)
In [327]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=0)
In [328]:
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=500)
In [329]:
dtrain = xgb.DMatrix(x, label=y)
deval = xgb.DMatrix(x_eval, label=y_eval)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
    'booster': 'gbtree',
    'subsample': 1.0,
    'max_depth': 5,
    'min_child_weight': 2,
    'eta': 0.2,
    'gamma': 3,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'lambda': 3,  # L2 regularization
    'alpha': 1    # L1 regularization
}
watchlist = [(deval, 'eval'), (dtrain, 'train')]
num_round = 1000
In [330]:
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result, early_stopping_rounds=20, verbose_eval=False)
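With early stopping, xgboost records the round where the eval AUC peaked; these attributes (set by `xgb.train` when `early_stopping_rounds` is given) are what `best_ntree_limit`, used for prediction below, is based on:
In [ ]:
print(bst.best_score, bst.best_iteration, bst.best_ntree_limit)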
In [331]:
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,1.0)
plt.show()
In [332]:
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
In [333]:
round(roc_auc_score(y_test, preds_proba), 5)
Out[333]:
In [334]:
dtrain_total = xgb.DMatrix(x_train, label=y_train)
In [335]:
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=99,\
callbacks=[xgb.callback.print_evaluation(show_stdv=False),\
xgb.callback.early_stop(3)])
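`xgb.cv` returns a DataFrame of per-round metric means and standard deviations; a quick way to inspect the cross-validated AUC (column names as produced by this xgboost API):
In [ ]:
res[['train-auc-mean', 'test-auc-mean']].tail(3)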
In [336]:
plt.plot(np.arange(len(preds_proba)), preds_proba, 'bo', alpha=0.2)
plt.show()
In [337]:
y_pred_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
In [338]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro', label='XGBoost', alpha=0.4)
plt.legend(loc='best')
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()
In [342]:
fig, ax = plt.subplots(figsize=(20,15), dpi=600)
xgb.plot_tree(bst, num_trees=0, ax=ax)
plt.show()
In [42]:
feat_imp = sorted(bst.get_fscore().items())  # (feature, f-score) pairs, sorted by feature name
In [43]:
feat_names, feat_imps = zip(*feat_imp)
In [44]:
feat_imp_vis = pd.Series(feat_imps, index=feat_names)
fig = plt.figure()
feat_imp_vis.plot(kind='bar', title='Feature importance')
Out[44]:
According to this simplified analysis, location has the most predictive power for blight ('norm_lat' and 'norm_lon' are the normalized latitude and longitude, respectively).
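For comparison, xgboost's built-in `plot_importance` helper produces the same ranking directly from the booster:
In [ ]:
xgb.plot_importance(bst)  # plots the same f-scores returned by bst.get_fscore()
plt.show()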
In [45]:
data_crimes = pd.read_csv('../data/data_crime.csv')
In [46]:
data_crimes.CATEGORY.unique()
Out[46]:
In [47]:
crime_categories = {'more_serious': ['ASSAULT', 'LARCENY', 'STOLEN VEHICLE', 'BURGLARY', 'AGGRAVATED ASSAULT',\
'ROBBERY', 'KIDNAPING', 'OTHER BURGLARY', 'NEGLIGENT HOMICIDE', 'JUSTIFIABLE HOMICIDE',\
'FELONY DEATH FROM FLEEING VEHICLE', 'DANGEROUS DRUGS', 'ARSON', 'HOMICIDE'], \
'less_serious': ['WEAPONS OFFENSES', 'TRAFFIC VIOLATIONS-MOTORCYCLE VIOLATIONS', \
'DAMAGE TO PROPERTY', 'TRAFFIC VIOLATIONS-DRIVING ON SUSPENDED', 'FRAUD', 'OBSTRUCTING THE POLICE',\
'RUNAWAY', 'BRIBERY', 'EXTORTION', 'STOLEN PROPERTY', 'HEALTH-SAFETY', 'VAGRANCY (OTHER)', \
'ENVIRONMENT', 'EMBEZZLEMENT', 'FORGERY', 'CONSPIRACY BY COMPUTER', 'ANTITRUST', 'PUBLIC PEACE',\
'LIQUOR', 'OUIL', 'OBSCENITY', 'SOVEREIGNTY', 'TAX REVENUE', 'GAMBLING', 'IMMIGRATION', 'CONGRESS',\
'REVOKED', 'ELECTION LAWS', 'DRUNKENNESS', 'MISCELLANEOUS ARREST', 'MILITARY', 'SOLICITATION', \
'OUIL DISPOSE OF VEHICLE TO AVOID FORFEITURE', 'FAMILY OFFENSE', 'ESCAPE', 'OBSTRUCTING JUDICIARY']}
# based on data from gis.chicagopolice.org
In [48]:
data_crimes.head(2)
Out[48]:
In [49]:
def cat_crime(crime_str):
    '''numerical category:
    ---- more_serious: 1
    ---- less_serious: 0
    ---- unclassified: -1
    '''
    if crime_str in crime_categories['more_serious']:
        return 1
    elif crime_str in crime_categories['less_serious']:
        return 0
    else:
        return -1
In [50]:
data_crimes['num_cat'] = data_crimes['CATEGORY'].apply(cat_crime)
In [51]:
data_crimes['num_cat'].unique() # all crimes classified, no -1 encountered
Out[51]:
In [52]:
buildings.head(1) # refresher
Out[52]:
In [53]:
less_serious_crime_event_ids = data_crimes.loc[data_crimes['num_cat']==0,'event_id'].values
In [54]:
more_serious_crime_event_ids = data_crimes.loc[data_crimes['num_cat']==1,'event_id'].values
In [55]:
buildings['event_id_list'] = buildings['event_id_list'].apply(str_to_list)
In [56]:
buildings['less_serious_crimes'] = 0 # count of less serious crimes
buildings['more_serious_crimes'] = 0 # count of more serious crimes
buildings['event_id_list'] = buildings['event_id_list'].apply(lambda x: [int(i.strip("'")) for i in x])
for i in range(buildings.shape[0]):
    for event in buildings.loc[i, 'event_id_list']:
        if event in less_serious_crime_event_ids:
            buildings.loc[i, 'less_serious_crimes'] += 1
        elif event in more_serious_crime_event_ids:
            buildings.loc[i, 'more_serious_crimes'] += 1
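A note on the membership tests above: `event in less_serious_crime_event_ids` scans a numpy array linearly on every check. Converting the id collections to Python sets makes each lookup O(1) (a drop-in sketch for the loop above):
In [ ]:
# build the sets once, then use them in place of the arrays inside the loop
less_serious_set = set(less_serious_crime_event_ids)
more_serious_set = set(more_serious_crime_event_ids)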
In [57]:
buildings.head(1)
Out[57]:
In [58]:
buildings.to_csv('../data/buildings_with_features_2.csv', index=False)
In [59]:
balanced_buildings = buildings.loc[buildings['building_id'].isin(balanced_data['building_id'].values)].copy()
In [60]:
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'blight_violations', 'less_serious_crimes', 'more_serious_crimes']
feature_types = ['float', 'float', 'int', 'int', 'int', 'int']
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt', 'crimes']
In [61]:
labels = balanced_buildings['blighted']
data = balanced_buildings.drop(names_to_drop, axis=1, inplace=False)
In [62]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=0)
In [63]:
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=500)
In [207]:
dtrain = xgb.DMatrix(x, label=y)
deval = xgb.DMatrix(x_eval, label=y_eval)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
    'booster': 'gbtree',
    'subsample': 1.0,
    'max_depth': 8,
    'min_child_weight': 8,
    'eta': 0.1,
    'gamma': 5,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'lambda': 2,  # L2 regularization
    'alpha': 0    # L1 regularization
}
watchlist = [(deval, 'eval'), (dtrain, 'train')]
num_round = 2000
In [208]:
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result, early_stopping_rounds=20, verbose_eval=False)
In [209]:
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,1.0)
plt.show()
In [210]:
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
In [211]:
round(roc_auc_score(y_test, preds_proba), 5)
Out[211]:
In [212]:
dtrain_total = xgb.DMatrix(x_train, label=y_train)
In [213]:
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=99,\
callbacks=[xgb.callback.print_evaluation(show_stdv=False),\
xgb.callback.early_stop(3)])
In [214]:
plt.plot(np.arange(len(preds_proba)), preds_proba, 'bo', alpha=0.2)
plt.show()
In [215]:
y_pred_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
In [216]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro', label='XGBoost', alpha=0.4)
plt.legend(loc='best')
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()
In [217]:
feat_imp = sorted(bst.get_fscore().items())  # (feature, f-score) pairs, sorted by feature name
In [218]:
feat_names, feat_imps = zip(*feat_imp)
In [219]:
feat_imp_vis = pd.Series(feat_imps, index=feat_names)
fig = plt.figure()
feat_imp_vis.plot(kind='bar', title='Feature importance')
Out[219]:
The additional features created by splitting crimes by severity did not improve the score.
In [220]:
# Train on another balanced data set (1:1 blighted vs. non-blighted) whose non-blighted half is drawn from a different set of random indexes
balanced_data_2 = pd.read_csv('../data/balanced_data_2.csv')
balanced_keys_2 = pd.read_csv('../data/balanced_keys_2.csv')
In [221]:
balanced_buildings_2 = buildings.loc[buildings['building_id'].isin(balanced_data_2['building_id'].values)].copy()
In [222]:
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'blight_violations', 'less_serious_crimes', 'more_serious_crimes']
feature_types = ['float', 'float', 'int', 'int', 'int', 'int']
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt', 'crimes']
In [223]:
labels_2 = balanced_buildings_2['blighted']
data_2 = balanced_buildings_2.drop(names_to_drop, axis=1, inplace=False)
In [224]:
x_train, x_test, y_train, y_test = train_test_split(data_2, labels_2, test_size=0.2, stratify=labels_2, random_state=0)
In [225]:
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=500)
In [226]:
from sklearn.svm import SVC
In [232]:
clf = SVC(C=0.8, kernel='rbf', probability=True)
In [233]:
clf.fit(x_train, y_train)
Out[233]:
In [234]:
y_pred_2 = clf.predict_proba(x_test)
In [235]:
round(roc_auc_score(y_test, y_pred_2[:,1]), 5)
Out[235]:
In [236]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_2[:,1], pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro', label='SVC', alpha=0.4)
plt.legend(loc='best')
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()
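RBF-kernel SVMs are sensitive to feature scale, and only `norm_lat`/`norm_lon` are standardized here. A sketch of scaling every feature before the SVC (an assumption-laden alternative, using sklearn's `StandardScaler` and `make_pipeline`, not part of the original analysis):
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# scale all columns, not just the coordinates, before applying the RBF kernel
scaled_svc = make_pipeline(StandardScaler(), SVC(C=0.8, kernel='rbf', probability=True))
scaled_svc.fit(x_train, y_train)
round(roc_auc_score(y_test, scaled_svc.predict_proba(x_test)[:, 1]), 5)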
In [343]:
data_violations = pd.read_csv('../data/data_bv.csv')
In [344]:
data_violations.columns
Out[344]:
In [345]:
data_violations.FineAmt.unique() # the fine amount indicates the severity of a violation
Out[345]:
In [346]:
len(data_violations.ViolationCode.value_counts()) # too many distinct violation codes to use each as a category
Out[346]:
In [347]:
buildings = pd.read_csv('../data/buildings_with_features_2.csv')
balanced_data = pd.read_csv('../data/balanced_data.csv')
balanced_keys = pd.read_csv('../data/balanced_keys.csv')
In [348]:
def get_num_amt(FineAmt):
    '''convert a one-row FineAmt selection (e.g. "$100.00") to a numerical value'''
    amt_str = FineAmt.iloc[0]
    if isinstance(amt_str, float):  # NaN: no fine recorded
        return 0
    amt_str = amt_str.lstrip('$')
    amt = float(amt_str)
    return amt
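The same conversion can be done column-wise in one pass (a sketch; `errors='coerce'` turns malformed strings into NaN, which `fillna(0)` then maps to 0, matching `get_num_amt`):
In [ ]:
fine_amt = pd.to_numeric(data_violations['FineAmt'].str.lstrip('$'), errors='coerce').fillna(0)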
In [349]:
buildings['event_id_list'] = buildings['event_id_list'].apply(str_to_list)
In [350]:
buildings.head(1)
Out[350]:
In [351]:
buildings['trivial_v'] = 0 # count of violations with fine < $100
buildings['small_v'] = 0 # count of violations with $100 <= fine < $1000
buildings['medium_v'] = 0 # count of violations with $1000 <= fine < $5000
buildings['heavy_v'] = 0 # count of violations with fine >= $5000
buildings['event_id_list'] = buildings['event_id_list'].apply(lambda x: [int(i.strip("'")) for i in x])
In [352]:
violation_events = data_violations['event_id'].values
for i in range(buildings.shape[0]):
    for event in buildings.loc[i, 'event_id_list']:
        if event in violation_events:
            amt = get_num_amt(data_violations.loc[data_violations['event_id'] == event, 'FineAmt'])
            if amt < 100:  # NaN fines come back from get_num_amt as 0, so they land here
                buildings.loc[i, 'trivial_v'] += 1
            elif amt < 1000:
                buildings.loc[i, 'small_v'] += 1
            elif amt < 5000:
                buildings.loc[i, 'medium_v'] += 1
            else:  # amt >= 5000
                buildings.loc[i, 'heavy_v'] += 1
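The four thresholds above define fine-amount bins; `pd.cut` expresses the same bucketing declaratively (a sketch reusing the hypothetical `fine_amt` series from the earlier sketch; `right=False` makes each bin half-open, [low, high)):
In [ ]:
v_bins = pd.cut(fine_amt, bins=[-np.inf, 100, 1000, 5000, np.inf], right=False,
                labels=['trivial_v', 'small_v', 'medium_v', 'heavy_v'])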
In [353]:
buildings.head(1)
Out[353]:
In [1029]:
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'less_serious_crimes', 'more_serious_crimes', 'trivial_v', 'small_v', 'medium_v', 'heavy_v']
feature_types = ['float', 'float', 'int', 'int', 'int', 'int', 'int', 'int', 'int']  # one entry per feature name
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt', 'crimes', 'blight_violations']
In [1030]:
buildings.to_csv('../data/buildings_with_features_3.csv', index=False)
In [1031]:
balanced_buildings = buildings.loc[buildings['building_id'].isin(balanced_data['building_id'].values)].copy()
In [1032]:
labels = balanced_buildings['blighted']
data = balanced_buildings.drop(names_to_drop, axis=1, inplace=False)
In [1033]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=1234)
In [1038]:
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=6890)
In [1039]:
dtrain = xgb.DMatrix(x, label=y)
deval = xgb.DMatrix(x_eval, label=y_eval)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
    'booster': 'gbtree',
    'subsample': 1.0,
    'max_depth': 7,
    'min_child_weight': 4.0,
    'eta': 0.2,
    'gamma': 3,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'lambda': 3,   # L2 regularization
    'alpha': 0.5   # L1 regularization
}
watchlist = [(deval, 'eval'), (dtrain, 'train')]
num_round = 1000
In [1040]:
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result, early_stopping_rounds=40, verbose_eval=False)
In [1041]:
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,1.0)
plt.show()
In [1042]:
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
In [1043]:
round(roc_auc_score(y_test, preds_proba), 5)
Out[1043]:
In [1044]:
dtrain_total = xgb.DMatrix(x_train, label=y_train)
In [1045]:
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=99,\
callbacks=[xgb.callback.print_evaluation(show_stdv=False),\
xgb.callback.early_stop(3)])
In [972]:
plt.plot(np.arange(len(preds_proba)), preds_proba, 'bo', alpha=0.2)
plt.show()
In [973]:
y_pred_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
In [974]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro', label='XGBoost', alpha=0.4)
plt.legend(loc='best')
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()
In [975]:
feat_imp = sorted(bst.get_fscore().items())  # (feature, f-score) pairs, sorted by feature name
In [976]:
feat_names, feat_imps = zip(*feat_imp)
In [977]:
feat_imp_vis = pd.Series(feat_imps, index=feat_names)
feat_imp_vis.sort_values(inplace=True)
fig = plt.figure()
feat_imp_vis.plot(kind='bar', title='Feature importance')
Out[977]:
In [1309]:
blighted_buildings = buildings.loc[buildings['blighted'] == 1, :].copy()
nonblighted_buildings = buildings.loc[buildings['blighted']==0, :].copy()
In [1310]:
n_blighted = blighted_buildings.shape[0]
n_nonblighted = nonblighted_buildings.shape[0]
print("number of blighted buildings: %d" % n_blighted)
print("number of non-blighted buildings: %d " % n_nonblighted)
In [1371]:
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'crimes', 'blight_violations']
feature_types = ['float', 'float', 'int', 'int', 'int']  # one entry per feature name
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'less_serious_crimes', 'more_serious_crimes', 'trivial_v', 'small_v', 'medium_v', 'heavy_v', 'permit_cnt']
In [1372]:
n_test_b = int(n_blighted*0.2) # number of blighted buildings in the test set
In [1373]:
index_b_test = np.random.choice(blighted_buildings.index, n_test_b, replace=False) #blighted
index_nb_test = np.random.choice(nonblighted_buildings.index, n_test_b, replace=False) #nonblighted
In [1374]:
blighted_test = blighted_buildings.loc[index_b_test,:]
nonblighted_test = nonblighted_buildings.loc[index_nb_test,:]
balanced_test = pd.concat([blighted_test.copy(), nonblighted_test.copy()])
balanced_test = balanced_test.sample(frac=1, replace=False).reset_index(drop=True)
In [1375]:
test_x = balanced_test.drop(names_to_drop, axis=1, inplace=False)
test_y = balanced_test.loc[:,['blighted']].copy()
In [1376]:
# Training data are drawn from the remaining buildings
rest_blighted_buildings = blighted_buildings.loc[~blighted_buildings.index.isin(index_b_test),:].copy()
rest_nonblighted_buildings = nonblighted_buildings.loc[~nonblighted_buildings.index.isin(index_nb_test),:].copy()
In [1377]:
n_train_b = rest_blighted_buildings.shape[0] # number of rows to choose from each kind
In [1394]:
indexes_b_train = []
# draw 5 balanced training index pairs from the same pool; each draw is without
# replacement, but the 5 sets may overlap across draws
for i in range(5):
    index_b_train = np.random.choice(rest_blighted_buildings.index, n_train_b, replace=False)
    index_nb_train = np.random.choice(rest_nonblighted_buildings.index, n_train_b, replace=False)
    indexes_b_train.append([index_b_train, index_nb_train])
In [1395]:
train_list = []
for index_pair in indexes_b_train:
    index_b, index_nb = tuple(index_pair)
    blighted_train = rest_blighted_buildings.loc[index_b, :]
    nonblighted_train = rest_nonblighted_buildings.loc[index_nb, :]
    balanced_train = pd.concat([blighted_train.copy(), nonblighted_train.copy()])
    balanced_train = balanced_train.sample(frac=1, replace=False).reset_index(drop=True)
    y_train = balanced_train['blighted'].copy()
    x_train = balanced_train.drop(names_to_drop, axis=1, inplace=False)
    train_list.append((x_train, y_train))
In [1474]:
param = {
    'booster': 'gbtree',
    'subsample': 1.0,
    'max_depth': 6,
    'min_child_weight': 3.0,
    'eta': 0.2,
    'gamma': 3.0,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'lambda': 4,  # L2 regularization
    'alpha': 0    # L1 regularization
}
num_round = 1000
In [1475]:
def train_model(x, x_eval, y, y_eval, param):
    '''train a booster with early stopping on the given train/eval split'''
    dtrain = xgb.DMatrix(x, label=y)
    deval = xgb.DMatrix(x_eval, label=y_eval)
    watchlist = [(deval, 'eval'), (dtrain, 'train')]
    evals_result = {}
    bst = xgb.train(param, dtrain, num_boost_round=1000, evals=watchlist, evals_result=evals_result,
                    early_stopping_rounds=40, verbose_eval=False)
    return (bst, evals_result)
In [1476]:
x_train = train_list[0][0]
y_train = train_list[0][1]
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train)
In [1477]:
bst, evals_result = train_model(x, x_eval, y, y_eval, param)
In [1478]:
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.82,0.9)
plt.show()
In [1479]:
# rebuild the DMatrix for the current training set: the dtrain_total defined
# earlier still referred to a previous data split
dtrain_total = xgb.DMatrix(x_train, label=y_train)
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=9999,
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                        xgb.callback.early_stop(3)])
In [1480]:
dtest = xgb.DMatrix(test_x, label=test_y['blighted'])
In [1481]:
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
In [1482]:
round(roc_auc_score(test_y, preds_proba), 5)
Out[1482]:
In [1488]:
bsts = []
fig, axes = plt.subplots(len(train_list), sharex=True, sharey=True, figsize=(6,12))
for i, (x_train, y_train) in enumerate(train_list):
    x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train)
    bst, evals_result = train_model(x, x_eval, y, y_eval, param)
    axes[i].plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',
                 np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
    bsts.append(bst)
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,0.9)
plt.show()
In [1484]:
y_preds = np.mean(np.array([bst.predict(dtest, ntree_limit=bst.best_ntree_limit) for bst in bsts]), axis=0)
In [1485]:
round(roc_auc_score(test_y, y_preds), 5)
Out[1485]:
Averaging the predictions of the five models reduced variance and achieved an AUC score of 0.8625.
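To see the variance-reduction effect directly, each model's individual test AUC can be compared against the averaged ensemble's (uses only objects defined above):
In [ ]:
for b in bsts:
    print(round(roc_auc_score(test_y, b.predict(dtest, ntree_limit=b.best_ntree_limit)), 5))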
In [1489]:
fpr, tpr, thresholds = roc_curve(test_y, y_preds, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro', label='XGBoost ensemble', alpha=0.4)
plt.legend(loc='best')
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.savefig('../data/ROC_Curve_combined.png')
plt.show()
In [ ]: