In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Loading data



In [2]:

    
buildings = pd.read_csv("../data/buildings.csv")



In [3]:

    
events = pd.read_csv("../data/events.csv")



In [4]:

    
buildings.head(2)









    Out[4]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
    
  
  
    
      0
      13500 norborne
      0
      [138721]
      42.3842
      -83.2975
      -83.297706
      42.384098
      -83.297294
      42.384302
      0
    
    
      1
      26200 mich ave inkster mi 48141
      0
      [116216]
      42.2954
      -83.2968
      -83.297006
      42.295298
      -83.296594
      42.295502
      1



In [5]:

    
events.head(2)









    Out[5]:






  
    
      
      event_id
      lon
      lat
      addr
      type
    
  
  
    
      0
      0
      -83.161039
      42.383998
      13120-13130 ilene st
      1
    
    
      1
      1
      -83.080919
      42.440471
      1485 e outer dr
      1



In [6]:

    
events['type'].value_counts()   # types: 1: 311-calls, 2: crimes, 3: blight violations









    Out[6]:





3    307804
2    119394
1     19679
4      6315
Name: type, dtype: int64

Add counts for each type into buildings



In [7]:

    
def str_to_list(events_str):
    events_list = events_str.rstrip(']').lstrip('[').split(', ')
    return events_list



In [8]:

    
buildings.loc[:, 'event_id_list'] = buildings.loc[:,'event_id_list'].copy().apply(lambda x: str_to_list(x))



In [9]:

    
buildings['311-calls'] = np.zeros(buildings.shape[0])
buildings['crimes'] = np.zeros(buildings.shape[0])
buildings['blight_violations'] = np.zeros(buildings.shape[0])
buildings['permit_cnt'] = np.zeros(buildings.shape[0])

for i in range(buildings.shape[0]):
    for event in buildings.loc[i, 'event_id_list']:
        event = int(event)
        event_type = events.loc[event,'type']
        if event_type == 1:
              buildings.loc[i, '311-calls'] += 1
        elif event_type == 2:
              buildings.loc[i, 'crimes'] += 1
        elif event_type == 3:
              buildings.loc[i, 'blight_violations'] += 1
        elif event_type == 4:
              buildings.loc[i, 'permit_cnt'] += 1
        else:
              print("unexpected event_type: %d in row %d" % (event_type, i))



In [10]:

    
buildings[['311-calls','crimes','blight_violations','permit_cnt']].describe()









    Out[10]:






  
    
      
      311-calls
      crimes
      blight_violations
      permit_cnt
    
  
  
    
      count
      130727.000000
      130727.000000
      130727.000000
      130727.000000
    
    
      mean
      0.134142
      0.831328
      0.760715
      0.021373
    
    
      std
      0.390493
      1.094696
      0.958943
      0.155434
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.000000
      1.000000
      0.000000
      0.000000
    
    
      75%
      0.000000
      1.000000
      1.000000
      0.000000
    
    
      max
      12.000000
      9.000000
      12.000000
      5.000000

Normalize longitude and latitude



In [11]:

    
buildings['norm_lon'] = (buildings['lon'].copy() - np.mean(buildings['lon'].values))/np.std(buildings['lon'].values)
buildings['norm_lat'] = (buildings['lat'].copy() - np.mean(buildings['lat'].values))/np.std(buildings['lat'].values)



In [12]:

    
buildings.head(2)









    Out[12]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
      311-calls
      crimes
      blight_violations
      permit_cnt
      norm_lon
      norm_lat
    
  
  
    
      0
      13500 norborne
      0
      [138721]
      42.3842
      -83.2975
      -83.297706
      42.384098
      -83.297294
      42.384302
      0
      0.0
      1.0
      0.0
      0.0
      -1.905905
      -0.172687
    
    
      1
      26200 mich ave inkster mi 48141
      0
      [116216]
      42.2954
      -83.2968
      -83.297006
      42.295298
      -83.296594
      42.295502
      1
      0.0
      1.0
      0.0
      0.0
      -1.898702
      -2.553627



In [13]:

    
buildings.to_csv('../data/buildings_with_features.csv', index=False)

Analysis features



In [14]:

    
import xgboost as xgb
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.cross_validation import train_test_split



In [321]:

    
buildings = pd.read_csv('../data/buildings_with_features.csv')
balanced_data = pd.read_csv('../data/balanced_data.csv')
balanced_keys = pd.read_csv('../data/balanced_keys.csv')



In [322]:

    
buildings.head(2)









    Out[322]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
      311-calls
      crimes
      blight_violations
      permit_cnt
      norm_lon
      norm_lat
    
  
  
    
      0
      13500 norborne
      0
      ['138721']
      42.3842
      -83.2975
      -83.297706
      42.384098
      -83.297294
      42.384302
      0
      0.0
      1.0
      0.0
      0.0
      -1.905905
      -0.172687
    
    
      1
      26200 mich ave inkster mi 48141
      0
      ['116216']
      42.2954
      -83.2968
      -83.297006
      42.295298
      -83.296594
      42.295502
      1
      0.0
      1.0
      0.0
      0.0
      -1.898702
      -2.553627



In [323]:

    
balanced_data.head(3)









    Out[323]:






  
    
      
      addr
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
    
  
  
    
      0
      15360 tracey
      [448971, 118987]
      42.403010
      -83.180173
      -83.180379
      42.402908
      -83.179968
      42.403112
      38754
    
    
      1
      4817 chopin
      [143959]
      42.332660
      -83.134467
      -83.134672
      42.332558
      -83.134261
      42.332762
      60569
    
    
      2
      5300 wayburn st
      [13405]
      42.400448
      -82.955224
      -82.955430
      42.400346
      -82.955019
      42.400550
      121589



In [324]:

    
balanced_buildings = buildings.loc[buildings['building_id'].isin(balanced_data['building_id'].values)].copy()



In [325]:

    
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'crimes', 'blight_violations']
feature_types = ['float', 'float', 'int', 'int', 'int', 'int']
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt']



In [326]:

    
labels = balanced_buildings['blighted']
data = balanced_buildings.drop(names_to_drop, axis=1, inplace=False)



In [327]:

    
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=0)



In [328]:

    
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=500)



In [329]:

    
dtrain = xgb.DMatrix(x, label=y)
deval = xgb.DMatrix(x_eval, label=y_eval)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
    'booster': 'gbtree',
    'subsample': 1.0,
    'max_depth': 5,
    'min_child_weight': 2,
    'eta': 0.2,
    'gamma': 3,
    'objective':'binary:logistic',
    'eval_metric': 'auc',
    'lambda': 3, # L2 regularization,
    'alpha': 1  # L1 regularization
}
watchlist = [(deval, 'eval'), (dtrain, 'train')]
num_round = 1000



In [330]:

    
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result, early_stopping_rounds=20, verbose_eval=False)



In [331]:

    
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
        np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,1.0)
plt.show()



In [332]:

    
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)



In [333]:

    
round(roc_auc_score(y_test, preds_proba), 5)









    Out[333]:





0.86687000000000003



In [334]:

    
dtrain_total = xgb.DMatrix(x_train, label=y_train)

Cross Validation



In [335]:

    
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=99,\
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),\
                       xgb.callback.early_stop(3)])









    



[0]	train-auc:0.851934	test-auc:0.838149
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 3 rounds.
[1]	train-auc:0.852181	test-auc:0.838536
[2]	train-auc:0.854876	test-auc:0.840773
[3]	train-auc:0.857873	test-auc:0.842309
[4]	train-auc:0.862297	test-auc:0.846874
[5]	train-auc:0.86441	test-auc:0.848298
[6]	train-auc:0.865076	test-auc:0.849315
[7]	train-auc:0.866884	test-auc:0.849802
[8]	train-auc:0.867735	test-auc:0.849666
[9]	train-auc:0.869543	test-auc:0.849594



In [336]:

    
plt.plot(np.arange(len(preds_proba)), preds_proba, 'bo', alpha=0.2)
plt.show()



In [337]:

    
y_pred_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)



In [338]:

    
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro',label='RF', alpha=0.4)
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()



In [342]:

    
fig, ax = plt.subplots(figsize=(20,15), dpi=600)
xgb.plot_tree(bst, num_trees=0, ax=ax)
plt.show()



In [42]:

    
feat_imp = [(k,v) for (k,v) in bst.get_fscore().items()]   # python3
feat_imp.sort()



In [43]:

    
feat_names, feat_imps = zip(*feat_imp)



In [44]:

    
feat_imp_vis = pd.Series(feat_imps, index=feat_names)

fig = plt.figure()
feat_imp_vis.plot(kind='bar', title='Feature importance')









    Out[44]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f7362692e80>

According to this simplified analysis, location has the most predictive power for blightness. ('norm_lat' and 'norm_lon' correspond to 'normalized latitude' and 'normalized longitude' respectively)

More features from differentiating crimes



In [45]:

    
data_crimes = pd.read_csv('../data/data_crime.csv')









    



/usr/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2705: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [46]:

    
data_crimes.CATEGORY.unique()









    Out[46]:





array(['ASSAULT', 'LARCENY', 'STOLEN VEHICLE', 'WEAPONS OFFENSES',
       'TRAFFIC VIOLATIONS-MOTORCYCLE VIOLATIONS', 'BURGLARY',
       'AGGRAVATED ASSAULT', 'DAMAGE TO PROPERTY',
       'TRAFFIC VIOLATIONS-DRIVING ON SUSPENDED', 'ROBBERY', 'FRAUD',
       'OBSTRUCTING JUDICIARY', 'DANGEROUS DRUGS', 'HOMICIDE',
       'SOLICITATION', 'OUIL DISPOSE OF VEHICLE TO AVOID FORFEITURE',
       'FAMILY OFFENSE', 'ARSON', 'ESCAPE', 'OBSTRUCTING THE POLICE',
       'RUNAWAY', 'BRIBERY', 'KIDNAPING', 'EXTORTION', 'OTHER BURGLARY',
       'STOLEN PROPERTY', 'HEALTH-SAFETY', 'VAGRANCY (OTHER)',
       'ENVIRONMENT', 'EMBEZZLEMENT', 'FORGERY', 'CONSPIRACY BY COMPUTER',
       'ANTITRUST', 'PUBLIC PEACE', 'LIQUOR', 'OUIL', 'OBSCENITY',
       'SOVEREIGNTY', 'TAX REVENUE', 'NEGLIGENT HOMICIDE', 'GAMBLING',
       'IMMIGRATION', 'CONGRESS', 'JUSTIFIABLE HOMICIDE', 'REVOKED',
       'ELECTION LAWS', 'DRUNKENNESS', 'FELONY DEATH FROM FLEEING VEHICLE',
       'MISCELLANEOUS ARREST', 'MILITARY'], dtype=object)



In [47]:

    
crime_categories = {'more_serious': ['ASSAULT', 'LARCENY', 'STOLEN VEHICLE', 'BURGLARY', 'AGGRAVATED ASSAULT',\
                    'ROBBERY', 'KIDNAPING', 'OTHER BURGLARY', 'NEGLIGENT HOMICIDE', 'JUSTIFIABLE HOMICIDE',\
                    'FELONY DEATH FROM FLEEING VEHICLE', 'DANGEROUS DRUGS', 'ARSON', 'HOMICIDE'], \
                   'less_serious': ['WEAPONS OFFENSES', 'TRAFFIC VIOLATIONS-MOTORCYCLE VIOLATIONS', \
                    'DAMAGE TO PROPERTY', 'TRAFFIC VIOLATIONS-DRIVING ON SUSPENDED', 'FRAUD', 'OBSTRUCTING THE POLICE',\
                   'RUNAWAY', 'BRIBERY', 'EXTORTION', 'STOLEN PROPERTY', 'HEALTH-SAFETY', 'VAGRANCY (OTHER)', \
                    'ENVIRONMENT', 'EMBEZZLEMENT', 'FORGERY', 'CONSPIRACY BY COMPUTER', 'ANTITRUST', 'PUBLIC PEACE',\
                   'LIQUOR', 'OUIL', 'OBSCENITY', 'SOVEREIGNTY', 'TAX REVENUE', 'GAMBLING', 'IMMIGRATION', 'CONGRESS',\
                   'REVOKED', 'ELECTION LAWS', 'DRUNKENNESS', 'MISCELLANEOUS ARREST', 'MILITARY', 'SOLICITATION', \
                   'OUIL DISPOSE OF VEHICLE TO AVOID FORFEITURE', 'FAMILY OFFENSE', 'ESCAPE', 'OBSTRUCTING JUDICIARY']}

# based on data from gis.chicagopolice.org



In [48]:

    
data_crimes.head(2)









    Out[48]:






  
    
      
      ROWNUM
      CASEID
      INCINO
      CATEGORY
      OFFENSEDESCRIPTION
      STATEOFFENSEFILECLASS
      INCIDENTDATE
      HOUR
      SCA
      PRECINCT
      COUNCIL
      NEIGHBORHOOD
      CENSUSTRACT
      LON
      LAT
      LOCATION
      addr
      event_id
      type
    
  
  
    
      0
      53256
      1953933
      1506030028.1
      ASSAULT
      ASSAULT AND BATTERY/SIMPLE ASSAULT
      13001.0
      06/03/2015 12:00:00 AM
      2
      1007.0
      10.0
      City Council District 5
      PETOSKEY-OTSEGO
      5334.0
      -83.1221
      42.3678
      PETOSKEY\n09100\n(42.3676, -83.1219)
      09100 petoskey
      19679
      2
    
    
      1
      17631
      1917717
      1503010158.1
      LARCENY
      LARCENY - PARTS AND ACCESSORIES FROM VEHICLE
      23006.0
      03/01/2015 12:00:00 AM
      9
      608.0
      6.0
      City Council District 7
      GRANDALE
      5452.0
      -83.2035
      42.3724
      00 PLYMOUTH AND MANSFIELD\n(42.3725, -83.2033)
      00 plymouth and mansfield
      19680
      2



In [49]:

    
def cat_crime(crime_str):
    '''numerical category: 
       ---- more_serious: 1
       ---- less_serious: 0
       ---- unclassified: -1
    '''
    if crime_str in crime_categories['more_serious']:
        return 1
    elif crime_str in crime_categories['less_serious']:
        return 0
    else:
        return -1



In [50]:

    
data_crimes['num_cat'] = data_crimes['CATEGORY'].apply(cat_crime)



In [51]:

    
data_crimes['num_cat'].unique()   # all crimes classified, no -1 encountered









    Out[51]:





array([1, 0])



In [52]:

    
buildings.head(1)  # refresher









    Out[52]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
      311-calls
      crimes
      blight_violations
      permit_cnt
      norm_lon
      norm_lat
    
  
  
    
      0
      13500 norborne
      0
      ['138721']
      42.3842
      -83.2975
      -83.297706
      42.384098
      -83.297294
      42.384302
      0
      0.0
      1.0
      0.0
      0.0
      -1.905905
      -0.172687



In [53]:

    
less_serious_crime_event_ids = data_crimes.loc[data_crimes['num_cat']==0,'event_id'].values



In [54]:

    
more_serious_crime_event_ids = data_crimes.loc[data_crimes['num_cat']==1,'event_id'].values



In [55]:

    
buildings.loc[:, 'event_id_list'] = buildings.loc[:,'event_id_list'].copy().apply(lambda x: str_to_list(x))



In [56]:

    
buildings['less_serious_crimes'] = 0       # count of less serious crimes
buildings['more_serious_crimes'] = 0       # count of more serious crimes
buildings['event_id_list'] = buildings['event_id_list'].apply(lambda x: [int(i.rstrip("'").lstrip("'")) for i in x])

for i in range(buildings.shape[0]):
    for event in buildings.loc[i, 'event_id_list']:
        if event in less_serious_crime_event_ids:
            buildings.loc[i, 'less_serious_crimes'] += 1
        elif event in more_serious_crime_event_ids:
            buildings.loc[i, 'more_serious_crimes'] += 1



In [57]:

    
buildings.head(1)









    Out[57]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
      311-calls
      crimes
      blight_violations
      permit_cnt
      norm_lon
      norm_lat
      less_serious_crimes
      more_serious_crimes
    
  
  
    
      0
      13500 norborne
      0
      [138721]
      42.3842
      -83.2975
      -83.297706
      42.384098
      -83.297294
      42.384302
      0
      0.0
      1.0
      0.0
      0.0
      -1.905905
      -0.172687
      1
      0



In [58]:

    
buildings.to_csv('../data/buildings_with_features_2.csv', index=False)



In [59]:

    
balanced_buildings = buildings.loc[buildings['building_id'].isin(balanced_data['building_id'].values)].copy()



In [60]:

    
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'blight_violations', 'less_serious_crimes', 'more_serious_crimes']
feature_types = ['float', 'float', 'int', 'int', 'int', 'int']
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt', 'crimes']



In [61]:

    
labels = balanced_buildings['blighted']
data = balanced_buildings.drop(names_to_drop, axis=1, inplace=False)



In [62]:

    
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=0)



In [63]:

    
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=500)



In [207]:

    
dtrain = xgb.DMatrix(x, label=y)
deval = xgb.DMatrix(x_eval, label=y_eval)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
    'booster': 'gbtree',
    'subsample': 1.0,
    'max_depth': 8,
    'min_child_weight': 8,
    'eta': 0.1,
    'gamma': 5,
    'objective':'binary:logistic',
    'eval_metric': 'auc',
    'lambda': 2, # L2 regularization,
    'alpha': 0  # L1 regularization
}
watchlist = [(deval, 'eval'), (dtrain, 'train')]
num_round = 2000



In [208]:

    
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result, early_stopping_rounds=20, verbose_eval=False)



In [209]:

    
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
        np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,1.0)
plt.show()



In [210]:

    
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)



In [211]:

    
round(roc_auc_score(y_test, preds_proba), 5)









    Out[211]:





0.86460000000000004



In [212]:

    
dtrain_total = xgb.DMatrix(x_train, label=y_train)

Cross Validation



In [213]:

    
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=99,\
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),\
                       xgb.callback.early_stop(3)])









    



[0]	train-auc:0.856021	test-auc:0.839913
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 3 rounds.
[1]	train-auc:0.857758	test-auc:0.84213
[2]	train-auc:0.858127	test-auc:0.842366
[3]	train-auc:0.859393	test-auc:0.842376
[4]	train-auc:0.859571	test-auc:0.842448
[5]	train-auc:0.860936	test-auc:0.84314
[6]	train-auc:0.861729	test-auc:0.843287
[7]	train-auc:0.862135	test-auc:0.842917
[8]	train-auc:0.863142	test-auc:0.844508
[9]	train-auc:0.864206	test-auc:0.844289



In [214]:

    
plt.plot(np.arange(len(preds_proba)), preds_proba, 'bo', alpha=0.2)
plt.show()



In [215]:

    
y_pred_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)



In [216]:

    
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro',label='RF', alpha=0.4)
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()



In [217]:

    
feat_imp = [(k,v) for (k,v) in bst.get_fscore().items()]   # python3
feat_imp.sort()



In [218]:

    
feat_names, feat_imps = zip(*feat_imp)



In [219]:

    
feat_imp_vis = pd.Series(feat_imps, index=feat_names)

fig = plt.figure()
feat_imp_vis.plot(kind='bar', title='Feature importance')









    Out[219]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f737d6ed390>

The additional feature created using differentiated crimes did not provide a better answer.

SVM



In [220]:

    
# Train using another balanced data set (1:1 blighted vs nonblighted) with nonblighted of another list of random indexes
balanced_data_2 = pd.read_csv('../data/balanced_data_2.csv')
balanced_keys_2 = pd.read_csv('../data/balanced_keys_2.csv')



In [221]:

    
balanced_buildings_2 = buildings.loc[buildings['building_id'].isin(balanced_data_2['building_id'].values)].copy()



In [222]:

    
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'blight_violations', 'less_serious_crimes', 'more_serious_crimes']
feature_types = ['float', 'float', 'int', 'int', 'int', 'int']
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt', 'crimes']



In [223]:

    
labels_2 = balanced_buildings_2['blighted']
data_2 = balanced_buildings_2.drop(names_to_drop, axis=1, inplace=False)



In [224]:

    
x_train, x_test, y_train, y_test = train_test_split(data_2, labels_2, test_size=0.2, stratify=labels, random_state=0)



In [225]:

    
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=500)



In [226]:

    
from sklearn.svm import SVC



In [232]:

    
clf = SVC(C=0.8, kernel='rbf', probability=True)



In [233]:

    
clf.fit(x_train, y_train)









    Out[233]:





SVC(C=0.8, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)



In [234]:

    
y_pred_2 = clf.predict_proba(x_test)



In [235]:

    
round(roc_auc_score(y_test, y_pred_2[:,1]), 5)









    Out[235]:





0.83265



In [236]:

    
fpr, tpr, thresholds = roc_curve(y_test, y_pred_2[:,1], pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro',label='RF', alpha=0.4)
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()

More features from differentiating 'blight violations'

We still have only a very small number of features so far. And one should still benefit from exploring features.



In [343]:

    
data_violations = pd.read_csv('../data/data_bv.csv')









    



/usr/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2705: DtypeWarning: Columns (6,10,11,12) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [344]:

    
data_violations.columns









    Out[344]:





Index(['TicketID', 'TicketNumber', 'AgencyName', 'ViolName',
       'ViolationStreetNumber', 'ViolationStreetName', 'MailingStreetNumber',
       'MailingStreetName', 'MailingCity', 'MailingState', 'MailingZipCode',
       'NonUsAddressCode', 'Country', 'TicketIssuedDT', 'TicketIssuedTime',
       'HearingDT', 'CourtTime', 'ViolationCode', 'ViolDescription',
       'Disposition', 'FineAmt', 'AdminFee', 'LateFee', 'StateFee',
       'CleanUpCost', 'JudgmentAmt', 'PaymentStatus', 'Void',
       'ViolationCategory', 'ViolationAddress', 'MailingAddress', 'coord',
       'addr', 'lon', 'lat', 'event_id', 'type'],
      dtype='object')



In [345]:

    
data_violations.FineAmt.unique()     # this indicate importance of violations









    Out[345]:





array(['$1500.00', '$100.00', '$3500.00', '$200.00', '$2500.00', '$500.00',
       '$10000.00', '$1000.00', '$5000.00', '$250.00', '$25.00', '$300.00',
       '$50.00', '$750.00', '$20.00', '$125.00', '$1470.00', '$350.00',
       '$600.00', '$3000.00', '$1250.00', '$400.00', '$170.00', '$1750.00',
       '$1200.00', '$2000.00', '$970.00', '$7000.00', nan, '$150.00',
       '$2695.00', '$8000.00', '$95.00', '$345.00', '$270.00', '$30.00',
       '$450.00', '$160.00', '$220.00', '$375.00', '$1030.00', '$1.00',
       '$677.00', '$280.00', '$4000.00', '$90.00', '$290.00', '$370.00',
       '$1450.00', '$800.00', '$0.00', '$60.00', '$440.00', '$470.00',
       '$230.00'], dtype=object)



In [346]:

    
len(data_violations.ViolationCode.value_counts())   # Too many kind of violations if by category









    Out[346]:





265



In [347]:

    
buildings = pd.read_csv('../data/buildings_with_features_2.csv')
balanced_data = pd.read_csv('../data/balanced_data.csv')
balanced_keys = pd.read_csv('../data/balanced_keys.csv')



In [348]:

    
def get_num_amt(FineAmt):
    '''convert FineAmt string to numerical values'''
    amt_str = FineAmt.tolist()[0]
    if isinstance(amt_str, float):   # nan
        # print(amt_str)
        return 0
    amt_str = amt_str.lstrip('$')
    amt = float(amt_str)
    return amt



In [349]:

    
buildings.loc[:, 'event_id_list'] = buildings.loc[:,'event_id_list'].copy().apply(lambda x: str_to_list(x))



In [350]:

    
buildings.head(1)









    Out[350]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
      311-calls
      crimes
      blight_violations
      permit_cnt
      norm_lon
      norm_lat
      less_serious_crimes
      more_serious_crimes
    
  
  
    
      0
      13500 norborne
      0
      [138721]
      42.3842
      -83.2975
      -83.297706
      42.384098
      -83.297294
      42.384302
      0
      0.0
      1.0
      0.0
      0.0
      -1.905905
      -0.172687
      1
      0



In [351]:

    
buildings['trivial_v'] = 0       # count of violation of minimal importance (< $100)
buildings['small_v'] = 0       # count of violation of small importance ($100 <= v < $1000 )
buildings['medium_v'] = 0      # count of violation with ($1000 <= v < $5000)
buildings['heavy_v'] = 0       # count of violation with (>=$5000)

buildings['event_id_list'] = buildings['event_id_list'].apply(lambda x: [int(i.rstrip("'").lstrip("'")) for i in x])



In [352]:

    
violation_events = data_violations['event_id'].values
for i in range(buildings.shape[0]):
    for event in buildings.loc[i, 'event_id_list']:
        if event in violation_events:
            amt = get_num_amt(data_violations.loc[data_violations['event_id']==event, 'FineAmt'])
            if amt < 100:
                buildings.loc[i, 'trivial_v'] += 1
            elif amt >= 100 and amt < 1000:
                buildings.loc[i, 'small_v'] += 1
            elif amt >= 1000 and amt < 5000:
                buildings.loc[i, 'medium_v'] += 1
            elif amt >= 5000:
                buildings.loc[i, 'heavy_v'] += 1
            else: # nan
                buildings.loc[i, 'trivial_v'] += 1



In [353]:

    
buildings.head(1)









    Out[353]:






  
    
      
      addr
      blighted
      event_id_list
      lat
      lon
      llcrnrlon
      llcrnrlat
      urcrnrlon
      urcrnrlat
      building_id
      ...
      blight_violations
      permit_cnt
      norm_lon
      norm_lat
      less_serious_crimes
      more_serious_crimes
      trivial_v
      small_v
      medium_v
      heavy_v
    
  
  
    
      0
      13500 norborne
      0
      [138721]
      42.3842
      -83.2975
      -83.297706
      42.384098
      -83.297294
      42.384302
      0
      ...
      0.0
      0.0
      -1.905905
      -0.172687
      1
      0
      0
      0
      0
      0
    
  

1 rows × 22 columns



In [1029]:

    
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'less_serious_crimes', 'more_serious_crimes', 'trivial_v', 'small_v', 'medium_v', 'heavy_v']
feature_types = ['float', 'float', 'int', 'int', 'int', 'int']
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'permit_cnt', 'crimes', 'blight_violations']



In [1030]:

    
buildings.to_csv('../data/buildings_with_features_3.csv', index=False)



In [1031]:

    
balanced_buildings = buildings.loc[buildings['building_id'].isin(balanced_data['building_id'].values)].copy()



In [1032]:

    
labels = balanced_buildings['blighted']
data = balanced_buildings.drop(names_to_drop, axis=1, inplace=False)



In [1033]:

    
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=1234)



In [1038]:

    
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=6890)



In [1039]:

    
dtrain = xgb.DMatrix(x, label=y)
deval = xgb.DMatrix(x_eval, label=y_eval)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
        'booster': 'gbtree',
        'subsample': 1.0,
        'max_depth': 7,
        'min_child_weight': 4.0,
        'eta': 0.2,
        'gamma': 3,
        'objective':'binary:logistic',
        'eval_metric': 'auc',
        'lambda': 3, # L2 regularization,
        'alpha': 0.5  # L1 regularization
    }
watchlist = [(deval, 'eval'), (dtrain, 'train')]
num_round = 1000



In [1040]:

    
evals_result = {}
bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result, early_stopping_rounds=40, verbose_eval=False)



In [1041]:

    
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
        np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,1.0)
plt.show()



In [1042]:

    
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)



In [1043]:

    
round(roc_auc_score(y_test, preds_proba), 5)









    Out[1043]:





0.86672000000000005



In [1044]:

    
dtrain_total = xgb.DMatrix(x_train, label=y_train)

Cross Validation



In [1045]:

    
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=99,\
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),\
                       xgb.callback.early_stop(3)])









    



[0]	train-auc:0.84792	test-auc:0.824964
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 3 rounds.
[1]	train-auc:0.849289	test-auc:0.826005
[2]	train-auc:0.852226	test-auc:0.82769
[3]	train-auc:0.854858	test-auc:0.829408
[4]	train-auc:0.85832	test-auc:0.832602
[5]	train-auc:0.860584	test-auc:0.833464
[6]	train-auc:0.862362	test-auc:0.835505
[7]	train-auc:0.865221	test-auc:0.83639
[8]	train-auc:0.868371	test-auc:0.839194
[9]	train-auc:0.87067	test-auc:0.840195



In [972]:

    
plt.plot(np.arange(len(preds_proba)), preds_proba, 'bo', alpha=0.2)
plt.show()



In [973]:

    
y_pred_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)



In [974]:

    
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro',label='RF', alpha=0.4)
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()



In [975]:

    
feat_imp = [(k,v) for (k,v) in bst.get_fscore().items()]   # python3
feat_imp.sort()



In [976]:

    
feat_names, feat_imps = zip(*feat_imp)



In [977]:

    
feat_imp_vis = pd.Series(feat_imps, index=feat_names)
feat_imp_vis.sort_values(inplace=True)
fig = plt.figure()
feat_imp_vis.plot(kind='bar', title='Feature importance')









    Out[977]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f734eec32e8>

Bagging

More features produced by differentiating kinds of crimes or kinds of violations did not improve our power of predictions. We can use bagging to take advantage of extra samples not utilized for nonblighted buildings.

Extra data



In [1309]:

    
blighted_buildings = buildings.loc[buildings['blighted'] == 1, :].copy()
nonblighted_buildings = buildings.loc[buildings['blighted']==0, :].copy()



In [1310]:

    
n_blighted = blighted_buildings.shape[0]
n_nonblighted = nonblighted_buildings.shape[0]
print("number of blighted buildings: %d" % n_blighted)
print("number of non-blighted buildings: %d " % n_nonblighted)









    



number of blighted buildings: 2615
number of non-blighted buildings: 128112

Hold test data



In [1371]:

    
feature_names = ['norm_lat', 'norm_lon', '311-calls', 'crimes', 'blight_violations', ]
feature_types = ['float', 'float', 'int', 'int', 'int', 'int']
names_to_drop = ['building_id','blighted','addr','event_id_list','lat','lon','llcrnrlon','llcrnrlat','urcrnrlon','urcrnrlat', 'less_serious_crimes', 'more_serious_crimes', 'trivial_v', 'small_v', 'medium_v', 'heavy_v', 'permit_cnt']



In [1372]:

    
n_test_b = int(n_blighted*0.2)  # number for blighted buildings in test



In [1373]:

    
index_b_test = np.random.choice(blighted_buildings.index, n_test_b, replace=False) #blighted
index_nb_test = np.random.choice(nonblighted_buildings.index, n_test_b, replace=False) #nonblighted



In [1374]:

    
blighted_test = blighted_buildings.loc[index_b_test,:]
nonblighted_test = nonblighted_buildings.loc[index_nb_test,:]
balanced_test = pd.concat([blighted_test.copy(), nonblighted_test.copy()])
balanced_test = balanced_test.sample(frac=1, replace=False).reset_index(drop=True)



In [1375]:

    
test_x = balanced_test.drop(names_to_drop, axis=1, inplace=False)
test_y = balanced_test.loc[:,['blighted']].copy()

Train data



In [1376]:

    
# Train data are chosen from rest of the buildings

rest_blighted_buildings = blighted_buildings.loc[~blighted_buildings.index.isin(index_b_test),:].copy()
rest_nonblighted_buildings = nonblighted_buildings.loc[~nonblighted_buildings.index.isin(index_nb_test),:].copy()



In [1377]:

    
n_train_b = rest_blighted_buildings.shape[0]   # number of rows to choose from each kind



In [1394]:

    
indexes_b_train = []

# choose 5 set of train from the same samples with replacement.
for i in range(5):
    index_b_train = np.random.choice(rest_blighted_buildings.index, n_train_b, replace=False)
    index_nb_train = np.random.choice(rest_nonblighted_buildings.index, n_train_b, replace=False)
    indexes_b_train.append([index_b_train, index_nb_train])



In [1395]:

    
train_list = []
for index_pair in indexes_b_train:
    index_b, index_nb = tuple(index_pair)
    blighted_train = rest_blighted_buildings.loc[index_b,:]
    nonblighted_train = rest_nonblighted_buildings.loc[index_nb,:]
    balanced_train = pd.concat([blighted_train.copy(), nonblighted_train.copy()])
    balanced_train = balanced_train.sample(frac=1, replace=False).reset_index(drop=True)
    y_train = balanced_train['blighted'].copy()
    x_train = balanced_train.drop(names_to_drop, axis=1, inplace=False)
    train_list.append((x_train, y_train))



In [1474]:

    
param = {
        'booster': 'gbtree',
        'subsample': 1.0,
        'max_depth': 6,
        'min_child_weight': 3.0,
        'eta': 0.2,
        'gamma': 3.0,
        'objective':'binary:logistic',
        'eval_metric': 'auc',
        'lambda': 4, # L2 regularization,
        'alpha': 0  # L1 regularization
    }

num_round = 1000



In [1475]:

    
def train_model(x, x_eval, y, y_eval, param):
    #x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train)
    dtrain = xgb.DMatrix(x, label=y)
    deval = xgb.DMatrix(x_eval, label=y_eval)
    watchlist = [(deval, 'eval'), (dtrain, 'train')]
    evals_result={}
    bst = xgb.train(param, dtrain, num_boost_round=1000, evals=watchlist, evals_result=evals_result,
                   early_stopping_rounds=40, verbose_eval=False)
    return (bst, evals_result)

One example



In [1476]:

    
x_train = train_list[0][0]
y_train = train_list[0][1]
x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train)



In [1477]:

    
bst, evals_result = train_model(x, x_eval, y, y_eval, param)



In [1478]:

    
plt.plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
        np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.82,0.9)
plt.show()

Cross Validation



In [1479]:

    
res = xgb.cv(param, dtrain_total, num_boost_round=10, nfold=5, metrics={'auc'}, seed=9999,\
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),\
                       xgb.callback.early_stop(3)])









    



[0]	train-auc:0.842692	test-auc:0.826799
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 3 rounds.
[1]	train-auc:0.844584	test-auc:0.828081
[2]	train-auc:0.849348	test-auc:0.829962
[3]	train-auc:0.8533	test-auc:0.831619
[4]	train-auc:0.856323	test-auc:0.834337
[5]	train-auc:0.859551	test-auc:0.835003
[6]	train-auc:0.861824	test-auc:0.835337
[7]	train-auc:0.86735	test-auc:0.839599
[8]	train-auc:0.868933	test-auc:0.840904
[9]	train-auc:0.872053	test-auc:0.842506



In [1480]:

    
dtest = xgb.DMatrix(test_x, label=test_y['blighted'])



In [1481]:

    
preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)



In [1482]:

    
round(roc_auc_score(test_y, preds_proba), 5)









    Out[1482]:





0.85331000000000001

Average model



In [1488]:

    
bsts = []
fig, axes = plt.subplots(len(train_list), sharex=True, sharey=True, figsize=(6,12))
for train_data,i in zip(train_list,range(len(axes))):
    x_train, y_train = train_data
    x, x_eval, y, y_eval = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train)
    bst, evals_result = train_model(x, x_eval, y, y_eval, param)
    
    axes[i].plot(np.arange(len(evals_result['eval']['auc'])), evals_result['eval']['auc'], 'b-',\
            np.arange(len(evals_result['train']['auc'])), evals_result['train']['auc'], 'r--')
    bsts.append(bst)

plt.legend(['Eval','Train'], loc='best')
plt.ylim(0.8,0.9)
plt.show()



In [1484]:

    
#preds_proba = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
y_preds = np.mean(np.array([bst.predict(dtest, ntree_limit=bst.best_ntree_limit) for bst in bsts]), axis=0)



In [1485]:

    
round(roc_auc_score(test_y, y_preds), 5)









    Out[1485]:





0.86246999999999996

An AUC score of 0.8625 was achieved by reducing variance.



In [1489]:

    
fpr, tpr, thresholds = roc_curve(test_y, y_preds, pos_label=1)
plt.figure(figsize=(5,5))
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr, tpr, 'ro',label='RF', alpha=0.4)
plt.title("ROC Curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.savefig('../data/ROC_Curve_combined.png')
plt.show()



In [ ]:

	addr	blighted	event_id_list	lat	lon	llcrnrlon	llcrnrlat	urcrnrlon	urcrnrlat	building_id
0	13500 norborne	0	[138721]	42.3842	-83.2975	-83.297706	42.384098	-83.297294	42.384302	0
1	26200 mich ave inkster mi 48141	0	[116216]	42.2954	-83.2968	-83.297006	42.295298	-83.296594	42.295502	1

	event_id	lon	lat	addr	type
0	0	-83.161039	42.383998	13120-13130 ilene st	1
1	1	-83.080919	42.440471	1485 e outer dr	1

	311-calls	crimes	blight_violations	permit_cnt
count	130727.000000	130727.000000	130727.000000	130727.000000
mean	0.134142	0.831328	0.760715	0.021373
std	0.390493	1.094696	0.958943	0.155434
min	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000
50%	0.000000	1.000000	0.000000	0.000000
75%	0.000000	1.000000	1.000000	0.000000
max	12.000000	9.000000	12.000000	5.000000

	addr	blighted	event_id_list	lat	lon	llcrnrlon	llcrnrlat	urcrnrlon	urcrnrlat	building_id	311-calls	crimes	blight_violations	permit_cnt	norm_lon	norm_lat
0	13500 norborne	0	['138721']	42.3842	-83.2975	-83.297706	42.384098	-83.297294	42.384302	0	0.0	1.0	0.0	0.0	-1.905905	-0.172687
1	26200 mich ave inkster mi 48141	0	['116216']	42.2954	-83.2968	-83.297006	42.295298	-83.296594	42.295502	1	0.0	1.0	0.0	0.0	-1.898702	-2.553627

	addr	event_id_list	lat	lon	llcrnrlon	llcrnrlat	urcrnrlon	urcrnrlat	building_id
0	15360 tracey	[448971, 118987]	42.403010	-83.180173	-83.180379	42.402908	-83.179968	42.403112	38754
1	4817 chopin	[143959]	42.332660	-83.134467	-83.134672	42.332558	-83.134261	42.332762	60569
2	5300 wayburn st	[13405]	42.400448	-82.955224	-82.955430	42.400346	-82.955019	42.400550	121589

	ROWNUM	CASEID	INCINO	CATEGORY	OFFENSEDESCRIPTION	STATEOFFENSEFILECLASS	INCIDENTDATE	HOUR	SCA	PRECINCT	COUNCIL	NEIGHBORHOOD	CENSUSTRACT	LON	LAT	LOCATION	addr	event_id	type
0	53256	1953933	1506030028.1	ASSAULT	ASSAULT AND BATTERY/SIMPLE ASSAULT	13001.0	06/03/2015 12:00:00 AM	2	1007.0	10.0	City Council District 5	PETOSKEY-OTSEGO	5334.0	-83.1221	42.3678	PETOSKEY\n09100\n(42.3676, -83.1219)	09100 petoskey	19679	2
1	17631	1917717	1503010158.1	LARCENY	LARCENY - PARTS AND ACCESSORIES FROM VEHICLE	23006.0	03/01/2015 12:00:00 AM	9	608.0	6.0	City Council District 7	GRANDALE	5452.0	-83.2035	42.3724	00 PLYMOUTH AND MANSFIELD\n(42.3725, -83.2033)	00 plymouth and mansfield	19680	2