Feature Selection


In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import bokeh
from bokeh.io import output_notebook
# Configure Bokeh to emit its output inside the notebook; this call is what
# produces the "Loading BokehJS ..." message in the cell output.
output_notebook()

import os


Loading BokehJS ...

In [2]:
import sqlite3
from sqlalchemy import create_engine

# Connect to the local SQLite database file and pull every row of the
# `new_data` table into a DataFrame.
SQL_ENGINE = create_engine('sqlite:///streetlight_cases.db')
df = pd.read_sql_query(sql='SELECT * FROM new_data', con=SQL_ENGINE)

In [3]:
# Remove the 'opened_month' column before building the feature matrix.
# Rebinding `df` through drop(..., errors='ignore') instead of `del df[...]`
# keeps this cell idempotent: running it a second time on the already-trimmed
# frame is a no-op rather than a KeyError.
df = df.drop(labels=['opened_month'], axis=1, errors='ignore')

In [4]:
# Split the frame into the label vector (`target` column) and the feature
# matrix (every other column), both as plain NumPy arrays.
y = df['target'].values
X = df.drop(labels=['target'], axis=1).values

In [5]:
from sklearn.feature_selection import SelectFromModel, SelectKBest, VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

def __feature_importance(X, y):
    """Score features by their importance in a random forest fitted on (X, y).

    Passed to ``SelectKBest`` as its scoring function. A 200-tree forest with
    a fixed seed is trained on the given data and its per-feature importances
    are returned.

    Parameters
    ----------
    X : feature matrix accepted by ``RandomForestClassifier.fit``
    y : labels accepted by ``RandomForestClassifier.fit``

    Returns
    -------
    The fitted forest's ``feature_importances_`` array.
    """
    scorer = RandomForestClassifier(n_estimators=200, random_state=0)
    return scorer.fit(X, y).feature_importances_

# Modelling pipeline, refitted from scratch on every training fold:
#   1. normalizer          - min-max scale each feature.
#   2. selection_threshold - drop features whose variance is below
#                            .999 * (1 - .999) (presumably aimed at
#                            near-constant binary columns - confirm).
#   3. selection_kbest     - keep the 31 features ranked highest by
#                            __feature_importance.
#   4. classifier          - 100-tree random forest. random_state is fixed so
#                            that scores and cutoffs are reproducible between
#                            runs, matching the seeded forest used for
#                            feature scoring.
pipe = Pipeline([
        ('normalizer', MinMaxScaler()),
        ('selection_threshold', VarianceThreshold(threshold=(.999 * (1 - .999)))),
        ('selection_kbest', SelectKBest(__feature_importance, k=31)),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=0))])

In [12]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

def find_cutoff(y_true, y_pred):
    """Return the score threshold where TPR and TNR are closest to each other.

    The ROC curve of ``y_pred`` against ``y_true`` is computed with
    ``roc_curve`` and the threshold minimising ``|tpr - (1 - fpr)|`` is
    returned, i.e. the operating point where the true positive rate is as
    close as possible to the true negative rate.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded unchanged to ``roc_curve``.
    y_pred : array-like
        Predicted scores, forwarded unchanged to ``roc_curve``.

    Returns
    -------
    float
        The selected element of the threshold array returned by ``roc_curve``.
    """
    fpr, tpr, threshold = roc_curve(y_true, y_pred)
    # tpr - (1 - fpr) is zero exactly where TPR equals TNR; look for the ROC
    # point closest to that balance.
    balance_gap = np.abs(tpr - (1 - fpr))
    # Plain NumPy argmin replaces the former DataFrame + `.ix` lookup (`.ix`
    # no longer exists in current pandas). On ties the first candidate in
    # roc_curve's output order is returned.
    return threshold[np.argmin(balance_gap)]

In [13]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of the whole pipeline.
skf = StratifiedKFold(n_splits=5)

# y_pred collects the out-of-fold positive-class probability of every sample;
# scores / cutoffs collect one ROC-AUC and one balanced cutoff per fold.
y_pred = np.zeros(y.shape[0])
scores, cutoffs = [], []

for tra, tst in skf.split(X, y):
    X_tra, X_tst = X[tra, :], X[tst, :]
    y_tra, y_tst = y[tra], y[tst]

    # Scaling and feature selection are refitted on the training fold only.
    pipe.fit(X_tra, y_tra)
    y_proba = pipe.predict_proba(X_tst)
    fold_scores = y_proba[:, 1]  # probability of class 1

    y_pred[tst] = fold_scores
    scores.append(roc_auc_score(y_tst, fold_scores))
    cutoffs.append(find_cutoff(y_tst, fold_scores))

In [14]:
np.mean(scores), np.std(scores)


Out[14]:
(0.81412969184097828, 0.037819678036738871)

In [15]:
cutoffs


Out[15]:
[0.45000000000000001,
 0.48083333333333333,
 0.55000000000000004,
 0.46071428571428563,
 0.45000000000000001]

In [16]:
# One overall operating threshold, computed from the out-of-fold scores of
# every sample rather than from a single fold.
cutoff = find_cutoff(y_true=y, y_pred=y_pred)

In [17]:
from sklearn.metrics import classification_report

# Binarise the out-of-fold scores at the chosen cutoff and report per-class
# metrics. The call form of print with a single argument behaves the same on
# Python 2 and Python 3, whereas the print statement is a SyntaxError on 3.
print(classification_report(y, y_pred >= cutoff))


             precision    recall  f1-score   support

          0       0.77      0.74      0.75      4159
          1       0.70      0.74      0.72      3484

avg / total       0.74      0.74      0.74      7643


In [18]:
from sklearn.metrics import roc_curve, auc
# ROC curve and its area, computed on the out-of-fold scores in y_pred.
fpr, tpr, thresh = roc_curve(y, y_pred)
auc_roc = auc(fpr, tpr)

# Call form of print: identical output on Python 2, valid syntax on Python 3.
print('cuttoff {:.4f}'.format(cutoff))
plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % auc_roc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


cuttoff 0.4711

In [ ]: