In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
from bokeh.io import output_notebook
output_notebook()
import os
In [2]:
import sqlite3
from sqlalchemy import create_engine
# Connect to the local SQLite file and pull every row and column of the
# `new_data` table into a DataFrame.
SQL_ENGINE = create_engine('sqlite:///streetlight_cases.db')
df = pd.read_sql_query(sql='SELECT * FROM new_data', con=SQL_ENGINE)
In [3]:
# Remove the `opened_month` column from `df` in place, so it is not part of
# the feature matrix built from `df` afterwards.
df.drop(labels=['opened_month'], axis=1, inplace=True)
In [4]:
# Feature matrix: every column of `df` except `target`, as a NumPy array.
features = df.drop('target', axis=1)
X = features.values
# Label vector: the `target` column, as a NumPy array.
y = df['target'].values
In [5]:
from sklearn.feature_selection import SelectFromModel, SelectKBest, VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
def __feature_importance(X, y):
    """Score each feature by its importance in a random forest.

    Fits a 200-tree ``RandomForestClassifier`` (``random_state=0``) on
    ``X`` and ``y`` and returns the fitted model's
    ``feature_importances_`` array. The pipeline in this notebook passes
    this function to ``SelectKBest`` as its score function.
    """
    scorer = RandomForestClassifier(random_state=0, n_estimators=200)
    return scorer.fit(X, y).feature_importances_
# Modelling pipeline, applied in order:
#   1. scale every feature with MinMaxScaler,
#   2. drop features whose variance is not above .999 * (1 - .999)
#      (the variance of a 0/1 variable with p = 0.999),
#   3. keep the 31 features ranked highest by __feature_importance,
#   4. classify with a 100-tree random forest.
# NOTE(review): k=31 presumably assumes at least 31 features survive step 2 -
# confirm against the data.
pipe = Pipeline([
    ('normalizer', MinMaxScaler()),
    ('selection_threshold', VarianceThreshold(threshold=(.999 * (1 - .999)))),
    ('selection_kbest', SelectKBest(__feature_importance, k=31)),
    # Seeded like the feature-scoring forest so that scores and cutoffs are
    # reproducible from one run of the notebook to the next.
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=0))])
In [12]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
def find_cutoff(y_true, y_pred):
    """Return the ROC threshold where sensitivity and specificity are closest.

    Parameters
    ----------
    y_true : array-like
        True labels, passed unchanged to ``roc_curve``.
    y_pred : array-like
        Scores for the positive class, passed unchanged to ``roc_curve``.

    Returns
    -------
    The threshold reported by ``roc_curve`` at which
    ``|tpr - (1 - fpr)|`` is smallest.
    """
    fpr, tpr, threshold = roc_curve(y_true, y_pred)
    i = np.arange(len(tpr))
    roc = pd.DataFrame({'tf': pd.Series(tpr - (1 - fpr), index=i),
                        'threshold': pd.Series(threshold, index=i)})
    # Position of the row whose |tpr - (1 - fpr)| is smallest.
    closest = roc.tf.abs().argsort().values[:1]
    # `.ix` is deprecated and was removed in pandas 1.0. The index is
    # 0..n-1, so positional `.iloc` selects the same row.
    roc_t = roc.iloc[closest]
    return list(roc_t['threshold'])[0]
In [13]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified cross-validation of `pipe`.
# For every fold: refit on the training rows, score the held-out rows, and
# record the fold's cutoff and ROC AUC. `y_pred` ends up holding the
# out-of-fold positive-class probability for each row.
skf = StratifiedKFold(n_splits=5)
scores, cutoffs = [], []
y_pred = np.zeros(len(y))
for train_idx, test_idx in skf.split(X, y):
    pipe.fit(X[train_idx, :], y[train_idx])
    y_tst = y[test_idx]
    # Column 1 of predict_proba is the probability of the second class.
    fold_proba = pipe.predict_proba(X[test_idx, :])[:, 1]
    y_pred[test_idx] = fold_proba
    cutoffs.append(find_cutoff(y_tst, fold_proba))
    scores.append(roc_auc_score(y_tst, fold_proba))
In [14]:
# Mean and standard deviation of the per-fold ROC AUC scores.
np.mean(scores), np.std(scores)
Out[14]:
In [15]:
# Cutoff found by find_cutoff on each fold's held-out predictions.
cutoffs
Out[15]:
In [16]:
# Single cutoff computed over all out-of-fold predictions at once.
# NOTE(review): this cutoff is chosen on the same (y, y_pred) pair that the
# classification report is then computed on, so that report may be
# optimistic - confirm whether a held-out set should be used.
cutoff = find_cutoff(y, y_pred)
In [17]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1, treating scores >= cutoff as positive.
# print(...) with a single argument gives the same output on Python 2 and
# is valid on Python 3, where the print statement is a SyntaxError.
print(classification_report(y, y_pred >= cutoff))
In [18]:
from sklearn.metrics import roc_curve, auc
# ROC curve of the pooled out-of-fold predictions, with its AUC in the legend.
fpr, tpr, thresh = roc_curve(y, y_pred)
auc_roc = auc(fpr, tpr)
# Function-call form of print: same output on Python 2, valid on Python 3.
print('cuttoff {:.4f}'.format(cutoff))
plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % auc_roc)
plt.legend(loc='lower right')
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
# Axis limits extend past [0, 1] so the curve does not sit on the frame.
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [ ]: