\left{ \begin{array}{ll} \end{array}\right.
Recall from homework that Bayes rule is $$ g(x) = \left\{ \begin{array}{ll} 1, &\mathbb P \{Y = 1 | X = x \} > \mathbb P \{Y = 0 | X = x \} \\ 0, &{\rm otherwise}\end{array}\right. $$ Another way to write this event is (for $f_X(x) > 0$) $$ f_{Y,X}(1, x) = \mathbb P \{Y = 1 | X = x \} f_X(x) > \mathbb P \{Y = 0 | X = x \} f_X(x) = f_{Y,X} (0, x) $$ Let $\pi = \mathbb P \{ Y = 1\}$ then this is also $$ \pi f_{X|Y}(x | 1) > (1 - \pi) f_{X|Y} (x|0) $$ which is $$ \frac{f_{X|Y}(x | 1)}{f_{X|Y} (x|0)} > \tau = \frac{1-\pi}{\pi} $$
A generative method does the following
Reasonable heuristic for estimating a density $\hat f_X$, based on data $x_1,\ldots,x_n$ is
Let the Boxcar kernel function be $$ k(\|x_0-x_1\|) = \frac{1\{ \| x_0 - x_1 \| \le 1 \}}{{\rm Vol}(B(1))} $$ then the number of pts within $\epsilon$ is $$ N(x,\epsilon) = {\rm Vol}(B(1)) \sum_i k\left( \frac{\| x - x_i \|}{\epsilon} \right) $$ and the density estimate is $$ \hat f(x) = \frac 1n \sum_i \frac{{\rm Vol}(B(1))}{{\rm Vol}(B(\epsilon))} \cdot k\left( \frac{\| x - x_i \|}{\epsilon} \right) $$ this is equal to $$ \hat f(x) = \frac 1n \sum_i \frac{1}{\epsilon^p} \cdot k\left( \frac{\| x - x_i \|}{\epsilon} \right) $$
General kernel density estimate is based on a kernel such that $$ \int k(\|x-x_0\|) dx = 1 $$ a kernel that is a valid probability density is a common choice.
Then KDE is $$ \hat f(x) = \frac 1n \sum_i \frac{1}{\epsilon^p} \cdot k\left( \frac{\| x - x_i \|}{\epsilon} \right) $$ where $p$ is the dimensionality of the X space.
For each $y = 0,1$ let $x_1,\ldots,x_{n_y}$ be the predictor data with $Y = y$
Let $\pi$ be the proportion of $Y = 1$ then let $\tau = (1 - \pi) / \pi$.
Predict $\hat y = 1$ for a new $x$ if $$ \frac{f_{X|Y}(x | 1)}{f_{X|Y} (x|0)} > \tau $$ and $\hat y=0$ otherwise.
In [4]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import plotnine as p9
import matplotlib.pyplot as plt
import itertools
import warnings
warnings.simplefilter("ignore")
from sklearn import neighbors, preprocessing, impute, metrics, model_selection, linear_model, svm, feature_selection
In [3]:
from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 6,6
In [7]:
def train_bank_to_xy(bank):
"""standardize and impute training"""
bank_sel = bank[['age','balance','duration','y']].values
X,y = bank_sel[:,:-1], bank_sel[:,-1]
scaler = preprocessing.StandardScaler().fit(X)
imputer = impute.SimpleImputer(fill_value=0).fit(X)
trans_prep = lambda Z: imputer.transform(scaler.transform(Z))
X = trans_prep(X)
y = 2*(y == 'yes')-1
return (X, y), trans_prep
def test_bank_to_xy(bank, trans_prep):
"""standardize and impute test"""
bank_sel = bank[['age','balance','duration','y']].values
X,y = bank_sel[:,:-1], bank_sel[:,-1]
X = trans_prep(X)
y = 2*(y == 'yes')-1
return (X, y)
In [9]:
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
In [6]:
bank = pd.read_csv('../../data/bank.csv',sep=';',na_values=['unknown',999,'nonexistent'])
bank.info()
In [10]:
bank_tr, bank_te = model_selection.train_test_split(bank,test_size=.33)
In [11]:
p9.ggplot(bank_tr, p9.aes(x = 'age',fill = 'y')) + p9.geom_density(alpha=.2)
Out[11]:
In [12]:
(X_tr, y_tr), trans_prep = train_bank_to_xy(bank_tr)
X_te, y_te = test_bank_to_xy(bank_te, trans_prep)
In [13]:
def plot_conf_score(y_te,score,tau):
y_pred = 2*(score > tau) - 1
classes = [1,-1]
conf = metrics.confusion_matrix(y_te, y_pred,labels=classes)
plot_confusion_matrix(conf, classes)
Most classifiers are "soft" because they can output a score, higher means more likely to be $Y=1$
In [14]:
score_dur = X_te[:,2]
In [15]:
p9.ggplot(bank_tr[['duration','y']].dropna(axis=0)) + p9.aes(x = 'duration',fill = 'y')\
+ p9.geom_density(alpha=.5)
Out[15]:
In [16]:
plot_conf_score(y_te,score_dur,1.)
In [17]:
plot_conf_score(y_te,score_dur,2.)
In [22]:
## Fit and find NNs
nn = neighbors.NearestNeighbors(n_neighbors=10,metric="l2")
nn.fit(X_tr)
dists, NNs = nn.kneighbors(X_te)
NNs[1], y_tr[NNs[1]].mean(), y_te[1]
Out[22]:
In [23]:
score_nn = np.array([(y_tr[knns] == 1).mean() for knns in NNs])
plot_conf_score(y_te,score_nn,.2)
In [25]:
nn = neighbors.KNeighborsClassifier(n_neighbors=10)
nn.fit(X_tr, y_tr)
score_nn = nn.predict_proba(X_te)[:,1]
plot_conf_score(y_te,score_nn,.2)
In [26]:
def print_top_k(score_dur,y_te,k_top):
ordering = np.argsort(score_dur)[::-1]
print("k: score, y")
for k, (yv,s) in enumerate(zip(y_te[ordering],score_dur[ordering])):
print("{}: {}, {}".format(k,s,yv))
if k >= k_top - 1:
break
In [27]:
print_top_k(score_dur,y_te,10)
In [28]:
plt.style.use('ggplot')
In [29]:
fpr_dur, tpr_dur, threshs = metrics.roc_curve(y_te,score_dur)
plt.figure(figsize=(6,6))
plt.plot(fpr_dur,tpr_dur)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("ROC for 'duration'")
Out[29]:
In [30]:
def plot_temp():
plt.figure(figsize=(6,6))
plt.plot(fpr_dur,tpr_dur,label='duration')
plt.plot(fpr_nn,tpr_nn,label='knn')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.title("ROC")
In [31]:
fpr_nn, tpr_nn, threshs = metrics.roc_curve(y_te,score_nn)
plot_temp()
In [32]:
def plot_temp():
plt.figure(figsize=(6,6))
plt.plot(rec_dur,prec_dur,label='duration')
plt.plot(rec_nn,prec_nn,label='knn')
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend()
plt.title("PR curve")
In [33]:
prec_dur, rec_dur, threshs = metrics.precision_recall_curve(y_te,score_dur)
prec_nn, rec_nn, threshs = metrics.precision_recall_curve(y_te,score_nn)
plot_temp()