Recall from homework that the Bayes rule is $$ g(x) = \left\{ \begin{array}{ll} 1, &\mathbb P \{Y = 1 | X = x \} > \mathbb P \{Y = 0 | X = x \} \\ 0, &{\rm otherwise.}\end{array}\right. $$ Multiplying both sides of this event by $f_X(x)$ (for $f_X(x) > 0$) gives the equivalent form $$ f_{Y,X}(1, x) = \mathbb P \{Y = 1 | X = x \} f_X(x) > \mathbb P \{Y = 0 | X = x \} f_X(x) = f_{Y,X} (0, x). $$ Letting $\pi = \mathbb P \{ Y = 1\}$, this is also $$ \pi f_{X|Y}(x | 1) > (1 - \pi) f_{X|Y} (x|0), $$ which is the likelihood ratio test $$ \frac{f_{X|Y}(x | 1)}{f_{X|Y} (x|0)} > \tau = \frac{1-\pi}{\pi}. $$ For example, if $\pi = 0.1$ then $\tau = 9$: we predict the rare class only when the likelihood ratio strongly favors it.
A generative method does the following: it estimates the class prior $\pi$ and the class-conditional densities $f_{X|Y}$ from the training data, then classifies a new point with the estimated likelihood ratio test. One density estimator we can plug in is the kernel density estimate.
A reasonable heuristic for estimating a density $\hat f_X$ from data $x_1,\ldots,x_n$ is to take the fraction of points within distance $\epsilon$ of $x$, normalized by the volume of that ball: $\hat f(x) = N(x,\epsilon) / (n \, {\rm Vol}(B(\epsilon)))$.
Let the boxcar kernel function be $$ k(\|x_0-x_1\|) = \frac{1\{ \| x_0 - x_1 \| \le 1 \}}{{\rm Vol}(B(1))}. $$ Then the number of points within $\epsilon$ of $x$ is $$ N(x,\epsilon) = {\rm Vol}(B(1)) \sum_i k\left( \frac{\| x - x_i \|}{\epsilon} \right), $$ and the density estimate is $$ \hat f(x) = \frac 1n \sum_i \frac{{\rm Vol}(B(1))}{{\rm Vol}(B(\epsilon))} \cdot k\left( \frac{\| x - x_i \|}{\epsilon} \right). $$ Since ${\rm Vol}(B(\epsilon)) = \epsilon^p \, {\rm Vol}(B(1))$ in $\mathbb R^p$, this equals $$ \hat f(x) = \frac 1n \sum_i \frac{1}{\epsilon^p} \cdot k\left( \frac{\| x - x_i \|}{\epsilon} \right). $$
A general kernel density estimate is based on any kernel that integrates to one: $$ \int k(\|x-x_0\|) \, dx = 1. $$
Then the KDE is $$ \hat f(x') = \frac 1n \sum_i \frac{1}{\epsilon^p} \cdot k\left( \frac{\| x' - x_i \|}{\epsilon} \right), $$ where $p$ is the dimensionality of the $X$ space and $\epsilon$ is a bandwidth parameter.
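As a concrete illustration (not part of the original notes), here is a minimal NumPy sketch of this formula with a Gaussian kernel; the helper name kde_gauss and the toy data are assumptions for the demo.

In [ ]:
import numpy as np

def kde_gauss(x_query, X, eps):
    """Illustrative KDE at x_query with the standard Gaussian kernel in R^p."""
    n, p = X.shape
    d2 = ((X - x_query) ** 2).sum(axis=1)          # squared distances to each data point
    k = np.exp(-0.5 * d2 / eps ** 2) / (2 * np.pi) ** (p / 2)
    return k.sum() / (n * eps ** p)                # (1 / (n eps^p)) sum_i k(||x'-x_i|| / eps)

X_toy = np.random.randn(500, 2)                    # 500 draws from a standard 2-d normal
print(kde_gauss(np.zeros(2), X_toy, eps=0.5))      # true density at 0 is 1/(2*pi) ~ 0.159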
For each $y = 0,1$, let $x_1,\ldots,x_{n_y}$ be the predictor data with $Y = y$, and let $\hat f_y$ be the KDE formed from these points.
Let $\pi$ be the proportion of $Y = 1$ in the training data and set $\tau = (1 - \pi) / \pi$.
Predict $\hat y = 1$ for a new $x'$ if $$ \frac{\hat f_{1}(x')}{\hat f_{0} (x')} > \tau $$ and $\hat y=0$ otherwise.
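A minimal sketch of these three steps, assuming sklearn's KernelDensity as the density estimator; the helper kde_classify and the default bandwidth are illustrative, not from the original notes.

In [ ]:
import numpy as np
from sklearn.neighbors import KernelDensity

def kde_classify(X_tr, y_tr, X_new, bandwidth=1.0):
    """Illustrative generative classifier: per-class Gaussian KDEs + likelihood ratio test."""
    f1 = KernelDensity(bandwidth=bandwidth).fit(X_tr[y_tr == 1])
    f0 = KernelDensity(bandwidth=bandwidth).fit(X_tr[y_tr == 0])
    pi = (y_tr == 1).mean()
    tau = (1 - pi) / pi
    # score_samples returns log densities, so compare the log ratio to log(tau)
    log_ratio = f1.score_samples(X_new) - f0.score_samples(X_new)
    return (log_ratio > np.log(tau)).astype(int)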
Let $x_0,x_1 \in \mathbb R^p$ and $$k(\|x_0 - x_1\|) = \frac{1}{(2\pi)^{p/2}} \exp \left(- \frac 12 \|x_0 - x_1\|_2^2 \right).$$ How do we know that this is a valid kernel for multivariate density estimation?
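Before answering analytically, one can at least check numerically that this kernel integrates to one; the following added cell (not in the original notebook) Monte Carlo integrates it over a large box for $p = 2$.

In [ ]:
import numpy as np

# Added numerical sanity check: integrate the kernel over a box holding
# essentially all of its mass, by averaging over uniform samples.
rng = np.random.default_rng(1)
p, n, side = 2, 10**6, 12.0                      # box [-6, 6]^2
u = rng.uniform(-side / 2, side / 2, size=(n, p))
k = np.exp(-0.5 * (u ** 2).sum(axis=1)) / (2 * np.pi) ** (p / 2)
print(k.mean() * side ** p)                      # ~ 1, consistent with a valid kernel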
Suppose that you used this kernel to obtain a multivariate density estimate, $\hat f: \mathbb R^p \rightarrow \mathbb R$, and you also used the subroutine in naive Bayes to estimate $\hat f_N(x') = \prod_j \hat f_j(x_j')$, a product of univariate KDEs. Will these return the same results? Think about the boxcar kernel with bandwidth 1: what are the main differences between these methods?
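To build intuition for this question, the following added sketch (variable names are illustrative) compares the joint KDE with the product of marginal KDEs on correlated data, where the two clearly disagree.

In [ ]:
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
# strongly correlated 2-d data: independence is a bad assumption here
X = rng.multivariate_normal([0, 0], [[1, 0.9], [0.9, 1]], size=1000)

joint = KernelDensity(bandwidth=0.3).fit(X)
marginals = [KernelDensity(bandwidth=0.3).fit(X[:, [j]]) for j in range(2)]

x_new = np.array([[1.0, -1.0]])       # plausible marginally, unlikely jointly
log_joint = joint.score_samples(x_new)[0]
log_naive = sum(m.score_samples(x_new[:, [j]])[0] for j, m in enumerate(marginals))
print(log_joint, log_naive)           # the joint KDE assigns much lower density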
In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import plotnine as p9
import matplotlib.pyplot as plt
import itertools
import warnings
warnings.simplefilter("ignore")
from sklearn import neighbors, preprocessing, impute, metrics, model_selection, linear_model, svm, feature_selection
In [2]:
from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 6,6
In [35]:
def train_bank_to_xy(bank):
    """Standardize and impute the training data."""
    bank_sel = bank[['age','balance','duration','y']].values
    X, y = bank_sel[:,:-1], bank_sel[:,-1]
    scaler = preprocessing.StandardScaler().fit(X)
    # fill_value is only honored with strategy='constant'; 0 is the post-standardization mean
    imputer = impute.SimpleImputer(strategy='constant', fill_value=0).fit(X)
    trans_prep = lambda Z: imputer.transform(scaler.transform(Z))
    X = trans_prep(X)
    y = (y == 'yes')*1
    return (X, y), trans_prep

def test_bank_to_xy(bank, trans_prep):
    """Standardize and impute the test data using the training transforms."""
    bank_sel = bank[['age','balance','duration','y']].values
    X, y = bank_sel[:,:-1], bank_sel[:,-1]
    X = trans_prep(X)
    y = (y == 'yes')*1
    return (X, y)
In [36]:
bank = pd.read_csv('../../data/bank.csv',sep=';',na_values=['unknown',999,'nonexistent'])
bank.info()
In [37]:
bank_tr, bank_te = model_selection.train_test_split(bank, test_size=.33)  # unseeded split: results vary run to run
In [38]:
p9.ggplot(bank_tr, p9.aes(x = 'age',fill = 'y')) + p9.geom_density(alpha=.2)
Out[38]:
In [39]:
(X_tr, y_tr), trans_prep = train_bank_to_xy(bank_tr)
X_te, y_te = test_bank_to_xy(bank_te, trans_prep)
In [62]:
def plot_conf_score(y_te, score, tau):
    """Confusion matrix for the hard predictions score > tau."""
    y_classes = (1, 0)
    cf_inds = ["Pred {}".format(c) for c in y_classes]
    cf_cols = ["True {}".format(c) for c in y_classes]
    y_pred = score > tau  # threshold the score argument to get hard predictions
    return pd.DataFrame(metrics.confusion_matrix(y_pred, y_te, labels=y_classes),
                        index=cf_inds, columns=cf_cols)
Most classifiers are "soft": they can output a score, where higher means more likely that $Y=1$.
In [41]:
score_dur = X_te[:,2]  # the standardized 'duration' column, used directly as a score
In [42]:
p9.ggplot(bank_tr[['duration','y']].dropna(axis=0)) + p9.aes(x = 'duration',fill = 'y')\
+ p9.geom_density(alpha=.5)
Out[42]:
In [43]:
y_te
Out[43]:
In [64]:
plot_conf_score(y_te,score_dur,1.)
Out[64]:
In [65]:
plot_conf_score(y_te,score_dur,2.)
Out[65]:
In [67]:
## Fit and find NNs
nn = neighbors.NearestNeighbors(n_neighbors=10,metric="l2")
nn.fit(X_tr)
dists, NNs = nn.kneighbors(X_te)
NNs[1], y_tr[NNs[1]].mean(), y_te[1]  # neighbors of test point 1, their mean label, and its true label
Out[67]:
In [68]:
score_nn = np.array([(y_tr[knns] == 1).mean() for knns in NNs])
plot_conf_score(y_te,score_nn,.2)
Out[68]:
In [69]:
nn = neighbors.KNeighborsClassifier(n_neighbors=10)
nn.fit(X_tr, y_tr)
score_nn = nn.predict_proba(X_te)[:,1]
plot_conf_score(y_te,score_nn,.2)
Out[69]:
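As an added sanity check (not in the original notebook): with the same $k$, the same Euclidean metric, and uniform weights, the hand-rolled neighbor average should match predict_proba up to tie-breaking among equidistant neighbors.

In [ ]:
score_manual = np.array([(y_tr[knns] == 1).mean() for knns in NNs])
print(np.allclose(score_manual, score_nn))  # expect True, barring distance ties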
In [70]:
def print_top_k(score_dur, y_te, k_top):
    """Print the k_top highest scores alongside their true labels."""
    ordering = np.argsort(score_dur)[::-1]  # indices sorted by score, descending
    print("k: score, y")
    for k, (yv, s) in enumerate(zip(y_te[ordering], score_dur[ordering])):
        print("{}: {}, {}".format(k, s, yv))
        if k >= k_top - 1:
            break
In [71]:
print_top_k(score_dur,y_te,10)
In [72]:
plt.style.use('ggplot')
In [73]:
fpr_dur, tpr_dur, threshs = metrics.roc_curve(y_te,score_dur)
plt.figure(figsize=(6,6))
plt.plot(fpr_dur,tpr_dur)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("ROC for 'duration'")
Out[73]:
In [76]:
def plot_temp():
    plt.figure(figsize=(6,6))
    plt.plot(fpr_dur, tpr_dur, label='duration')
    plt.plot(fpr_nn, tpr_nn, label='knn')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend()
    plt.title("ROC")
In [77]:
fpr_nn, tpr_nn, threshs = metrics.roc_curve(y_te,score_nn)
plot_temp()
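A single-number summary of these ROC curves is the area under the curve; this added cell computes it with metrics.roc_auc_score.

In [ ]:
metrics.roc_auc_score(y_te, score_dur), metrics.roc_auc_score(y_te, score_nn)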
In [78]:
def plot_temp():
    plt.figure(figsize=(6,6))
    plt.plot(rec_dur, prec_dur, label='duration')
    plt.plot(rec_nn, prec_nn, label='knn')
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.legend()
    plt.title("PR curve")
In [79]:
prec_dur, rec_dur, threshs = metrics.precision_recall_curve(y_te,score_dur)
prec_nn, rec_nn, threshs = metrics.precision_recall_curve(y_te,score_nn)
plot_temp()
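Likewise, the PR curves can be summarized by average precision; this added cell uses metrics.average_precision_score.

In [ ]:
metrics.average_precision_score(y_te, score_dur), metrics.average_precision_score(y_te, score_nn)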
In [91]:
from sklearn import discriminant_analysis
## Init previous predictors list
preds = [("Duration",score_dur), ("NN", score_nn)]
## Fit and predict with LDA
lda = discriminant_analysis.LinearDiscriminantAnalysis()
lda.fit(X_tr,y_tr)
score_pred = lda.predict_log_proba(X_te)[:,1]
preds += [("LDA",score_pred)]
## Fit and predict with QDA
qda = discriminant_analysis.QuadraticDiscriminantAnalysis()
qda.fit(X_tr,y_tr)
score_pred = qda.predict_log_proba(X_te)[:,1]
preds += [("QDA",score_pred)]
In [92]:
def plot_pr_models(X_te, y_te, preds):
    plt.figure(figsize=(6,6))
    for name, score_preds in preds:
        prec, rec, threshs = metrics.precision_recall_curve(y_te, score_preds)
        plt.plot(rec, prec, label=name)
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.legend()
    plt.title("PR curve")
In [93]:
plot_pr_models(X_te, y_te, preds)