In [2]:
import numpy as np
import pandas as pd
In [372]:
# data loading
data_train = pd.read_csv("spambase.train", header=None)
data_test = pd.read_csv("spambase.test", header=None)
def spamdata(data):
    """returns a tuple (target values, feature values)"""
    D = np.asarray(data, dtype=float)
    return np.asarray(D[:, -1], dtype=int), np.asarray(D[:, :-1])
t_train, X_train_raw = spamdata(data_train)
t_test, X_test_raw = spamdata(data_test)
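As a quick sanity check before pre-processing (a minimal sketch; the row counts depend on the local train/test split, though the standard spambase data has 57 feature columns and a 0/1 label):
print(X_train_raw.shape, X_test_raw.shape)  # expect 57 feature columns for standard spambase
print(np.bincount(t_train))  # class balance: counts of label 0 (ham) and label 1 (spam)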
In [439]:
# pre-processing
def preprocess_median(X, median=None):
    """
    Binarize each feature by thresholding at its median.
    If median is None, compute the column-wise medians from X and return
    the tuple (binarized X, medians); otherwise apply the given medians
    and return only the binarized X.
    """
    if median is None:
        median = np.median(X, axis=0)
        return np.asarray(X > median, dtype=int), median
    else:
        return np.asarray(X > median, dtype=int)
# _, medians = preprocess_median(np.concatenate((X_train_raw, X_test_raw)))
# X_train = preprocess_median(X_train_raw, medians)
# X_test = preprocess_median(X_test_raw, medians)
X_train, medians = preprocess_median(X_train_raw)
X_test = preprocess_median(X_test_raw, medians)
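Everything downstream assumes binary features, and the test set must be thresholded with the training-set medians (as done above), so a minimal sanity check is cheap insurance:
assert set(np.unique(X_train)) <= {0, 1}
assert set(np.unique(X_test)) <= {0, 1}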
In [440]:
# dimension constants
M = 2  # our pre-processing fixes every feature to be either 0 or 1
C = 2  # number of classes (ham / spam)
D = X_train.shape[1]  # number of features
print(C, D, M)
In [441]:
def train_NB(X, t, C, D, M, alpha=None, beta=None):
    """
    Train a discrete naive Bayes classifier.
    alpha and beta are Dirichlet prior parameters for the *mean* estimates
    of pi and theta; all-zero parameters recover the MLE.
    """
    N = t.size
    if alpha is None:
        alpha = np.zeros(C)
    if beta is None:
        beta = np.zeros(shape=(C, D, M))
    pi = np.zeros(shape=(C))
    theta = np.zeros(shape=(C, D, M))
    alpha_sum = np.sum(alpha)
    for c in range(C):
        indices = t == c
        Nc = np.count_nonzero(indices)
        pi[c] = (Nc + alpha[c]) / (N + alpha_sum)
        for d in range(D):
            counts = np.bincount(X[indices, d], minlength=M)
            theta[c, d, :] = (counts + beta[c, d, :]) / (Nc + np.sum(beta[c, d, :]))
    return pi, theta
# compute the MLE (equivalent to mean estimate with Dirichlet prior parameters all equal to 0)
pi0, theta0 = train_NB(X_train, t_train, C, D, M)
"""it's enough to print m=0 params only, because theta[:,:,1] = 1 - theta[:,:,0]"""
# print (pi0, theta0[:,:,0])
Out[441]:
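A tiny worked example (hypothetical data, not spambase) shows why the prior matters: the MLE can assign probability zero to a feature value never seen in a class, which later makes the log in pred_NB blow up, while add-one smoothing cannot:
X_toy = np.array([[0], [0], [1], [1], [1]])
t_toy = np.array([0, 0, 0, 1, 1])
pi_mle, theta_mle = train_NB(X_toy, t_toy, 2, 1, 2)  # MLE (all-zero prior)
pi_sm, theta_sm = train_NB(X_toy, t_toy, 2, 1, 2, alpha=np.ones(2), beta=np.ones((2, 1, 2)))
print(theta_mle[1, 0, :])  # [0. 1.] -- value 0 never observed in class 1
print(theta_sm[1, 0, :])   # [0.25 0.75] -- smoothing removes the zero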
In [442]:
def pred_NB(X, pi, theta, returnRatio=False):
    """
    Predict by comparing class probabilities in log space
    (avoids underflow from products of many small probabilities).
    """
    D = theta.shape[1]
    # theta[:, np.arange(D), X] has shape (C, N, D); summing over features
    # and transposing gives an (N, C) matrix of log joint probabilities.
    probs = np.sum(np.log(theta[:, np.arange(D), X]), axis=2).T + np.log(pi)
    if returnRatio:
        return np.argmax(probs, axis=1), probs
    else:
        return np.argmax(probs, axis=1)

def test_NB(X, t, pi, theta):
    """
    returns the misclassification error rate
    """
    t_pred = pred_NB(X, pi, theta)
    return np.sum(t != t_pred) / t.size
print(test_NB(X_test,t_test, pi0, theta0))
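If calibrated posteriors are ever needed, the joint log-probabilities that pred_NB returns can be normalized with a numerically stable log-sum-exp; a minimal sketch (posterior_NB is a name introduced here, not part of the code above):
def posterior_NB(X, pi, theta):
    _, log_joint = pred_NB(X, pi, theta, returnRatio=True)  # shape (N, C)
    log_norm = np.logaddexp.reduce(log_joint, axis=1, keepdims=True)
    return np.exp(log_joint - log_norm)  # each row sums to 1
print(posterior_NB(X_test[:3], pi0, theta0))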
In [504]:
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score
def calc_MI(x, y, bins):
    """Mutual information between a binned feature x and the label y."""
    c_xy = np.histogram2d(x, y, (bins, 2))[0]
    return mutual_info_score(None, None, contingency=c_xy)
mutual_info = np.asarray([mutual_info_score(t_train, X_train[:, i]) for i in range(D)], dtype=np.float64)
mut = np.exp(1.0 / mutual_info ** 0.2)  # unnormalized: 0.143
# mut = 10.0 / ((mutual_info) ** 0.25)  # unnormalized: 0.143
# print(mut)
beta = np.tile(np.tile(mut, (M, 1)).T, (C, 1, 1))  # broadcast per-feature prior weights to shape (C, D, M)
pi0, theta0 = train_NB(X_train, t_train, C, D, M)
pi1, theta1 = train_NB(X_train, t_train, C, D, M, alpha=np.ones(C), beta=np.ones((C, D, M)))
pi2, theta2 = train_NB(X_train, t_train, C, D, M, alpha=np.ones(C), beta=beta)
print(test_NB(X_train, t_train, pi0, theta0), test_NB(X_test, t_test, pi0, theta0))
print(test_NB(X_train, t_train, pi2, theta2), test_NB(X_test, t_test, pi2, theta2))
print(pi0, theta0[0, :, 0] / theta0[1, :, 0])
# print(pi2, theta2[0,:,0] / theta2[1,:,0])
print(mut)
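The same mutual-information scores could also drive feature selection instead of a prior; a sketch (k = 20 is a hypothetical choice, better tuned on a validation split):
k = 20
top = np.argsort(mutual_info)[-k:]  # indices of the k most informative features
pi_k, theta_k = train_NB(X_train[:, top], t_train, C, k, M)
print(test_NB(X_test[:, top], t_test, pi_k, theta_k))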
In [505]:
import matplotlib.pyplot as plt
plt.scatter(np.abs(np.log(theta0[0,:,0] / theta0[1,:,0])), mutual_info)
plt.scatter(np.abs(np.log(theta0[0,:,1] / theta0[1,:,1])), mutual_info, color='red')
plt.show()