In [37]:
import numpy as np
import copy
import bayes_logistic as bl
import pandas as pd
In [38]:
# Download the Parkinsons data set and build the design matrix.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
df = pd.read_csv(url)

# Column 17 holds the binary target; column 0 is a per-subject name string.
y = df.values[:, 17].astype(int)
X = np.delete(np.delete(df.values, 17, 1), 0, 1)
n_samples, n_features = X.shape

# Prepend a bias column of ones; cast features to float (the raw
# DataFrame .values array is object-typed because of the name column).
X = np.hstack((np.ones((n_samples, 1)), X.astype(float)))
In [39]:
# Feature scaling using mean normalization: (x - mean) / std per column.
# Column 0 is the bias term and must stay all-ones, so only columns
# 1 .. n_features (inclusive) are scaled.
# FIX: the original loop ran range(1, n_features), which skipped the last
# feature column — X has n_features + 1 columns after the bias was added.
feature_block = X[:, 1:]
X[:, 1:] = (feature_block - feature_block.mean(axis=0)) / feature_block.std(axis=0)
In [40]:
# Divide the data into training and test sets (sequential split, no shuffle).
TRAINING_PERCENTAGE = 0.7
TEST_PERCENTAGE = 0.3
training_cnt = int(n_samples * TRAINING_PERCENTAGE)
training_X = X[:training_cnt, :]
training_y = y[:training_cnt]
# FIX: use the actual remainder so every sample is accounted for.
# int(n_samples * TEST_PERCENTAGE) truncates and could be one smaller than
# the real test-set size, silently dropping the last sample from evaluation.
test_cnt = n_samples - training_cnt
test_X = X[training_cnt:, :]
test_y = y[training_cnt:]
In [41]:
# Train a Bayesian logistic regression model, feeding each batch's posterior
# back in as the prior for the next update (online Bayesian updating).
n_weights = training_X.shape[1]  # features + bias column
w_prior = np.zeros(n_weights)
# Small diagonal precision => broad, weakly informative Gaussian prior.
H_prior = np.diag(np.ones(n_weights)) * 0.001
GD_BATCH_SIZE = training_cnt  # one batch == full training set
ITERATION_CNT = 5
for i in range(0, ITERATION_CNT):
    for idx in range(0, training_cnt, GD_BATCH_SIZE):
        # Clip the final batch so it does not run past the training set.
        batch_size = GD_BATCH_SIZE if (idx + GD_BATCH_SIZE) < training_cnt else training_cnt - idx
        # FIX: the slice end must be idx + batch_size, not batch_size.
        # The original training_y[idx:batch_size] only worked by accident
        # because a single full-size batch (idx == 0) was used.
        w_posterior, H_posterior = bl.fit_bayes_logistic(
            training_y[idx:idx + batch_size],
            training_X[idx:idx + batch_size, :],
            w_prior, H_prior, solver='BFGS')
        # Posterior becomes the prior for the next batch / iteration.
        w_prior = copy.copy(w_posterior)
        H_prior = copy.copy(H_posterior)
w_fit = w_prior
H_fit = H_prior
In [42]:
# Evaluate the fitted model on the held-out test set.
test_probs = bl.bayes_logistic_prob(test_X, w_fit, H_fit)
# A sample is predicted positive when its posterior probability exceeds 0.5.
# FIX: accuracy must count BOTH correctly classified positives and
# correctly classified negatives. The original incremented the hit counter
# only when (prob > 0.5 and y == 1), i.e. it reported true-positives/total,
# understating accuracy whenever a y == 0 sample was classified correctly.
predictions = (test_probs[:test_cnt] > 0.5).astype(int)
correct_cnt = int(np.sum(predictions == test_y[:test_cnt]))
prediction_accuracy = (100.0 * correct_cnt) / test_cnt
# print(...) with a single parenthesized argument is valid Python 2 and 3.
print("Prediction Accuracy for test set %.02f" % prediction_accuracy)