In [37]:
import numpy as np
import copy
import bayes_logistic as bl
import pandas as pd

In [38]:
# Download and process data

url="http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
df = pd.read_csv(url)
y = df.values[:, 17] 
y = y.astype(int)
X = np.delete(df.values, 17, 1)
X = np.delete(X, 0, 1)

n_samples, n_features = X.shape

# Add bias column to the feature matrix
B = np.ones((n_samples, n_features + 1)) 
B[:, 1:] = X 
X = B

In [39]:
# Perform feature scaling using mean normalization
for col in range(1, n_features):
    v = X[:, col]
    mean = v.mean()
    std = v.std()
    X[:, col] = (X[:, col] - mean)/std

In [40]:
# The data is divided into training and test sets
TRAINING_PERCENTAGE = 0.7 
TEST_PERCENTAGE  = 0.3 

training_cnt = int(n_samples*TRAINING_PERCENTAGE)
training_X = X[:training_cnt,:]
training_y = y[:training_cnt]

test_cnt = int(n_samples*TEST_PERCENTAGE)
test_X = X[training_cnt:,:]
test_y = y[training_cnt:]

In [41]:
# Train the model 
w_prior = np.zeros(n_features + 1)
H_prior = np.diag(np.ones(n_features + 1))*0.001

GD_BATCH_SIZE = training_cnt
ITERATION_CNT = 5
w = training_X.shape[1]
w_prior = np.zeros(w)
H_prior = np.diag(np.ones(w))*0.001

for i in range(0, ITERATION_CNT):
    for idx in range(0, training_cnt, GD_BATCH_SIZE):
        batch_size = GD_BATCH_SIZE if (idx + GD_BATCH_SIZE) < training_cnt else training_cnt - idx
        w_posterior, H_posterior = bl.fit_bayes_logistic(training_y[idx:batch_size],
                                                     training_X[idx:batch_size,:],
                                                     w_prior, H_prior, solver = 'BFGS')
        w_prior = copy.copy(w_posterior)
        H_prior = copy.copy(H_posterior)

w_fit = w_prior
H_fit = H_prior

In [42]:
# Perform Test
y_cnt = 0
test_probs = bl.bayes_logistic_prob(test_X, w_fit, H_fit)
prediction_cnt = 0
for idx in range(0, test_cnt):
    if test_probs[idx] > 0.5 and test_y[idx] == 1:
        prediction_cnt += 1
    y_cnt += 1

prediction_accuracy = (100.0*prediction_cnt)/y_cnt
print "Prediction Accuracy for test set %.02f" % prediction_accuracy


Prediction Accuracy for test set 60.34