In [1]:
import numpy as np
import pandas as pd
import active
import experiment
import logistic_regression as logr
from sklearn import datasets # The Iris dataset is imported from here.
# NOTE(review): `datasets` does not appear to be used in this notebook — the
# diabetes data is read from CSV below; presumably a leftover import. Confirm
# before removing.
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
# Auto-reload the local modules marked with %aimport before each cell runs
# (%autoreload 1 reloads only the %aimport-ed modules, not everything).
%load_ext autoreload
%autoreload 1
%aimport active
%aimport experiment
%aimport logistic_regression
# Limit printed float precision to keep array dumps readable.
np.set_printoptions(precision=4)
In [2]:
# Enlarge the default matplotlib font sizes so figures stay legible
# when embedded in the report.
plt.rcParams.update({
    'axes.labelsize': 15,
    'axes.titlesize': 15,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 15,
    'figure.titlesize': 18,
})
In this experiment we work with a data set with 2 classes and 8 features: the two classes indicate whether or not a patient has diabetes, and the 8 features correspond to 8 health measurements.
The 2 classes in this data set are not linearly separable, and it is known that the data set has missing values.
We work with all features of the data set and randomly divide the data set into two halves, to be used for training and testing.
In [3]:
# Load the diabetes data set; the CSV has no header row, so supply the
# column names explicitly.
names = ['diabetes', 'num preg', 'plasma', 'bp', 'skin fold', 'insulin', 'bmi', 'pedigree', 'age']
data = pd.read_csv('diabetes_scale.csv', header=None, names=names)
# Bias column of ones for the intercept term (scalar broadcasts to every row).
data['ones'] = 1.0
data.head()
Out[3]:
In [4]:
# Fix the RNG so the train/test split is reproducible across runs.
np.random.seed(1)
size = data.shape[0]
half = size // 2
# Random permutation of the row indices: first half trains, second half tests.
index = np.random.permutation(size)
training_index = index[:half]
testing_index = index[half:]
See Section 7.5 of the report.
In [5]:
# Target is the diabetes label; features are the eight measurements plus
# the bias column of ones.
feature_cols = ['num preg', 'plasma', 'bp', 'skin fold', 'insulin', 'bmi', 'pedigree', 'age', 'ones']
X = np.array(data[feature_cols])
Y = np.array(data['diabetes'])
# Remap the -1 class to 0 so the labels are in {0, 1} for logistic regression.
Y[Y == -1] = 0
# Split the rows according to the fixed random index partition from above.
X_diabetes_training, Y_diabetes_training = X[training_index], Y[training_index]
X_diabetes_testing, Y_diabetes_testing = X[testing_index], Y[testing_index]
In [6]:
# Sanity-check the split: dump the test feature matrix (full array; float
# precision is capped at 4 d.p. by the set_printoptions call above).
print(X_diabetes_testing)
In [7]:
# Sanity-check the split: dump the test labels (should be 0/1 after the remap).
print(Y_diabetes_testing)
In [8]:
# Sanity-check the split: dump the training feature matrix.
print(X_diabetes_training)
In [9]:
# Sanity-check the split: dump the training labels.
print(Y_diabetes_training)
In [66]:
# Experiment configuration: average the accuracy over n repetitions,
# training on 1..iterations selected patterns per repetition.
n = 10
iterations = 15
# Alias the diabetes split under the generic names the experiment code uses.
X_training, Y_training = X_diabetes_training, Y_diabetes_training
X_testing, Y_testing = X_diabetes_testing, Y_diabetes_testing
Here we compute the average accuracy over 10 tests, where for each test the accuracy is computed by training logistic regression on 1 to 15 randomly selected patterns from the same fixed training set, which comprises a uniform sample of half the data set.
We note here that the optimization process sometimes fails to converge.
In [53]:
# Average test accuracy of logistic regression trained on 1..iterations
# randomly selected training patterns, averaged over n repetitions.
average_accuracies_logr_15 = logr.experiment(
    n, iterations, X_testing, Y_testing, X_training, Y_training
)
In [54]:
# One averaged accuracy per number of training patterns (15 values).
print(average_accuracies_logr_15)
Here we compute the average accuracy over 10 tests, where for each test the accuracy is computed by training logistic regression on 1 to 30 randomly selected patterns from the same fixed training set, which comprises a uniform sample of half the data set.
We compute the accuracy from training logistic regression on 50% and 100% of the training data set also.
We note here that the optimization process sometimes fails to converge.
In [55]:
# Extend the experiment to 30 selected training patterns per repetition.
iterations = 30
In [56]:
# Same experiment as above but with up to 30 selected training patterns.
average_accuracies_logr_30 = logr.experiment(
    n, iterations, X_testing, Y_testing, X_training, Y_training
)
In [57]:
# One averaged accuracy per number of training patterns (30 values).
print(average_accuracies_logr_30)
Here we train logistic regression on 50% of the training data set, selected uniformly at random, and repeat this 10 times to compute the average accuracy.
In [58]:
n = 10
In [59]:
# Baseline: average accuracy of logistic regression trained on a uniformly
# random half of the training set, repeated n times.
size = X_training.shape[0]
size_half = size // 2
accuracies = []
for i in range(n):
    # Fresh uniform sample of half the training patterns on every repetition.
    index_all = np.arange(size)
    np.random.shuffle(index_all)
    index_half = index_all[:size_half]
    X_training_half = X_training[index_half]
    Y_training_half = Y_training[index_half]
    w_half = logr.train(X_training_half, Y_training_half)
    # Evaluate on the held-out test set so this baseline is comparable with
    # the other (test-set) accuracies plotted below.  The original predicted
    # on X_training_half itself, i.e. it reported training accuracy.
    predictions = logr.predict(w_half, X_testing)
    accuracy = logr.compute_accuracy(predictions, Y_testing)
    accuracies.append(accuracy)
average_accuracy_training_half = np.mean(accuracies)
In [60]:
# Report the half-training baseline.  The original printed `average_accuracy`,
# a name never defined in this notebook (NameError on a fresh Run All);
# the value computed above is `average_accuracy_training_half`.
print('The average accuracy training on 50% of the training data set is',
      average_accuracy_training_half)
Here we simply train logistic regression on half of the data and test it on the other half.
In [67]:
# Baseline: train once on the whole training half and evaluate on the
# held-out test half.
w_training = logr.train(X_training, Y_training)
accuracy_training_all = logr.compute_accuracy(
    logr.predict(w_training, X_testing), Y_testing)
In [68]:
# Report the full-training baseline.  The original printed `accuracy`, which
# at this point holds the stale last-iteration value from the half-training
# loop above, not the value computed in the previous cell.
print("The accuracy training on the whole training data set is", accuracy_training_all)
In [72]:
# Rebuild the feature/target arrays for the active-learning experiments.
# NOTE(review): unlike the logistic-regression cells above, the labels are
# left as -1/+1 here (no remap to {0, 1}) — presumably the active-learning
# code expects signed labels; confirm against the `experiment` module.
feature_cols = ['num preg', 'plasma', 'bp', 'skin fold', 'insulin', 'bmi', 'pedigree', 'age', 'ones']
X = np.array(data[feature_cols])
Y = np.array(data['diabetes'])
X_diabetes_training, Y_diabetes_training = X[training_index], Y[training_index]
X_diabetes_testing, Y_diabetes_testing = X[testing_index], Y[testing_index]
In [73]:
# Reset the experiment configuration for the active-learning runs
# (iterations was raised to 30 above; bring it back to 15 for the plot).
n = 10
iterations = 15
X_testing, Y_testing = X_diabetes_testing, Y_diabetes_testing
X_training, Y_training = X_diabetes_training, Y_diabetes_training
In [74]:
# Cutting-plane active learner, 'ac' centre variant (plotted as 'AC'),
# averaged over n repetitions of up to `iterations` queries.
average_accuracies_ac_15 = experiment.experiment(
    n, iterations, X_testing, Y_testing,
    X_training, Y_training,
    center='ac', sample=1, M=None)
In [75]:
# Cutting-plane active learner, 'cc' centre variant (plotted as 'CC').
average_accuracies_cc_15 = experiment.experiment(
    n, iterations, X_testing, Y_testing,
    X_training, Y_training,
    center='cc', sample=1, M=None)
In [76]:
# Random-query baseline for the same procedure (plotted as 'Random').
average_accuracies_rand_15 = experiment.experiment(
    n, iterations, X_testing, Y_testing,
    X_training, Y_training,
    center='random', sample=1, M=None)
In [82]:
# Plot the four learning curves against the two constant logistic-regression
# baselines.  The four marker-line plots shared all styling except the data,
# format string, and colour, so they are driven from a single spec list
# instead of four copy-pasted plt.plot calls.
plt.figure(figsize=(12,7))
queries = np.arange(1, iterations + 1)
curves = [
    (average_accuracies_logr_15, 'mx-', 'LR', 'm'),
    (average_accuracies_ac_15, 'r^-', 'AC', 'r'),
    (average_accuracies_cc_15, 'go-', 'CC', 'g'),
    (average_accuracies_rand_15, 'bs-', 'Random', 'b'),
]
for accuracies, fmt, label, colour in curves:
    plt.plot(queries, accuracies, fmt, label=label,
             markevery=5,
             lw=1.5, ms=10, markerfacecolor='none', markeredgewidth=1.5,
             markeredgecolor=colour)
# Constant baselines: LR trained on half / all of the training data,
# drawn as grey dashed / solid horizontal lines.
plt.plot(queries, [average_accuracy_training_half] * len(queries), 'k--',
         color='0.4', label='LR - half', lw=1.5, ms=10)
plt.plot(queries, [accuracy_training_all] * len(queries), 'k-',
         color='0.4', label='LR - all', lw=1.5, ms=10)
plt.xlabel('Number of iterations')
plt.ylabel('Accuracy averaged over %d tests' % n)
plt.title('Average accuracy of a cutting plane active learning procedure (diabetes data set)')
plt.legend(loc='best')
plt.savefig('diabetes_experiment_all_15.png', dpi=600, bbox_inches='tight', transparent=True)
plt.show()