In [5]:
import os
import numpy as np
import pymatbridge as pymat
In [6]:
# Create a pymatbridge MATLAB session handle and start it.
# NOTE(review): the %%matlab cells below presumably need a running MATLAB
# session and the pymatbridge magic to be registered -- confirm where the
# extension is loaded, since it is not visible in this notebook.
matlab = pymat.Matlab()
# Left as the last expression of the cell, so whatever start() returns is
# shown as the cell output.
matlab.start()
Out[6]:
In [36]:
%%matlab --size 1600,2000
% Kernel density estimation experiment.
%
% Part 1: for every bandwidth h in H, estimate the density of x_tr on a fine
%         grid over [0,1] with a Gaussian kernel, an Epanechnikov kernel and
%         a histogram of bin width h, and draw the three curves in one subplot.
% Part 2: shuffle x_te, cut it into N_split equally sized subsets, re-estimate
%         the density on each subset and plot, per estimator, the variance of
%         the estimates across subsets as a function of h.
%
% Requires hw1progde.mat (x_tr and x_te are read from it below) and the
% project function problem_5_1_a(x, data, h, kernel_name).
clear all;
load('hw1progde.mat');

H = [0.01,0.05,0.07,0.09,0.1,0.5];   % bandwidths / histogram bin widths
x_grid = 0:0.001:1;                  % evaluation grid for the plots

%%%%%%%%%%% Part 1: density estimates on the training data
for i = 1:length(H)
    h = H(i);

    % Histogram density: bin j covers [(j-1)*h, j*h). One extra bin is
    % allocated so that x = 1 always maps to a valid bin index.
    N_bins = ceil(1/h)+1;
    ht_y = zeros(1, N_bins);
    for j = 1:N_bins
        ht_y(j) = length(find( x_tr>=(j-1)*h & x_tr<(j*h) ))/(length(x_tr)*h);
    end

    g_y = zeros(1, length(x_grid));
    e_y = zeros(1, length(x_grid));
    h_y = zeros(1, length(x_grid));
    for n = 1:length(x_grid)
        x = x_grid(n);
        g_y(n) = problem_5_1_a(x,x_tr,h, 'gaussian');
        e_y(n) = problem_5_1_a(x,x_tr,h, 'epanechnikov');
        h_y(n) = ht_y(floor(x/h)+1);   % look up the bin that contains x
    end

    subplot(4,2,i);
    plot(x_grid,g_y, x_grid,e_y, x_grid,h_y);
    legend('Gaussian Kernel','Epanechnikov Kernel', 'Histogram');
    tl = sprintf('KDE for bandwidth h=%f',h);
    title(tl);
    % The title string doubles as the output file name.
    % NOTE(review): the name contains spaces and a '.', and the whole figure
    % (all subplots drawn so far) is written each time -- confirm intended.
    print(tl, '-dpng');
end

%%%%%%%%%%% Part 2: shuffle the test data and cut it into N_split subsets
% NOTE(review): randperm is not seeded, so the variance plot changes from run
% to run; seed the generator before this line if reproducibility matters.
test_data_size = size(x_te,1);   % assumes x_te is a column vector -- TODO confirm
ix = randperm(test_data_size);
x_te_shuffled = x_te(ix);
N_split = 19;
% floor() keeps the slice bounds integral; samples that do not fill a whole
% subset are dropped.
list_size = floor(test_data_size/N_split);
list_of_shuffled_te = zeros(N_split, list_size);   % one subset per row
for s = 1:N_split
    chunk = x_te_shuffled(list_size*(s-1)+1 : list_size*s);
    list_of_shuffled_te(s,:) = reshape(chunk, 1, []);
end

x_points = linspace(0, 1, 50);   % coarser grid for the variance estimate
n_points = length(x_points);
var_g = zeros(1, length(H));
var_e = zeros(1, length(H));
var_h = zeros(1, length(H));
for i = 1:length(H)
    h = H(i);
    N_bins = ceil(1/h)+1;

    % Row s holds the estimate obtained from subset s at every x_points entry.
    all_fh_g = zeros(N_split, n_points);
    all_fh_e = zeros(N_split, n_points);
    all_fh_h = zeros(N_split, n_points);
    for s = 1:N_split
        % Take the whole s-th row. A single subscript here would be a linear
        % index and return just one scalar sample.
        % NOTE(review): data is a row vector; confirm problem_5_1_a does not
        % depend on the orientation of its data argument.
        data = list_of_shuffled_te(s,:);

        ht_y = zeros(1, N_bins);
        for t = 1:N_bins
            ht_y(t) = length(find( data>=(t-1)*h & data<(t*h) ))/(length(data)*h);
        end

        for j = 1:n_points
            x = x_points(j);
            all_fh_g(s,j) = problem_5_1_a(x,data,h, 'gaussian');
            all_fh_e(s,j) = problem_5_1_a(x,data,h, 'epanechnikov');
            all_fh_h(s,j) = ht_y(floor(x/h)+1);
        end
    end

    % Mean estimate over the subsets at every evaluation point.
    f_h_g = mean(all_fh_g, 1);
    f_h_e = mean(all_fh_e, 1);
    f_h_h = mean(all_fh_h, 1);

    % Squared deviation from that mean, averaged over subsets and points.
    dev_g = bsxfun(@minus, all_fh_g, f_h_g);
    dev_e = bsxfun(@minus, all_fh_e, f_h_e);
    dev_h = bsxfun(@minus, all_fh_h, f_h_h);
    var_g(i) = sum(dev_g(:).^2)/(N_split*n_points);
    var_e(i) = sum(dev_e(:).^2)/(N_split*n_points);
    var_h(i) = sum(dev_h(:).^2)/(N_split*n_points);
end

% The variance curves span the full bottom row of the 4x2 subplot grid.
subplot(4,2,7:8)
plot(H,var_g,H,var_e,H,var_h);
disp('Generated plots, stored as pngs');
legend('Gaussian Kernel','Epanechnikov Kernel', 'Histogram');
title('Distribution of Variance with h')
xlabel('Bandwidth (h)')
ylabel('Variance')
In [41]:
%%matlab
% Problem 5.2(d), k-NN on the tic-tac-toe data: for each neighbourhood size k,
% print one tab-separated row with the training, validation and test accuracy
% returned by knn_classify.
[train_data, train_label] = preprocessing_ttt('hw1ttt_train.data');
[valid_data, valid_label] = preprocessing_ttt('hw1ttt_valid.data');
[test_data, test_label] = preprocessing_ttt('hw1ttt_test.data');

k_values = [1,3,5,7,9,11,13,15];
disp(sprintf('k\tTraining\tValidation\tTest'))
for k = k_values
    % knn_classify is called once per evaluation set; the training accuracy
    % reported in the row is the one returned by the second (test) call.
    [valid_accu, train_accu] = knn_classify(train_data, train_label, valid_data, valid_label, k);
    [test_accu, train_accu] = knn_classify(train_data, train_label, test_data, test_label, k);
    disp(sprintf('%f\t%f\t%f\t%f\n', k, train_accu, valid_accu, test_accu));
end
%disp('************ Problem 5.2(d) Knn End*******************')
In [43]:
%%matlab
% Problem 5.2(d), Naive Bayes: report training / validation / test accuracy
% on the tic-tac-toe data set and then on the nursery data set.
% For each data set the classifier is evaluated BEFORE its report is printed,
% so every heading is followed by the numbers that belong to it.

% ---- Tic-tac-toe ----
[train_data, train_label] = preprocessing_ttt('hw1ttt_train.data');
[valid_data, valid_label] = preprocessing_ttt('hw1ttt_valid.data');
[test_data, test_label] = preprocessing_ttt('hw1ttt_test.data');
[test_accu, train_accu] = naive_bayes(train_data, train_label, test_data, test_label);
[valid_accu, train_accu] = naive_bayes(train_data, train_label, valid_data, valid_label);
disp('-------Tic Tac Toe Data:--------');
textstr=sprintf('Training Accuracy: %f\nValidation Accuracy: %f\nTesting Accuracy:%f\n', train_accu, valid_accu, test_accu);
disp(textstr)

% ---- Nursery ----
[train_data, train_label] = preprocessing_nursery('hw1nursery_train.data');
[valid_data, valid_label] = preprocessing_nursery('hw1nursery_valid.data');
[test_data, test_label] = preprocessing_nursery('hw1nursery_test.data');
% Evaluate on the nursery data first; printing before these calls would
% repeat the tic-tac-toe accuracies under the nursery heading.
[test_accu, train_accu] = naive_bayes(train_data, train_label, test_data, test_label);
[valid_accu, train_accu] = naive_bayes(train_data, train_label, valid_data, valid_label);
disp('-------Nursery Data:--------');
textstr=sprintf('Training Accuracy: %f\nValidation Accuracy: %f\nTesting Accuracy:%f\n', train_accu, valid_accu, test_accu);
disp(textstr)
%disp('************ Problem 5.2(d) Naive Bayes End*******************')
In [46]:
%%matlab
% Decision trees on the tic-tac-toe data: for each minimum leaf size 1..10,
% print one tab-separated row with the accuracies returned by decision_tree
% for the two split criteria (the outputs are named *_gdi and *_dev) on the
% training, validation and test sets.
[train_data, train_label] = preprocessing_ttt('hw1ttt_train.data');
[valid_data, valid_label] = preprocessing_ttt('hw1ttt_valid.data');
[test_data, test_label] = preprocessing_ttt('hw1ttt_test.data');

% Seven columns; the last one is the test accuracy for the 'dev' criterion.
disp(sprintf('MinLeaf\tTrain(gdi)\tTrain(dev)\tValid(gdi)\tValid(dev)\tTest(gdi)\tTest(dev)'));
for ml=1:10
    [accu_train_gdi, accu_train_dev, accu_valid_gdi, accu_valid_dev] = decision_tree(train_data, train_label, valid_data, valid_label, ml);
    % The training accuracies printed below come from this second call.
    [accu_train_gdi, accu_train_dev, accu_test_gdi, accu_test_dev] = decision_tree(train_data, train_label, test_data, test_label, ml);
    % One %f per value: with fewer specifiers than arguments sprintf reuses
    % the format and the last value runs into the previous one.
    row_str = sprintf('%f\t%f\t%f\t%f\t%f\t%f\t%f', ml, accu_train_gdi, accu_train_dev, accu_valid_gdi, accu_valid_dev, accu_test_gdi, accu_test_dev);
    disp(row_str);
end
In [49]:
%%matlab --size 1600,1600
% k-NN decision boundaries: label every point of a regular 100x100 grid over
% [0,1]x[0,1] with knn_decision_boundary for several values of k and draw
% the points of each predicted class with a different marker, one subplot
% per k.
% Requires hw1boundary.mat (features and labels are read from it below).
load('hw1boundary.mat');

new_x = linspace(0,1,100);
new_y = linspace(0,1,100);
% Build all (x, y) pairs in one step instead of growing the matrix row by
% row. Column-major unrolling keeps x fixed while y runs through new_y, i.e.
% the same row order as a nested loop with x outermost.
[grid_x, grid_y] = meshgrid(new_x, new_y);
new_data = [grid_x(:), grid_y(:)];

KKK=[1,5,15,25];
for i=1:length(KKK)
    k=KKK(i);
    [train_accu, test_data_labels, train_data_labels] = knn_decision_boundary(features, labels, new_data, k);
    % Split the grid points by predicted label (+1 / -1).
    pluses = new_data(test_data_labels == 1,:);
    minuses = new_data(test_data_labels == -1,:);
    subplot(2,2,i)
    plot(pluses(:,1),pluses(:,2),'r+', minuses(:,1),minuses(:,2),'b^');
    xlabel('X');
    ylabel('Y');
    title(sprintf('k=%f',k));
end
disp('Generated plots, nothing to output');
In [ ]: