In [1]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from scipy.stats import f
from scipy.stats import norm
from scipy.stats import rankdata
from scipy.stats import wilcoxon
Load and plot the results from the MOA CSV files. For each data set, plot every classification algorithm, and build a matrix of the averaged accuracies, which is later used to test for statistically significant improvements in classifier accuracy. All of the plots are saved to the plots/ directory as PDF files.
In [8]:
# Plot the accuracy curve of every ensemble on every data set (one figure per
# data set, saved as plots/results-<data>-<base>-error.pdf) and collect the
# per-data-set mean accuracy (`accs`) and mean model cost (`mdlcst`).
pth = 'outputs2/'
base = 'hoeff'
algs = ['bagging-'+base, 'pame1-bag-'+base, 'pame3-bag-'+base]
# An older benchmark list (abalone_v2 ... splice_v2) used to be assigned here
# and was immediately overwritten; only the list actually used is kept.
datas = ['sea', 'acute-inflammation', 'acute-nephritis', 'adult_train', 'bank', 'blood',
         'breast-cancer', 'breast-cancer-wisc-diag', 'breast-cancer-wisc',
         'breast-cancer-wisc-prog', 'chess-krvkp', 'congressional-voting',
         'conn-bench-sonar-mines-rocks', 'connect-4', 'credit-approval',
         'cylinder-bands', 'echocardiogram', 'fertility', 'haberman-survival',
         'heart-hungarian', 'hepatitis', 'hill-valley_train', 'horse-colic_train',
         'ilpd-indian-liver', 'ionosphere', 'magic', 'mammographic', 'miniboone',
         'molec-biol-promoter', 'monks-1_train', 'monks-2_train', 'monks-3_train',
         'mushroom', 'musk-1', 'musk-2', 'oocytes_merluccius_nucleus_4d',
         'oocytes_trisopterus_nucleus_2f', 'ozone', 'parkinsons', 'pima',
         'pittsburg-bridges-T-OR-D', 'planning', 'ringnorm', 'spambase',
         'spect_train', 'spectf_train', 'statlog-australian-credit',
         'statlog-german-credit', 'statlog-heart', 'tic-tac-toe', 'titanic',
         'twonorm', 'vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
# Column names read from the result CSV files.
F = 'classifications correct (percent)'
TZ = 'model cost (RAM-Hours)'
T = 'learning evaluation instances'
# One marker per algorithm curve (indexed by the algorithm's position in `algs`).
mrk = ['o', 's', '^', '8', 'p', 'D', '+', '*', 'x']

# Configure the font once, before any figure is created: rc settings only
# apply to artists created afterwards, so setting it after each plot left the
# first figure of a fresh kernel with the default font size.
font = {'size' : 13}
plt.rc('font', **font)

accs = np.zeros((ndatas, nalg))
mdlcst = np.zeros((ndatas, nalg))
for j in range(ndatas):
    plt.figure()
    # Plot every nsep-th row only; 'splice_v2' is thinned more aggressively.
    nsep = 50 if datas[j] == 'splice_v2' else 5
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        plt.plot(df[T].iloc[::nsep], df[F].iloc[::nsep], marker=mrk[i])
        # The means use every row, not just the thinned ones that are plotted.
        accs[j, i] = df[F].mean()
        mdlcst[j, i] = df[TZ].mean()
    # NOTE(review): the third label is "PAME-II" while the third entry of
    # `algs` is 'pame3-bag-...' -- confirm the intended name.
    plt.legend(["Bag", "PAME-I", "PAME-II"], ncol=2, loc='best')
    plt.savefig('plots/results-'+datas[j]+'-'+base+'-error.pdf', format='pdf')
In [2]:
# Plot the model cost (RAM-Hours) curve of every ensemble on every data set,
# one figure per data set, saved as plots/results-<data>-<base>-model-cost.pdf.
pth = 'outputs2/'
base = 'hoeff'
algs = ['bagging-'+base, 'pame1-bag-'+base, 'pame3-bag-'+base]
# An older benchmark list (abalone_v2 ... splice_v2) used to be assigned here
# and was immediately overwritten; only the list actually used is kept.
datas = ['acute-inflammation', 'acute-nephritis', 'adult_train', 'bank', 'blood',
         'breast-cancer', 'breast-cancer-wisc-diag', 'breast-cancer-wisc',
         'breast-cancer-wisc-prog', 'chess-krvkp', 'congressional-voting',
         'conn-bench-sonar-mines-rocks', 'connect-4', 'credit-approval',
         'cylinder-bands', 'echocardiogram', 'fertility', 'haberman-survival',
         'heart-hungarian', 'hepatitis', 'hill-valley_train', 'horse-colic_train',
         'ilpd-indian-liver', 'ionosphere', 'magic', 'mammographic', 'miniboone',
         'molec-biol-promoter', 'monks-1_train', 'monks-2_train', 'monks-3_train',
         'mushroom', 'musk-1', 'musk-2', 'oocytes_merluccius_nucleus_4d',
         'oocytes_trisopterus_nucleus_2f', 'ozone', 'parkinsons', 'pima',
         'pittsburg-bridges-T-OR-D', 'planning', 'ringnorm', 'spambase',
         'spect_train', 'spectf_train', 'statlog-australian-credit',
         'statlog-german-credit', 'statlog-heart', 'tic-tac-toe', 'titanic',
         'twonorm', 'vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
# Column names read from the result CSV files.
TZ = 'model cost (RAM-Hours)'
T = 'learning evaluation instances'
# One marker per algorithm curve (indexed by the algorithm's position in `algs`).
mrk = ['o', 's', '^', '8', 'p', 'D']

# Configure the font once, before any figure is created: rc settings only
# apply to artists created afterwards, so setting it after each plot left the
# first figure of a fresh kernel with the default font size.
font = {'size' : 13}
plt.rc('font', **font)

for j in range(ndatas):
    plt.figure()
    # Plot every nsep-th row only; 'splice_v2' is thinned more aggressively.
    nsep = 50 if datas[j] == 'splice_v2' else 5
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        plt.plot(df[T].iloc[::nsep], df[TZ].iloc[::nsep], marker=mrk[i])
    # NOTE(review): the third label is "PAME-II" while the third entry of
    # `algs` is 'pame3-bag-...' -- confirm the intended name.
    plt.legend(["Bag", "PAME-I", "PAME-II"], ncol=2, loc='best')
    plt.savefig('plots/results-'+datas[j]+'-'+base+'-model-cost.pdf', format='pdf')
In [5]:
# Plot the 'negative weights' column of the PAME-1 bagging results for every
# data set, one figure per data set.
pth = 'outputs2/'
base = 'hoeff'
algs = ['pame1-bag-'+base]
# An older benchmark list (abalone_v2 ... splice_v2) used to be assigned here
# and was immediately overwritten; only the list actually used is kept.
datas = ['acute-inflammation', 'acute-nephritis', 'adult_train', 'bank', 'blood',
         'breast-cancer', 'breast-cancer-wisc-diag', 'breast-cancer-wisc',
         'breast-cancer-wisc-prog', 'chess-krvkp', 'congressional-voting',
         'conn-bench-sonar-mines-rocks', 'connect-4', 'credit-approval',
         'cylinder-bands', 'echocardiogram', 'fertility', 'haberman-survival',
         'heart-hungarian', 'hepatitis', 'hill-valley_train', 'horse-colic_train',
         'ilpd-indian-liver', 'ionosphere', 'magic', 'mammographic', 'miniboone',
         'molec-biol-promoter', 'monks-1_train', 'monks-2_train', 'monks-3_train',
         'mushroom', 'musk-1', 'musk-2', 'oocytes_merluccius_nucleus_4d',
         'oocytes_trisopterus_nucleus_2f', 'ozone', 'parkinsons', 'pima',
         'pittsburg-bridges-T-OR-D', 'planning', 'ringnorm', 'spambase',
         'spect_train', 'spectf_train', 'statlog-australian-credit',
         'statlog-german-credit', 'statlog-heart', 'tic-tac-toe', 'titanic',
         'twonorm', 'vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
# Column names read from the result CSV files.
F = 'negative weights'
# TZ is read below; it was previously only defined by other cells, so this
# cell failed on a fresh kernel unless one of them had been run first.
TZ = 'model cost (RAM-Hours)'
T = 'learning evaluation instances'
# One marker per algorithm curve (indexed by the algorithm's position in `algs`).
mrk = ['o', 's', '^', '8', 'p', 'D']

# Configure the font once, before any figure is created: rc settings only
# apply to artists created afterwards.
font = {'size' : 13}
plt.rc('font', **font)

# NOTE(review): in this cell `accs` holds the mean of the 'negative weights'
# column (not accuracy) and has a single column. Cells that read `accs` as an
# accuracy matrix must re-run their own loading code after this cell.
accs = np.zeros((ndatas, nalg))
mdlcst = np.zeros((ndatas, nalg))
for j in range(ndatas):
    plt.figure()
    # Plot every nsep-th row only; 'splice_v2' is thinned more aggressively.
    nsep = 50 if datas[j] == 'splice_v2' else 5
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        plt.plot(df[T].iloc[::nsep], df[F].iloc[::nsep], marker=mrk[i])
        # The means use every row, not just the thinned ones that are plotted.
        accs[j, i] = df[F].mean()
        mdlcst[j, i] = df[TZ].mean()
    plt.legend(["PAME-1"], loc='best')
    # NOTE(review): unlike the '-error.pdf' / '-model-cost.pdf' outputs there
    # is no '-' before 'nweights'; the name is kept as-is so existing
    # references to the file keep working.
    plt.savefig('plots/results-'+datas[j]+'-'+base+'nweights.pdf', format='pdf')
In [3]:
# Friedman test (with the F-distributed correction) on the mean accuracies of
# the ensembles, followed by a LaTeX table of accuracies and per-data-set ranks.
pth = 'outputs2/'
base = 'hoeff'
algs = ['bagging-'+base, 'pame1-bag-'+base, 'pame3-bag-'+base]
# Earlier, immediately-overwritten assignments of `algs` and `datas` were
# removed; this is the list the analysis actually ran on.
datas= ['acute-inflammation','acute-nephritis','adult_train','bank','blood',
        'breast-cancer','breast-cancer-wisc-diag','chess-krvkp','congressional-voting',
        'conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands',
        'echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis',
        'hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic',
        'miniboone','molec-biol-promoter','mushroom','musk-2','oocytes_merluccius_nucleus_4d',
        'oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D',
        'planning','ringnorm','spambase','sea','spect_train','spectf_train','statlog-australian-credit',
        'statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
F = 'classifications correct (percent)'

# accs[j, i]: mean of the accuracy column for data set j and algorithm i.
accs = np.zeros((ndatas, nalg))
for j in range(ndatas):
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        accs[j, i] = df[F].mean()

# Rank the algorithms within each data set: rank 1 = highest mean accuracy.
# rankdata assigns tied values their average rank, as the Friedman test
# requires (the previous argsort-based ranking broke ties arbitrarily).
ranks = np.array([rankdata(-row) for row in accs])
R = ranks.mean(axis=0)

# Friedman chi-squared statistic. Float literals force true division; with
# Python 2 integers both quotients were truncated unless they divided evenly.
XF = 12.0*ndatas/(nalg*(nalg+1)) * ((R**2).sum() - nalg*(nalg+1)**2/4.0)
# F-distributed variant of the statistic, with (nalg-1) and
# (nalg-1)*(ndatas-1) degrees of freedom.
FF = (ndatas-1)*XF/(ndatas*(nalg-1)-XF)
pF = f.sf(FF, nalg-1, (nalg-1)*(ndatas-1))  # survival function == 1 - cdf

if pF < 0.05:
    print("Friedman: Reject the Null Hypothesis")
else:
    print("Friedman: Fail to Reject the Null Hypothesis")
print(" ")

# Z[i, j]: difference of the mean ranks of algorithms i and j divided by the
# standard error sqrt(nalg*(nalg+1)/(6*ndatas)).
Z = (R[:, None] - R[None, :]) / np.sqrt(1.*(nalg*(nalg+1))/(6*ndatas))

# LaTeX table: header row, one row per data set, then the mean ranks.
print(' & ' + ''.join(alg + ' & ' for alg in algs))
for j in range(ndatas):
    cells = [str(np.round(1000*accs[j, i])/1000) + ' (' + str(ranks[j, i]) + ')'
             for i in range(nalg)]
    print(datas[j] + ' & ' + ' & '.join(cells) + ' \\\\')
print(' & ' + ' & '.join(str(np.round(100*R[i])/100) for i in range(nalg)) + ' \\\\ ')
In [9]:
# Friedman test (with the F-distributed correction) on the mean model cost of
# the ensembles, followed by a LaTeX table of costs and per-data-set ranks.
pth = 'outputs2/'
base = 'hoeff'
algs = ['bagging-'+base,'pame1-bag-'+base, 'pame3-bag-'+base]
# An older benchmark list (abalone_v2 ... splice_v2) used to be assigned here
# and was immediately overwritten; only the list actually used is kept.
datas = ['acute-inflammation', 'acute-nephritis', 'adult_train', 'bank', 'blood',
         'breast-cancer', 'breast-cancer-wisc-diag', 'breast-cancer-wisc',
         'breast-cancer-wisc-prog', 'chess-krvkp', 'congressional-voting',
         'conn-bench-sonar-mines-rocks', 'connect-4', 'credit-approval',
         'cylinder-bands', 'echocardiogram', 'fertility', 'haberman-survival',
         'heart-hungarian', 'hepatitis', 'hill-valley_train', 'horse-colic_train',
         'ilpd-indian-liver', 'ionosphere', 'magic', 'mammographic', 'miniboone',
         'molec-biol-promoter', 'monks-1_train', 'monks-2_train', 'monks-3_train',
         'mushroom', 'musk-1', 'musk-2', 'oocytes_merluccius_nucleus_4d',
         'oocytes_trisopterus_nucleus_2f', 'ozone', 'parkinsons', 'pima',
         'pittsburg-bridges-T-OR-D', 'planning', 'ringnorm', 'spambase',
         'spect_train', 'spectf_train', 'statlog-australian-credit',
         'statlog-german-credit', 'statlog-heart', 'tic-tac-toe', 'titanic',
         'twonorm', 'vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
F = 'model cost (RAM-Hours)'

# mdlcst[j, i]: mean of the model-cost column for data set j and algorithm i.
mdlcst = np.zeros((ndatas, nalg))
for j in range(ndatas):
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        mdlcst[j, i] = df[F].mean()

# Rank the algorithms within each data set: rank 1 = lowest mean model cost.
# The ranks are derived from `mdlcst`; the previous code sorted `accs`, a
# leftover from another cell that holds a different quantity and may cover a
# different set of data sets. Ties receive their average rank.
ranks = np.array([rankdata(row) for row in mdlcst])
R = ranks.mean(axis=0)

# Friedman chi-squared statistic. Float literals force true division; with
# Python 2 integers both quotients were truncated unless they divided evenly.
XF = 12.0*ndatas/(nalg*(nalg+1)) * ((R**2).sum() - nalg*(nalg+1)**2/4.0)
# F-distributed variant of the statistic, with (nalg-1) and
# (nalg-1)*(ndatas-1) degrees of freedom.
FF = (ndatas-1)*XF/(ndatas*(nalg-1)-XF)
pF = f.sf(FF, nalg-1, (nalg-1)*(ndatas-1))  # survival function == 1 - cdf

if pF < 0.05:
    print("Friedman: Reject the Null Hypothesis")
else:
    print("Friedman: Fail to Reject the Null Hypothesis")
print(" ")

# Z[i, j]: difference of the mean ranks of algorithms i and j divided by the
# standard error sqrt(nalg*(nalg+1)/(6*ndatas)).
Z = (R[:, None] - R[None, :]) / np.sqrt(1.*(nalg*(nalg+1))/(6*ndatas))

# LaTeX table: one row per data set, then the mean ranks. Every column is
# multiplied by 1e8 for readability; previously the last column was printed
# unscaled, making it incomparable with the others in the same row.
for j in range(ndatas):
    cells = [str(100000000*mdlcst[j, i]) + ' (' + str(ranks[j, i]) + ')'
             for i in range(nalg)]
    print(datas[j] + ' & ' + ' & '.join(cells) + ' \\\\')
print(' & ' + ' & '.join(str(np.round(100*R[i])/100) for i in range(nalg)) + ' \\\\ ')
In [5]:
# Re-emit the LaTeX accuracy table from the `datas`, `accs`, `ranks` and `R`
# values currently in the kernel: one "value (rank)" row per data set, then a
# final row with the mean ranks.
for row in range(ndatas):
    entries = [str(np.round(1000*accs[row, col])/1000) + ' (' + str(ranks[row, col]) + ')'
               for col in range(nalg)]
    print(datas[row] + ' & ' + ' & '.join(entries) + ' \\\\')
mean_rank_entries = [str(np.round(100*R[col])/100) for col in range(nalg)]
print(' & ' + ' & '.join(mean_rank_entries) + ' \\\\ ')
In [ ]:
# Draw a 20-bin histogram of the 'KL-div' column for two result files and save
# each one as a PDF. The first histogram goes on the current figure; a new
# figure is opened before the second.
# NOTE(review): the y axis is labelled 'Relative Frequency' but plt.hist is
# called without any normalisation argument -- confirm which one is intended.
kl_files = [('outputs/kl-elec.csv', 'plots/kl-elec.pdf'),
            ('outputs/kl-abalone.csv', 'plots/kl-abalone.pdf')]
for position, (csv_path, pdf_path) in enumerate(kl_files):
    if position > 0:
        plt.figure()
    df = pd.read_csv(csv_path)
    out = plt.hist(df['KL-div'], 20)
    plt.xlabel('KL-divergence')
    plt.ylabel('Relative Frequency')
    font = {'size' : 13}
    plt.rc('font', **font)
    plt.savefig(pdf_path, format='pdf')
In [7]:
# Wilcoxon signed-rank test between columns 0 and 2 of the `accs` matrix
# currently in the kernel; only the p-value is reported.
_, p = wilcoxon(accs[:, 0], accs[:, 2])
print("p:  " + str(p))
In [7]:
# Pairwise Wilcoxon signed-rank tests of the two boosting-based PAME variants
# against bagging and boosting, followed by a LaTeX accuracy table in which
# the best algorithm of each data set is starred.
pth = 'outputs2/'
base = 'hoeff'
# Earlier, immediately-overwritten assignments of `algs` and `datas` were
# removed; these are the lists the analysis actually ran on.
algs = ['bagging-'+base, 'boosting-'+base, 'pame1-boo-'+base, 'pame3-boo-'+base]
datas = ['acute-inflammation', 'acute-nephritis', 'adult_train', 'bank', 'blood',
         'breast-cancer', 'breast-cancer-wisc-diag', 'breast-cancer-wisc',
         'breast-cancer-wisc-prog', 'chess-krvkp', 'congressional-voting',
         'conn-bench-sonar-mines-rocks', 'connect-4', 'credit-approval',
         'cylinder-bands', 'echocardiogram', 'fertility', 'haberman-survival',
         'heart-hungarian', 'hepatitis', 'hill-valley_train', 'horse-colic_train',
         'ilpd-indian-liver', 'ionosphere', 'magic', 'mammographic', 'miniboone',
         'molec-biol-promoter', 'monks-1_train', 'monks-2_train', 'monks-3_train',
         'mushroom', 'musk-1', 'musk-2', 'oocytes_merluccius_nucleus_4d',
         'oocytes_trisopterus_nucleus_2f', 'ozone', 'parkinsons', 'pima',
         'pittsburg-bridges-T-OR-D', 'planning', 'ringnorm', 'spambase',
         'spect_train', 'spectf_train', 'statlog-australian-credit',
         'statlog-german-credit', 'statlog-heart', 'tic-tac-toe', 'titanic',
         'twonorm', 'vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
F = 'classifications correct (percent)'
alpha = .05

# accs[j, i]: mean of the accuracy column for data set j and algorithm i.
# (A zero-filled `mdlcst` that was never written to is no longer created, so
# model costs computed by another cell are not wiped.)
accs = np.zeros((ndatas, nalg))
for j in range(ndatas):
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        accs[j, i] = df[F].mean()
# Column index of the best algorithm per data set; the first one wins on ties.
idxs = list(np.argmax(accs, axis=1))

# Column indices follow `algs`: 0 bagging, 1 boosting, 2 pame1-boo, 3 pame3-boo.
_, p_ba_p1 = wilcoxon(accs[:, 0], accs[:, 2])
_, p_ba_p3 = wilcoxon(accs[:, 0], accs[:, 3])
_, p_bo_p1 = wilcoxon(accs[:, 1], accs[:, 2])
_, p_bo_p3 = wilcoxon(accs[:, 1], accs[:, 3])

# Report a PAME variant as better only when the test is significant at
# `alpha` AND its summed accuracy exceeds the reference's.
comparisons = [('PAME1', 2, 'Bagging', 0, p_ba_p1),
               ('PAME3', 3, 'Bagging', 0, p_ba_p3),
               ('PAME1', 2, 'Boosting', 1, p_bo_p1),
               ('PAME3', 3, 'Boosting', 1, p_bo_p3)]
for cand_name, cand_col, ref_name, ref_col, p_val in comparisons:
    if p_val <= alpha and accs[:, cand_col].sum() > accs[:, ref_col].sum():
        print(cand_name + " is better than " + ref_name + " (" + str(p_val) + ")")

# Raw numbers behind the verdicts: summed accuracies, then the p-values.
print(' ')
for cand_col, ref_col in [(2, 0), (2, 1), (3, 0), (3, 1)]:
    print(str(accs[:, cand_col].sum()) + '   ' + str(accs[:, ref_col].sum()))
print(' '.join(str(p_val) for p_val in (p_ba_p1, p_ba_p3, p_bo_p1, p_bo_p3)))
print(' ')
print(' ')

# LaTeX table: accuracies truncated (not rounded) to two decimals, with the
# best algorithm of each row marked by $^*$.
for j in range(ndatas):
    cells = [str(1.*int(100*accs[j, i])/100) + ('$^*$' if idxs[j] == i else '')
             for i in range(nalg)]
    label = datas[j].replace('_train', '').replace('_', '-')
    print(label + ' & ' + ' & '.join(cells) + ' \\\\')
In [8]:
# Show the four Wilcoxon p-values computed by the comparison cell.
print(' '.join(str(p_value) for p_value in (p_ba_p1, p_ba_p3, p_bo_p1, p_bo_p3)))
In [13]:
# Number of data sets in the `datas` list currently in the kernel.
print(len(datas))
In [5]:
# Dump the `accs` matrix currently in the kernel.
print(accs)
In [6]:
# Display the dimensions of the `accs` matrix currently in the kernel; the
# cells above allocate it as (ndatas, nalg).
accs.shape
Out[6]:
In [ ]: