Imports and About

Import the needed packages for plotting and setup Matplotlib for inline plotting. This Jupyter notebook is used to summarized the data from the PAME experiments script (pame-experiments.sh) and statistical hypothesis tests for improvements.


In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from scipy.stats import f
from scipy.stats import norm
from scipy.stats import wilcoxon


/Users/gditzler/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Plotters

Load and plot the data from the MOA csv files. Provide a plot for each data set with each of the classification algorithms and build a matrix with the averaged accuracies, which will be used for testing for statistical improvements to the accuracy of a classifier. All of the plots are saved to the plots/ directory as a pdf.


In [8]:
pth = 'outputs2/'
base = 'hoeff'
algs = [ 'bagging-'+base, 'pame1-bag-'+base, 'pame3-bag-'+base]
#algs = [base, 'bagging-'+base, 'boosting-'+base, 
#        'pame1-bag-'+base, 'pame2-bag-'+base, 'pame3-bag-'+base, 
#        'pame1-boo-'+base, 'pame2-boo-'+base, 'pame3-boo-'+base]
# 'spam' , 'poker'
datas = ['abalone_v2', 'airlines', 'breast-w' ,'car_v2' ,'cmc_v2' ,'colic' ,'cov_v2' ,'credit-a' ,
         'diabetes' ,'dow' ,'elecNormNew' ,'german' ,'haberman' ,'hepatitis' ,'hyper' ,'image' ,
         'magic' ,'noaa' ,'sea' ,'sick' ,'splice_v2']
datas=  ['sea','acute-inflammation','acute-nephritis','adult_train','bank','blood','breast-cancer','breast-cancer-wisc-diag','breast-cancer-wisc','breast-cancer-wisc-prog','chess-krvkp','congressional-voting','conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands','echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis','hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic','miniboone','molec-biol-promoter','monks-1_train','monks-2_train','monks-3_train','mushroom','musk-1','musk-2','oocytes_merluccius_nucleus_4d','oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D','planning','ringnorm','spambase','spect_train','spectf_train','statlog-australian-credit','statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
F = 'classifications correct (percent)'
TZ = 'model cost (RAM-Hours)'
T = 'learning evaluation instances'
mrk = ['o', 's', '^', '8', 'p', 'D', '+', '*', 'x']
ln = []
lims = [[35,65], [55,85], [70,95], [60,95], [45,65], [45,80], [60,90], [30,90], [40,80], 
        [42,58], [65,96], [35,75], [55,85], [10,90], [76,96], [62,78], [65,90], [64,82],
        [94,106], [70,95], [92,98], [75,100]]


accs = np.zeros((ndatas, nalg))
mdlcst = np.zeros((ndatas, nalg))

for j in range(ndatas):
    #z = lims[j]
    plt.figure()
    #print "Running datas " + datas[j]
    for i in range(nalg):
        #print "   -- " + algs[i]
        #results-car_v2-pame1-hoeff.csv
        if datas[j] == 'splice_v2':
            nsep = 50
        else:
            nsep = 5
        #print pth+'results-'+datas[j]+'-'+algs[i]+'.csv'
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        t = range(0, len(df[T]), nsep)
        plt.plot(df[T][t], df[F][t], marker=mrk[i])
        accs[j, i] = df[F].mean()
        mdlcst[j, i] = df[TZ].mean()
    
    
    #plt.ylim(z[0],z[1])
    plt.legend(["Bag", "PAME-I", "PAME-II"], ncol=2, loc='best')
    font = {'size'   : 13}
    plt.rc('font', **font)
    plt.savefig('plots/results-'+datas[j]+'-'+base+'-error.pdf', format='pdf')



In [2]:
pth = 'outputs2/'
base = 'hoeff'
algs = [ 'bagging-'+base, 'pame1-bag-'+base, 'pame3-bag-'+base]
#algs = [base, 'bagging-'+base, 'boosting-'+base, 
#        'pame1-bag-'+base, 'pame2-bag-'+base, 'pame3-bag-'+base, 
#        'pame1-boo-'+base, 'pame2-boo-'+base, 'pame3-boo-'+base]
# 'spam' , 'poker'
datas = ['abalone_v2', 'airlines', 'breast-w' ,'car_v2' ,'cmc_v2' ,'colic' ,'cov_v2' ,'credit-a' ,
         'diabetes' ,'dow' ,'elecNormNew' ,'german' ,'haberman' ,'hepatitis' ,'hyper' ,'image' ,
         'magic' ,'noaa' ,'sea' ,'sick' ,'splice_v2']
datas=  ['acute-inflammation','acute-nephritis','adult_train','bank','blood','breast-cancer','breast-cancer-wisc-diag','breast-cancer-wisc','breast-cancer-wisc-prog','chess-krvkp','congressional-voting','conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands','echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis','hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic','miniboone','molec-biol-promoter','monks-1_train','monks-2_train','monks-3_train','mushroom','musk-1','musk-2','oocytes_merluccius_nucleus_4d','oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D','planning','ringnorm','spambase','spect_train','spectf_train','statlog-australian-credit','statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
TZ = 'model cost (RAM-Hours)'
T = 'learning evaluation instances'
mrk = ['o', 's', '^', '8', 'p', 'D']
ln = []
lims = [[35,65], [55,85], [70,95], [60,95], [45,65], [45,80], [60,90], [30,90], [40,80], 
        [42,58], [65,96], [35,75], [55,85], [10,90], [76,96], [62,78], [65,90], [64,82],
        [94,106], [70,95], [92,98], [75,100]]


for j in range(ndatas):
    #z = lims[j]
    plt.figure()
    #print "Running datas " + datas[j]
    for i in range(nalg):
        #print "   -- " + algs[i]
        #results-car_v2-pame1-hoeff.csv
        if datas[j] == 'splice_v2':
            nsep = 50
        else:
            nsep = 5
                
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        t = range(0, len(df[T]), nsep)
        plt.plot(df[T][t], df[TZ][t], marker=mrk[i])    
    
    #plt.ylim(z[0],z[1])
    plt.legend(["Bag", "PAME-I", "PAME-II"], ncol=2, loc='best')
    font = {'size'   : 13}
    plt.rc('font', **font)
    plt.savefig('plots/results-'+datas[j]+'-'+base+'-model-cost.pdf', format='pdf')


/Users/gditzler/anaconda/lib/python2.7/site-packages/matplotlib/pyplot.py:516: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

In [5]:
pth = 'outputs2/'
base = 'hoeff'
algs = ['pame1-bag-'+base]
# 'spam' , 'poker'
datas = ['abalone_v2', 'airlines', 'breast-w' ,'car_v2' ,'cmc_v2' ,'colic' ,'cov_v2' ,'credit-a' ,
         'diabetes' ,'dow' ,'elecNormNew' ,'german' ,'haberman' ,'hepatitis' ,'hyper' ,'image' ,
         'magic' ,'noaa' ,'sea' ,'sick' ,'splice_v2']
datas=  ['acute-inflammation','acute-nephritis','adult_train','bank','blood','breast-cancer','breast-cancer-wisc-diag','breast-cancer-wisc','breast-cancer-wisc-prog','chess-krvkp','congressional-voting','conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands','echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis','hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic','miniboone','molec-biol-promoter','monks-1_train','monks-2_train','monks-3_train','mushroom','musk-1','musk-2','oocytes_merluccius_nucleus_4d','oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D','planning','ringnorm','spambase','spect_train','spectf_train','statlog-australian-credit','statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']

nalg = len(algs)
ndatas = len(datas)
F = 'negative weights'
T = 'learning evaluation instances'
mrk = ['o', 's', '^', '8', 'p', 'D']
ln = []
lims = [[35,65], [55,85], [70,95], [60,95], [45,65], [45,80], [60,90], [30,90], [40,80], 
        [42,58], [65,96], [35,75], [55,85], [10,90], [76,96], [62,78], [65,90], [64,82],
        [94,106], [70,95], [92,98], [75,100]]


accs = np.zeros((ndatas, nalg))
mdlcst = np.zeros((ndatas, nalg))

for j in range(ndatas):
    #z = lims[j]
    plt.figure()
    #print "Running datas " + datas[j]
    for i in range(nalg):
        #print "   -- " + algs[i]
        #results-car_v2-pame1-hoeff.csv
        if datas[j] == 'splice_v2':
            nsep = 50
        else:
            nsep = 5
                
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        t = range(0, len(df[T]), nsep)
        plt.plot(df[T][t], df[F][t], marker=mrk[i])
        accs[j, i] = df[F].mean()
        mdlcst[j, i] = df[TZ].mean()
    
    
    #plt.ylim(z[0],z[1])
    plt.legend(["PAME-1"], loc='best')
    font = {'size'   : 13}
    plt.rc('font', **font)
    plt.savefig('plots/results-'+datas[j]+'-'+base+'nweights.pdf', format='pdf')


Statistical Hypothesis Testing

Apply the Friedman test and Bonferroni-Dunn test as described by Janez Demsar.


In [3]:
pth = 'outputs2/'
base = 'hoeff'

algs = ['bagging-'+base,'pame1-bag-'+base, 'pame3-bag-'+base]
algs = [ 'bagging-'+base, 'pame1-bag-'+base, 'pame3-bag-'+base]

#algs = [base, 'bagging-'+base, 'boosting-'+base, 
#        'pame1-bag-'+base, 'pame2-bag-'+base, 'pame3-bag-'+base, 
#        'pame1-boo-'+base, 'pame2-boo-'+base, 'pame3-boo-'+base]
# 'spam' , 'poker'
datas = ['abalone_v2', 'airlines', 'breast-w' ,'car_v2' ,'cmc_v2' ,'colic' ,'cov_v2' ,'credit-a' ,
         'diabetes' ,'dow' ,'elecNormNew' ,'german' ,'haberman' ,'hepatitis' ,'hyper' ,'image' ,
         'magic' ,'noaa' ,'sea' ,'sick' ,'splice_v2']
datas=  ['acute-inflammation','acute-nephritis','adult_train','bank','blood','breast-cancer','breast-cancer-wisc-diag','breast-cancer-wisc-prog','chess-krvkp','congressional-voting','conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands','echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis','hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic','miniboone','molec-biol-promoter','mushroom','musk-1','musk-2','oocytes_merluccius_nucleus_4d','oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D','planning','ringnorm','spambase','spect_train','spectf_train','statlog-australian-credit','statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']
datas=  ['acute-inflammation','acute-nephritis','adult_train','bank','blood',
         'breast-cancer','breast-cancer-wisc-diag','chess-krvkp','congressional-voting',
         'conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands',
         'echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis',
         'hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic',
         'miniboone','molec-biol-promoter','mushroom','musk-2','oocytes_merluccius_nucleus_4d',
         'oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D',
         'planning','ringnorm','spambase','sea','spect_train','spectf_train','statlog-australian-credit',
         'statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
F = 'classifications correct (percent)'

accs = np.zeros((ndatas, nalg))
mdlcst = np.zeros((ndatas, nalg))

for j in range(ndatas):
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        accs[j, i] = df[F].mean()

Z = np.zeros((nalg, nalg))
ranks = np.zeros((ndatas, nalg))
for i in range(ndatas):
    ss = np.argsort(accs[i])
    for m, n in map(None, range(nalg), ss):
        ranks[i, n] = nalg - m
    #ranks[i] = nalg-rs[np.argsort(accs[i])]
R = ranks.mean(axis=0)

# chi-squared stat
XF = 12*ndatas/(nalg*(nalg+1)) * ((R**2).sum() - (nalg*(nalg+1)**2)/4)
FF = (ndatas-1)*XF/(ndatas*(nalg-1)-XF)


pF = 1 - f.cdf(FF, nalg-1, (nalg-1)*(ndatas-1))
if pF < 0.05: 
    print "Friedman: Reject the Null Hypothesis"
else:
    print "Friedman: Fail to Reject the Null Hypothesis"
print " "        

for i in range(nalg):
    for j in range(nalg):
        Z[i,j] = (R[i]-R[j])/(np.sqrt(1.*(nalg*(nalg+1))/(6*ndatas)))

        
s = ' & '
for alg in algs: 
    s += alg + ' & '
print s

for j in range(ndatas):
    s = datas[j] + ' & '
    for i in range(nalg-1):
        s += str(np.round(1000*accs[j,i])/1000) + ' (' + str(ranks[j,i]) + ') & '
    s += str(np.round(1000*accs[j,nalg-1])/1000) + ' (' + str(ranks[j,nalg-1]) + ') \\\\'
    print s
s = ' & '
for i in range(nalg-1):
    s += str(np.round(100*R[i])/100) + ' & '
s += str(np.round(100*R[nalg-1])/100) + ' \\\\ '
print s


Friedman: Reject the Null Hypothesis
 
 & bagging-hoeff & pame1-bag-hoeff & pame3-bag-hoeff & 
acute-inflammation & 84.83 (1.0) & 78.202 (3.0) & 82.458 (2.0) \\
acute-nephritis & 90.565 (1.0) & 80.279 (3.0) & 89.553 (2.0) \\
adult_train & 83.339 (1.0) & 76.508 (3.0) & 82.329 (2.0) \\
bank & 88.156 (1.0) & 80.933 (3.0) & 86.179 (2.0) \\
blood & 76.83 (1.0) & 66.993 (3.0) & 72.665 (2.0) \\
breast-cancer & 75.239 (2.0) & 61.518 (3.0) & 76.167 (1.0) \\
breast-cancer-wisc-diag & 86.184 (2.0) & 84.672 (3.0) & 88.953 (1.0) \\
chess-krvkp & 88.355 (1.0) & 83.645 (3.0) & 86.732 (2.0) \\
congressional-voting & 58.56 (1.0) & 51.374 (3.0) & 55.364 (2.0) \\
conn-bench-sonar-mines-rocks & 61.258 (1.0) & 50.602 (3.0) & 53.914 (2.0) \\
connect-4 & 75.42 (1.0) & 65.678 (3.0) & 74.364 (2.0) \\
credit-approval & 76.044 (1.0) & 72.899 (3.0) & 75.884 (2.0) \\
cylinder-bands & 65.389 (2.0) & 57.802 (3.0) & 67.845 (1.0) \\
echocardiogram & 77.366 (2.0) & 66.134 (3.0) & 79.733 (1.0) \\
fertility & 93.007 (1.0) & 91.678 (3.0) & 92.622 (2.0) \\
haberman-survival & 71.414 (1.0) & 63.545 (3.0) & 68.721 (2.0) \\
heart-hungarian & 74.863 (2.0) & 66.95 (3.0) & 77.539 (1.0) \\
hepatitis & 70.975 (2.0) & 54.17 (3.0) & 72.105 (1.0) \\
hill-valley_train & 48.109 (3.0) & 53.561 (1.0) & 51.765 (2.0) \\
horse-colic_train & 66.981 (1.0) & 61.532 (3.0) & 66.369 (2.0) \\
ilpd-indian-liver & 65.995 (1.0) & 57.6 (3.0) & 62.555 (2.0) \\
ionosphere & 78.968 (2.0) & 72.244 (3.0) & 80.181 (1.0) \\
magic & 78.993 (1.0) & 71.833 (3.0) & 78.767 (2.0) \\
mammographic & 78.374 (2.0) & 72.549 (3.0) & 78.412 (1.0) \\
miniboone & 89.767 (1.0) & 85.97 (3.0) & 89.015 (2.0) \\
molec-biol-promoter & 59.935 (3.0) & 63.036 (2.0) & 67.584 (1.0) \\
mushroom & 92.092 (3.0) & 96.276 (2.0) & 96.534 (1.0) \\
musk-2 & 85.156 (2.0) & 81.783 (3.0) & 85.632 (1.0) \\
oocytes_merluccius_nucleus_4d & 60.0 (2.0) & 54.867 (3.0) & 62.379 (1.0) \\
oocytes_trisopterus_nucleus_2f & 52.957 (3.0) & 54.611 (2.0) & 56.11 (1.0) \\
ozone & 97.502 (1.0) & 95.069 (3.0) & 97.165 (2.0) \\
parkinsons & 69.169 (2.0) & 63.716 (3.0) & 70.285 (1.0) \\
pima & 72.994 (1.0) & 60.844 (3.0) & 69.86 (2.0) \\
pittsburg-bridges-T-OR-D & 87.283 (2.0) & 76.544 (3.0) & 87.404 (1.0) \\
planning & 73.008 (1.0) & 60.47 (3.0) & 67.425 (2.0) \\
ringnorm & 98.18 (1.0) & 97.206 (3.0) & 98.01 (2.0) \\
spambase & 82.336 (2.0) & 78.887 (3.0) & 83.708 (1.0) \\
sea & 86.72 (1.0) & 78.441 (3.0) & 86.333 (2.0) \\
spect_train & 62.053 (1.0) & 60.568 (2.0) & 57.035 (3.0) \\
spectf_train & 60.163 (1.0) & 48.95 (3.0) & 57.035 (2.0) \\
statlog-australian-credit & 64.598 (1.0) & 51.716 (3.0) & 59.087 (2.0) \\
statlog-german-credit & 68.685 (1.0) & 58.884 (3.0) & 64.21 (2.0) \\
statlog-heart & 72.627 (1.0) & 63.123 (3.0) & 68.599 (2.0) \\
tic-tac-toe & 67.017 (1.0) & 57.373 (3.0) & 64.621 (2.0) \\
titanic & 76.955 (1.0) & 65.75 (3.0) & 74.784 (2.0) \\
twonorm & 97.201 (1.0) & 95.586 (3.0) & 97.007 (2.0) \\
vertebral-column-2clases & 72.269 (1.0) & 57.312 (3.0) & 67.164 (2.0) \\
 & 1.45 & 2.87 & 1.68 \\ 

In [9]:
pth = 'outputs2/'
base = 'hoeff'
algs = ['bagging-'+base,'pame1-bag-'+base, 'pame3-bag-'+base]
#algs = [base, 'bagging-'+base, 'boosting-'+base, 
#        'pame1-bag-'+base, 'pame2-bag-'+base, 'pame3-bag-'+base, 
#        'pame1-boo-'+base, 'pame2-boo-'+base, 'pame3-boo-'+base]
# 'spam' , 'poker'
datas = ['abalone_v2', 'airlines', 'breast-w' ,'car_v2' ,'cmc_v2' ,'colic' ,'cov_v2' ,'credit-a' ,
         'diabetes' ,'dow' ,'elecNormNew' ,'german' ,'haberman' ,'hepatitis' ,'hyper' ,'image' ,
         'magic' ,'noaa' ,'sea' ,'sick' ,'splice_v2']
datas=  ['acute-inflammation','acute-nephritis','adult_train','bank','blood','breast-cancer','breast-cancer-wisc-diag','breast-cancer-wisc','breast-cancer-wisc-prog','chess-krvkp','congressional-voting','conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands','echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis','hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic','miniboone','molec-biol-promoter','monks-1_train','monks-2_train','monks-3_train','mushroom','musk-1','musk-2','oocytes_merluccius_nucleus_4d','oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D','planning','ringnorm','spambase','spect_train','spectf_train','statlog-australian-credit','statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
F = 'model cost (RAM-Hours)'
mdlcst = np.zeros((ndatas, nalg))

for j in range(ndatas):
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        mdlcst[j, i] = df[F].mean()
        
Z = np.zeros((nalg, nalg))
ranks = np.zeros((ndatas, nalg))
for i in range(ndatas):
    ss = np.argsort(accs[i])
    for m, n in map(None, range(nalg), ss):
        ranks[i, n] = 1 + m
R = ranks.mean(axis=0)

# chi-squared stat
XF = 12*ndatas/(nalg*(nalg+1)) * ((R**2).sum() - (nalg*(nalg+1)**2)/4)
FF = (ndatas-1)*XF/(ndatas*(nalg-1)-XF)

pF = 1 - f.cdf(FF, nalg-1, (nalg-1)*(ndatas-1))
if pF < 0.05: 
    print "Friedman: Reject the Null Hypothesis"
else:
    print "Friedman: Fail to Reject the Null Hypothesis"
print " "        


for i in range(nalg):
    for j in range(nalg):
        Z[i,j] = (R[i]-R[j])/(np.sqrt(1.*(nalg*(nalg+1))/(6*ndatas)))
        
for j in range(ndatas):
    s = datas[j] + ' & '
    for i in range(nalg-1):
        s += str(100000000*mdlcst[j,i]) + ' (' + str(ranks[j,i]) + ') & '
    s += str(mdlcst[j,nalg-1]) + ' (' + str(ranks[j,nalg-1]) + ') \\\\'
    print s
s = ' & '
for i in range(nalg-1):
    s += str(np.round(100*R[i])/100) + ' & '
s += str(np.round(100*R[nalg-1])/100) + ' \\\\ '
print s


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-9-648c2de6cbdf> in <module>()
     25     ss = np.argsort(accs[i])
     26     for m, n in map(None, range(nalg), ss):
---> 27         ranks[i, n] = 1 + m
     28 R = ranks.mean(axis=0)
     29 

IndexError: index 3 is out of bounds for axis 1 with size 3

In [5]:
for j in range(ndatas):
    s = datas[j] + ' & '
    for i in range(nalg-1):
        s += str(np.round(1000*accs[j,i])/1000) + ' (' + str(ranks[j,i]) + ') & '
    s += str(np.round(1000*accs[j,nalg-1])/1000) + ' (' + str(ranks[j,nalg-1]) + ') \\\\'
    print s
s = ' & '
for i in range(nalg-1):
    s += str(np.round(100*R[i])/100) + ' & '
s += str(np.round(100*R[nalg-1])/100) + ' \\\\ '
print s


acute-inflammation & 84.83 (3.0) & 78.202 (1.0) & 82.458 (2.0) \\
acute-nephritis & 90.565 (3.0) & 80.279 (1.0) & 89.553 (2.0) \\
adult_train & 83.339 (3.0) & 76.508 (1.0) & 82.329 (2.0) \\
bank & 88.156 (3.0) & 80.933 (1.0) & 86.179 (2.0) \\
blood & 76.83 (3.0) & 66.993 (1.0) & 72.665 (2.0) \\
breast-cancer & 75.239 (2.0) & 61.518 (1.0) & 76.167 (3.0) \\
breast-cancer-wisc-diag & 86.184 (2.0) & 84.672 (1.0) & 88.953 (3.0) \\
breast-cancer-wisc & 90.244 (3.0) & 83.883 (1.0) & 88.004 (2.0) \\
breast-cancer-wisc-prog & 75.671 (3.0) & 68.78 (2.0) & 64.922 (1.0) \\
chess-krvkp & 88.355 (3.0) & 83.645 (1.0) & 86.732 (2.0) \\
congressional-voting & 58.56 (3.0) & 51.374 (1.0) & 55.364 (2.0) \\
conn-bench-sonar-mines-rocks & 61.258 (3.0) & 50.602 (1.0) & 53.914 (2.0) \\
connect-4 & 75.42 (3.0) & 65.678 (1.0) & 74.364 (2.0) \\
credit-approval & 76.044 (3.0) & 72.899 (1.0) & 75.884 (2.0) \\
cylinder-bands & 65.389 (2.0) & 57.802 (1.0) & 67.845 (3.0) \\
echocardiogram & 77.366 (2.0) & 66.134 (1.0) & 79.733 (3.0) \\
fertility & 93.007 (3.0) & 91.678 (1.0) & 92.622 (2.0) \\
haberman-survival & 71.414 (3.0) & 63.545 (1.0) & 68.721 (2.0) \\
heart-hungarian & 74.863 (2.0) & 66.95 (1.0) & 77.539 (3.0) \\
hepatitis & 70.975 (2.0) & 54.17 (1.0) & 72.105 (3.0) \\
hill-valley_train & 48.109 (1.0) & 53.561 (3.0) & 51.765 (2.0) \\
horse-colic_train & 66.981 (3.0) & 61.532 (1.0) & 66.369 (2.0) \\
ilpd-indian-liver & 65.995 (3.0) & 57.6 (1.0) & 62.555 (2.0) \\
ionosphere & 78.968 (2.0) & 72.244 (1.0) & 80.181 (3.0) \\
magic & 78.993 (3.0) & 71.833 (1.0) & 78.767 (2.0) \\
mammographic & 78.374 (2.0) & 72.549 (1.0) & 78.412 (3.0) \\
miniboone & 89.767 (3.0) & 85.97 (1.0) & 89.015 (2.0) \\
molec-biol-promoter & 59.935 (1.0) & 63.036 (2.0) & 67.584 (3.0) \\
monks-1_train & 65.17 (3.0) & 47.559 (1.0) & 60.434 (2.0) \\
monks-2_train & 57.147 (3.0) & 45.969 (2.0) & 44.051 (1.0) \\
monks-3_train & 68.119 (1.0) & 73.855 (3.0) & 69.09 (2.0) \\
mushroom & 92.092 (1.0) & 96.276 (2.0) & 96.534 (3.0) \\
musk-1 & 67.414 (3.0) & 58.879 (1.0) & 66.898 (2.0) \\
musk-2 & 85.156 (2.0) & 81.783 (1.0) & 85.632 (3.0) \\
oocytes_merluccius_nucleus_4d & 60.0 (2.0) & 54.867 (1.0) & 62.379 (3.0) \\
oocytes_trisopterus_nucleus_2f & 52.957 (1.0) & 54.611 (2.0) & 56.11 (3.0) \\
ozone & 97.502 (3.0) & 95.069 (1.0) & 97.165 (2.0) \\
parkinsons & 69.169 (2.0) & 63.716 (1.0) & 70.285 (3.0) \\
pima & 72.994 (3.0) & 60.844 (1.0) & 69.86 (2.0) \\
pittsburg-bridges-T-OR-D & 87.283 (2.0) & 76.544 (1.0) & 87.404 (3.0) \\
planning & 73.008 (3.0) & 60.47 (1.0) & 67.425 (2.0) \\
ringnorm & 98.18 (3.0) & 97.206 (1.0) & 98.01 (2.0) \\
spambase & 82.336 (2.0) & 78.887 (1.0) & 83.708 (3.0) \\
spect_train & 62.053 (3.0) & 60.568 (2.0) & 57.035 (1.0) \\
spectf_train & 60.163 (3.0) & 48.95 (1.0) & 57.035 (2.0) \\
statlog-australian-credit & 64.598 (3.0) & 51.716 (1.0) & 59.087 (2.0) \\
statlog-german-credit & 68.685 (3.0) & 58.884 (1.0) & 64.21 (2.0) \\
statlog-heart & 72.627 (3.0) & 63.123 (1.0) & 68.599 (2.0) \\
tic-tac-toe & 67.017 (3.0) & 57.373 (1.0) & 64.621 (2.0) \\
titanic & 76.955 (3.0) & 65.75 (1.0) & 74.784 (2.0) \\
twonorm & 97.201 (3.0) & 95.586 (1.0) & 97.007 (2.0) \\
vertebral-column-2clases & 72.269 (3.0) & 57.312 (1.0) & 67.164 (2.0) \\
 & 2.56 & 1.19 & 2.25 \\ 

In [ ]:
df = pd.read_csv('outputs/kl-elec.csv')
out = plt.hist(df['KL-div'], 20)
plt.xlabel('KL-divergence')
plt.ylabel('Relative Frequency')
font = {'size'   : 13}
plt.rc('font', **font)
plt.savefig('plots/kl-elec.pdf', format='pdf')

plt.figure()
df = pd.read_csv('outputs/kl-abalone.csv')
out = plt.hist(df['KL-div'], 20)
plt.xlabel('KL-divergence')
plt.ylabel('Relative Frequency')
font = {'size'   : 13}
plt.rc('font', **font)
plt.savefig('plots/kl-abalone.pdf', format='pdf')

In [7]:
_,p = wilcoxon(accs[:,0], accs[:,2])
print "p: ", p


p:  0.0192590939371

In [7]:
pth = 'outputs2/'
base = 'hoeff'
algs = [ 'bagging-'+base, 'pame1-bag-'+base, 'pame3-bag-'+base]
algs = ['bagging-'+base, 'boosting-'+base, 'pame1-boo-'+base, 'pame3-boo-'+base]
#algs = [base, 'bagging-'+base, 'boosting-'+base, 
#        'pame1-bag-'+base, 'pame2-bag-'+base, 'pame3-bag-'+base, 
#        'pame1-boo-'+base, 'pame2-boo-'+base, 'pame3-boo-'+base]
# 'spam' , 'poker'
datas = ['abalone_v2', 'airlines', 'breast-w' ,'car_v2' ,'cmc_v2' ,'colic' ,'cov_v2' ,'credit-a' ,
         'diabetes' ,'dow' ,'elecNormNew' ,'german' ,'haberman' ,'hepatitis' ,'hyper' ,'image' ,
         'magic' ,'noaa' ,'sea' ,'sick' ,'splice_v2']
datas=  ['acute-inflammation','acute-nephritis','adult_train','bank','blood','breast-cancer','breast-cancer-wisc-diag','breast-cancer-wisc','breast-cancer-wisc-prog','chess-krvkp','congressional-voting','conn-bench-sonar-mines-rocks','connect-4','credit-approval','cylinder-bands','echocardiogram','fertility','haberman-survival','heart-hungarian','hepatitis','hill-valley_train','horse-colic_train','ilpd-indian-liver','ionosphere','magic','mammographic','miniboone','molec-biol-promoter','monks-1_train','monks-2_train','monks-3_train','mushroom','musk-1','musk-2','oocytes_merluccius_nucleus_4d','oocytes_trisopterus_nucleus_2f','ozone','parkinsons','pima','pittsburg-bridges-T-OR-D','planning','ringnorm','spambase','spect_train','spectf_train','statlog-australian-credit','statlog-german-credit','statlog-heart','tic-tac-toe','titanic','twonorm','vertebral-column-2clases']
nalg = len(algs)
ndatas = len(datas)
F = 'classifications correct (percent)'
alpha = .05
idxs = []

accs = np.zeros((ndatas, nalg))
mdlcst = np.zeros((ndatas, nalg))

for j in range(ndatas):
    mmax = 0
    idx = 0
    for i in range(nalg):
        df = pd.read_csv(pth+'results-'+datas[j]+'-'+algs[i]+'.csv')
        accs[j, i] = df[F].mean()
        if accs[j, i] > mmax:
            mmax = accs[j, i]
            idx = i
    idxs.append(idx)

_,p_ba_p1 = wilcoxon(accs[:,0], accs[:,2])
_,p_ba_p3 = wilcoxon(accs[:,0], accs[:,3])
_,p_bo_p1 = wilcoxon(accs[:,1], accs[:,2])
_,p_bo_p3 = wilcoxon(accs[:,1], accs[:,3])

if p_ba_p1 <= alpha:
    if accs[:,2].sum() > accs[:,0].sum():
        print "PAME1 is better than Bagging ("+str(p_ba_p1)+")"
if p_ba_p3 <= alpha:
    if accs[:,3].sum() > accs[:,0].sum():
        print "PAME3 is better than Bagging ("+str(p_ba_p3)+")"
if p_bo_p1 <= alpha:
    if accs[:,2].sum() > accs[:,1].sum():
        print "PAME1 is better than Boosting ("+str(p_bo_p1)+")"
if p_bo_p3 <= alpha:
    if accs[:,3].sum() > accs[:,1].sum():
        print "PAME3 is better than Boosting ("+str(p_bo_p3)+")"
        
print ' '
print accs[:,2].sum(), ' ', accs[:,0].sum()
print accs[:,2].sum(), ' ', accs[:,1].sum()
print accs[:,3].sum(), ' ', accs[:,0].sum()
print accs[:,3].sum(), ' ', accs[:,1].sum()
print p_ba_p1, p_ba_p3, p_bo_p1,p_bo_p3
print ' '
print ' '

for j in range(ndatas):
    s = datas[j].replace('_train', '').replace('_', '-') + ' & '
    for i in range(nalg-1):
        if idxs[j] == i:
            s += str(1.*int(100*accs[j,i])/100) + '$^*$ & '
        else:
            s += str(1.*int(100*accs[j,i])/100) + ' & '
    if idxs[j] == nalg-1:
        s += str(1.*int(100*accs[j,nalg-1])/100)  + '$^*$ \\\\'
    else:
        s += str(1.*int(100*accs[j,nalg-1])/100)  + ' \\\\'
            
    print s


 
3600.34453849   3900.92869147
3600.34453849   3864.03691756
3629.24036737   3900.92869147
3629.24036737   3864.03691756
1.39819973996e-08 9.95484209119e-08 8.56193591127e-08 2.09232210047e-07
 
 
acute-inflammation & 84.82$^*$ & 78.13 & 75.61 & 83.03 \\
acute-nephritis & 90.56$^*$ & 87.53 & 79.87 & 85.95 \\
adult & 83.33$^*$ & 81.56 & 77.21 & 76.61 \\
bank & 88.15$^*$ & 86.42 & 82.21 & 79.99 \\
blood & 76.83$^*$ & 69.31 & 67.84 & 66.9 \\
breast-cancer & 75.23$^*$ & 70.21 & 67.18 & 65.11 \\
breast-cancer-wisc-diag & 86.18 & 89.73$^*$ & 87.26 & 88.33 \\
breast-cancer-wisc & 90.24$^*$ & 89.66 & 85.26 & 87.55 \\
breast-cancer-wisc-prog & 75.67$^*$ & 64.08 & 65.64 & 65.17 \\
chess-krvkp & 88.35 & 90.35$^*$ & 87.26 & 87.12 \\
congressional-voting & 58.55$^*$ & 55.88 & 52.9 & 51.55 \\
conn-bench-sonar-mines-rocks & 61.25 & 65.76$^*$ & 57.41 & 57.72 \\
connect-4 & 75.42$^*$ & 72.54 & 68.65 & 69.35 \\
credit-approval & 76.04 & 77.16$^*$ & 72.01 & 67.62 \\
cylinder-bands & 65.38 & 69.39$^*$ & 58.84 & 61.53 \\
echocardiogram & 77.36 & 77.73$^*$ & 67.37 & 67.48 \\
fertility & 93.0$^*$ & 91.32 & 92.24 & 78.4 \\
haberman-survival & 71.41$^*$ & 65.09 & 67.65 & 62.92 \\
heart-hungarian & 74.86 & 77.36$^*$ & 68.93 & 70.87 \\
hepatitis & 70.97 & 72.7$^*$ & 60.3 & 69.17 \\
hill-valley & 48.1 & 47.34 & 47.89 & 52.38$^*$ \\
horse-colic & 66.98 & 70.14$^*$ & 64.06 & 62.28 \\
ilpd-indian-liver & 65.99$^*$ & 58.85 & 59.44 & 59.34 \\
ionosphere & 78.96 & 84.56$^*$ & 73.87 & 80.94 \\
magic & 78.99 & 82.39$^*$ & 74.83 & 73.61 \\
mammographic & 78.37 & 79.72$^*$ & 71.04 & 68.49 \\
miniboone & 89.76 & 90.24$^*$ & 87.66 & 88.5 \\
molec-biol-promoter & 59.93 & 68.13$^*$ & 65.28 & 65.37 \\
monks-1 & 65.16$^*$ & 62.03 & 47.48 & 51.53 \\
monks-2 & 57.14 & 57.57$^*$ & 49.89 & 43.48 \\
monks-3 & 68.11$^*$ & 56.44 & 67.02 & 61.74 \\
mushroom & 92.09 & 96.94 & 96.96 & 97.11$^*$ \\
musk-1 & 67.41 & 67.48$^*$ & 61.29 & 65.36 \\
musk-2 & 85.15 & 90.05$^*$ & 86.41 & 83.45 \\
oocytes-merluccius-nucleus-4d & 59.99 & 61.71$^*$ & 57.4 & 57.93 \\
oocytes-trisopterus-nucleus-2f & 52.95 & 57.55$^*$ & 54.27 & 53.73 \\
ozone & 97.5$^*$ & 96.88 & 95.93 & 95.62 \\
parkinsons & 69.16 & 69.38$^*$ & 65.33 & 65.51 \\
pima & 72.99$^*$ & 70.43 & 62.19 & 60.86 \\
pittsburg-bridges-T-OR-D & 87.28 & 87.41$^*$ & 75.17 & 84.99 \\
planning & 73.0$^*$ & 66.04 & 62.8 & 63.15 \\
ringnorm & 98.17$^*$ & 97.29 & 96.77 & 97.07 \\
spambase & 82.33 & 87.03$^*$ & 83.0 & 82.09 \\
spect & 62.05$^*$ & 56.61 & 60.44 & 58.64 \\
spectf & 60.16 & 66.93$^*$ & 45.51 & 56.4 \\
statlog-australian-credit & 64.59$^*$ & 53.18 & 52.23 & 54.2 \\
statlog-german-credit & 68.68$^*$ & 65.2 & 59.88 & 58.0 \\
statlog-heart & 72.62 & 73.53$^*$ & 57.38 & 61.57 \\
tic-tac-toe & 67.01 & 70.7$^*$ & 58.82 & 56.99 \\
titanic & 76.95$^*$ & 74.97 & 66.29 & 65.61 \\
twonorm & 97.2$^*$ & 96.12 & 94.13 & 95.73 \\
vertebral-column-2clases & 72.26$^*$ & 69.07 & 57.8 & 64.92 \\

In [8]:
print p_ba_p1, p_ba_p3, p_bo_p1,p_bo_p3


1.39819973996e-08 9.95484209119e-08 8.56193591127e-08 2.09232210047e-07

In [13]:
print len(datas)


46

In [5]:
print accs


[[ 84.82997828  78.20206216  82.45784813]
 [ 90.56544665  80.27906433  89.55277732]
 [ 83.33922315  76.5083016   82.32861132]
 [ 88.15643793  80.93265389  86.17900637]
 [ 76.83020771  66.99294981  72.66525671]
 [ 75.23931483  61.5178909   76.16689414]
 [ 86.18404949  84.67247893  88.95258561]
 [ 88.35464891  83.64507999  86.73244566]
 [ 58.55974025  51.37415424  55.36402098]
 [ 61.25785971  50.60173375  53.913984  ]
 [ 75.42001775  65.67815385  74.36410059]
 [ 76.04415531  72.89853852  75.88441993]
 [ 65.38946889  57.80233482  67.84491852]
 [ 77.36614395  66.13442162  79.73285534]
 [ 93.00677164  91.67828312  92.62194625]
 [ 71.41439888  63.54533345  68.72138525]
 [ 74.86334885  66.95010766  77.53938256]
 [ 70.97497246  54.16959206  72.10492252]
 [ 48.10921407  53.56066597  51.76470049]
 [ 66.98091518  61.53156754  66.36890491]
 [ 65.99494231  57.59963063  62.55486073]
 [ 78.96764935  72.24437795  80.18052729]
 [ 78.99265242  71.83258642  78.76652771]
 [ 78.37405101  72.54930807  78.41230281]
 [ 89.767       85.97        89.015     ]
 [ 59.93499346  63.03628313  67.58400228]
 [ 92.0921702   96.27581229  96.53414638]
 [ 85.15643172  81.78260074  85.63203884]
 [ 59.9996575   54.86654092  62.37883623]
 [ 52.95696525  54.61080309  56.10984427]
 [ 97.50249966  95.06918185  97.16503599]
 [ 69.16891008  63.71640545  70.28549954]
 [ 72.99397469  60.84429689  69.85990512]
 [ 87.28325649  76.54449155  87.40409925]
 [ 73.00809515  60.46980254  67.42526321]
 [ 98.17996651  97.20580573  98.01035387]
 [ 82.33604351  78.88713232  83.70824298]
 [ 86.7204      78.4406      86.3332    ]
 [ 62.05306953  60.56793232  57.03457643]
 [ 60.16296486  48.94964954  57.03524513]
 [ 64.59766617  51.71558763  59.08667748]
 [ 68.68454113  58.88443997  64.20982753]
 [ 72.62749251  63.12313309  68.5986626 ]
 [ 67.01716807  57.37250352  64.62146299]
 [ 76.95490104  65.75049943  74.78387605]
 [ 97.20125183  95.58572551  97.00724347]
 [ 72.26924388  57.31217613  67.164498  ]]

In [6]:
accs.shape


Out[6]:
(47, 3)

In [ ]: