Read the intermediate cross-validation results and compute the micro-averaged F1 score of each result file


In [13]:
import os
from sklearn import metrics
from bionlp.util import fs, io

# Directory holding the saved cross-validation predictions, and the methods to compare.
DATA_PATH = '../data/chmannot/origcmp'
METHODS = ['UDT-RF', 'DF-RbfSVM', 'MNB', 'MEM']

# scores maps each method name to a list of micro-averaged F1 scores,
# one per prediction file matched for that method (in sorted path order).
scores = {name: [] for name in METHODS}
for method in METHODS:
    # The file-name pattern embeds the method name lower-cased, with spaces
    # replaced by underscores.
    method_tag = method.replace(' ', '_').lower()
    fpaths = sorted(fs.listf(DATA_PATH, pattern='pred_crsval_[0-9]*_%s_.*' % method_tag, full_path=True))
    for fpath in fpaths:
        npz_file = io.read_npz(fpath)
        pred_lb = npz_file['pred_lb']
        true_lb = npz_file['true_lb']
        # beta=1 makes the F-beta score the F1 score.
        scores[method].append(metrics.fbeta_score(true_lb, pred_lb, beta=1, average='micro'))

Compute the t-test and the rank-sum test for every pair of methods


In [15]:
import itertools

import numpy as np
import pandas as pd
from scipy import stats

# Significance level for Levene's test of equal variances.
EQVAR_PVAL = 0.05

# Square method-by-method result tables. Only the cell (first method, second
# method) of each unordered pair is filled in, i.e. the upper triangle; the
# diagonal and the lower triangle keep their initial value of 0.
# float64 keeps the statistics and p-values returned by SciPy without
# truncating them.
dummy_arr = np.zeros((len(METHODS), len(METHODS)), dtype='float64')
ttest_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
ttest_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
rnksm_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
rnksm_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)

for m_pair in itertools.combinations(METHODS, 2):
    row_method, col_method = m_pair
    row_scores, col_scores = scores[row_method], scores[col_method]

    # Levene's null hypothesis is that both samples have equal variances, so
    # equal variances are assumed only when the test does NOT reject it.
    # Otherwise ttest_ind is run with equal_var=False (Welch's t-test).
    lvn = stats.levene(row_scores, col_scores)
    equal_var = lvn.pvalue >= EQVAR_PVAL

    ttest = stats.ttest_ind(row_scores, col_scores, equal_var=equal_var)
    rnksm = stats.ranksums(row_scores, col_scores)

    # Label-based .loc replaces the .ix indexer, which was removed from pandas.
    ttest_df.loc[row_method, col_method] = ttest.statistic
    ttest_pval_df.loc[row_method, col_method] = ttest.pvalue
    rnksm_df.loc[row_method, col_method] = rnksm.statistic
    rnksm_pval_df.loc[row_method, col_method] = rnksm.pvalue

# Persist each table as its own Excel workbook in the current working directory.
ttest_df.to_excel('ttest.xlsx')
ttest_pval_df.to_excel('ttest_pval.xlsx')
rnksm_df.to_excel('rnksm.xlsx')
rnksm_pval_df.to_excel('rnksm_pval.xlsx')