Read the cross-validation intermediate results (per-fold predictions) from disk


In [1]:
import os
from sklearn import metrics
from bionlp.util import fs, io

DATA_PATH = '../data/chmannot/filtx'
METHODS = ['RandomForest', 'RbfSVM102-2', 'RbfSVM103-2', 'RbfSVM102-3', 'RbfSVM103-3', 'L1-LinSVC', 'Perceptron', 'MNB', 'MEM', 'ExtraTrees']
LABEL_NUM = 10

# Collect the per-fold micro F1 score of every method for every label:
# lb_scores[label][method] -> list of fold-level micro F1 values.
lb_scores = []
for lb in range(LABEL_NUM):  # `range` (not `xrange`) works on both Python 2 and 3
    scores = {m: [] for m in METHODS}
    for method in METHODS:
        # One prediction file per CV fold; sort so folds come in a stable order.
        # NOTE(review): `fs.listf`/`io.read_npz` are project helpers — presumably
        # a directory glob and an .npz loader; confirm their exact semantics.
        fpaths = sorted(fs.listf(DATA_PATH, pattern='pred_crsval_[0-9]*_%s_%i.npz' % (method.replace(' ', '_').lower(), lb), full_path=True))
        for fpath in fpaths:
            npz_file = io.read_npz(fpath)
            pred_lb, true_lb = npz_file['pred_lb'], npz_file['true_lb']
            # F-beta with beta=1 is exactly the micro-averaged F1 score.
            micro_fscore = metrics.fbeta_score(true_lb, pred_lb, beta=1, average='micro')
            scores[method].append(micro_fscore)
    lb_scores.append(scores)

Calculate the t-test and the rank-sum test for each pair of methods


In [11]:
import itertools

import numpy as np
import pandas as pd
from scipy import stats

EQVAR_PVAL = 0.05  # significance level (alpha) for Levene's test of equal variances

# Template for the method-by-method statistic matrices (upper triangle filled).
dummy_arr = np.zeros((len(METHODS), len(METHODS)), dtype='float32')

# Per-label matrices of pairwise test statistics and p-values.
ttest_dfs, ttest_pval_dfs, rnksm_dfs, rnksm_pval_dfs = [[] for x in range(4)]
for lb, scores in enumerate(lb_scores):
    ttest_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
    ttest_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
    rnksm_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
    rnksm_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)

    for m_pair in itertools.combinations(METHODS, 2):
        # Levene's null hypothesis is that the two samples have EQUAL variances,
        # so equal variances may be assumed only when we fail to reject the null
        # (p >= alpha). The original `pvalue < EQVAR_PVAL` had this inverted,
        # making ttest_ind run Student's t exactly when variances differ.
        lvn = stats.levene(scores[m_pair[0]], scores[m_pair[1]])
        equal_var = lvn.pvalue >= EQVAR_PVAL
        ttest = stats.ttest_ind(scores[m_pair[0]], scores[m_pair[1]], equal_var=equal_var)
        rnksm = stats.ranksums(scores[m_pair[0]], scores[m_pair[1]])
        # `.loc[row, col]` replaces the deprecated (and later removed) `.ix`.
        ttest_df.loc[m_pair[0], m_pair[1]] = ttest.statistic
        ttest_pval_df.loc[m_pair[0], m_pair[1]] = ttest.pvalue
        rnksm_df.loc[m_pair[0], m_pair[1]] = rnksm.statistic
        rnksm_pval_df.loc[m_pair[0], m_pair[1]] = rnksm.pvalue

    ttest_dfs.append(ttest_df)
    ttest_pval_dfs.append(ttest_pval_df)
    rnksm_dfs.append(rnksm_df)
    rnksm_pval_dfs.append(rnksm_pval_df)
    ttest_df.to_excel('ttest_%i.xlsx' % lb)
    ttest_pval_df.to_excel('ttest_pval_%i.xlsx' % lb)
    rnksm_df.to_excel('rnksm_%i.xlsx' % lb)
    rnksm_pval_df.to_excel('rnksm_pval_%i.xlsx' % lb)

# Average each matrix over all labels. `.values` replaces the removed
# `DataFrame.as_matrix()` API.
ttest_df_avg = pd.DataFrame(np.array([df.values for df in ttest_dfs]).mean(axis=0), index=METHODS, columns=METHODS)
ttest_pval_df_avg = pd.DataFrame(np.array([df.values for df in ttest_pval_dfs]).mean(axis=0), index=METHODS, columns=METHODS)
rnksm_df_avg = pd.DataFrame(np.array([df.values for df in rnksm_dfs]).mean(axis=0), index=METHODS, columns=METHODS)
rnksm_pval_df_avg = pd.DataFrame(np.array([df.values for df in rnksm_pval_dfs]).mean(axis=0), index=METHODS, columns=METHODS)

ttest_df_avg.to_excel('ttest_avg.xlsx')
ttest_pval_df_avg.to_excel('ttest_pval_avg.xlsx')
rnksm_df_avg.to_excel('rnksm_avg.xlsx')
rnksm_pval_df_avg.to_excel('rnksm_pval_avg.xlsx')