In [13]:
import os
from sklearn import metrics
from bionlp.util import fs, io
# Directory holding the saved prediction files, and the classifiers to compare.
DATA_PATH = '../data/chmannot/origcmp'
METHODS = ['UDT-RF', 'DF-RbfSVM', 'MNB', 'MEM']

# scores[method] -> list with one micro-averaged F1 value per prediction file
# found for that method (presumably one file per cross-validation fold, going
# by the 'pred_crsval_<n>_' file-name prefix -- TODO confirm).
scores = {m: [] for m in METHODS}
for method in METHODS:
    # File names embed the method name lower-cased, with spaces replaced by underscores.
    fname_pattern = 'pred_crsval_[0-9]*_%s_.*' % method.replace(' ', '_').lower()
    # Sorting the paths makes the order of the collected scores deterministic.
    for fpath in sorted(fs.listf(DATA_PATH, pattern=fname_pattern, full_path=True)):
        fold = io.read_npz(fpath)
        predicted, actual = fold['pred_lb'], fold['true_lb']
        # F-beta with beta=1 is the F1 score; 'micro' averaging is requested explicitly.
        scores[method].append(metrics.fbeta_score(actual, predicted, beta=1, average='micro'))
In [15]:
import itertools
import numpy as np
import pandas as pd
from scipy import stats
# Significance level for Levene's test: a p-value below this threshold rejects
# the null hypothesis that the two score samples have equal variances.
EQVAR_PVAL = 0.05

# One METHODS x METHODS table per statistic / p-value. Only the cells whose row
# method precedes the column method in METHODS are filled in by the loop below;
# the diagonal and the lower triangle keep their initial value of 0.
# float64 avoids rounding the stored statistics and p-values to single precision.
dummy_arr = np.zeros((len(METHODS), len(METHODS)), dtype='float64')
ttest_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
ttest_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
rnksm_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
rnksm_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)

# Compare every unordered pair of methods on their per-file scores.
for m_pair in itertools.combinations(METHODS, 2):
    row, col = m_pair
    lvn = stats.levene(scores[row], scores[col])
    # Equal variances may be assumed only when Levene's test does NOT reject its
    # null hypothesis (p-value at or above the threshold). Otherwise
    # equal_var=False makes ttest_ind perform Welch's unequal-variance t-test.
    equal_var = lvn.pvalue >= EQVAR_PVAL
    ttest = stats.ttest_ind(scores[row], scores[col], equal_var=equal_var)
    # Wilcoxon rank-sum test as a non-parametric counterpart to the t-test.
    rnksm = stats.ranksums(scores[row], scores[col])
    # Label-based .loc replaces the .ix indexer, which is no longer available in pandas.
    ttest_df.loc[row, col], ttest_pval_df.loc[row, col] = ttest.statistic, ttest.pvalue
    rnksm_df.loc[row, col], rnksm_pval_df.loc[row, col] = rnksm.statistic, rnksm.pvalue

# Persist each table as its own workbook in the current working directory.
ttest_df.to_excel('ttest.xlsx')
ttest_pval_df.to_excel('ttest_pval.xlsx')
rnksm_df.to_excel('rnksm.xlsx')
rnksm_pval_df.to_excel('rnksm_pval.xlsx')