In [1]:
import os
from sklearn import metrics
from bionlp.util import fs, io
# Directory that holds the cross-validation prediction dumps (pred_crsval_*.npz).
DATA_PATH = '../data/chmannot/filtx'
# Classifier names; lower-cased (with spaces turned into underscores) they form
# part of the prediction file names searched for below.
METHODS = ['RandomForest', 'RbfSVM102-2', 'RbfSVM103-2', 'RbfSVM102-3', 'RbfSVM103-3', 'L1-LinSVC', 'Perceptron', 'MNB', 'MEM', 'ExtraTrees']
# Number of labels; the label index is the last component of each file name.
LABEL_NUM = 10

# lb_scores[lb][method] -> list of micro-averaged F1 scores, one per matching
# prediction file (presumably one file per cross-validation fold -- TODO confirm).
lb_scores = []
# `range` instead of `xrange`: it iterates identically on Python 2 and, unlike
# `xrange`, also exists on Python 3.
for lb in range(LABEL_NUM):
    scores = {m: [] for m in METHODS}
    for method in METHODS:
        fname_pattern = 'pred_crsval_[0-9]*_%s_%i.npz' % (method.replace(' ', '_').lower(), lb)
        # Sorted so that scores are always appended in the same order.
        fpaths = sorted(fs.listf(DATA_PATH, pattern=fname_pattern, full_path=True))
        for fpath in fpaths:
            npz_file = io.read_npz(fpath)
            pred_lb, true_lb = npz_file['pred_lb'], npz_file['true_lb']
            # F-beta with beta=1 is the F1 score; 'micro' averaging is requested.
            micro_fscore = metrics.fbeta_score(true_lb, pred_lb, beta=1, average='micro')
            scores[method].append(micro_fscore)
    lb_scores.append(scores)
In [11]:
import itertools
import numpy as np
import pandas as pd
from scipy import stats
# Significance level for Levene's test of equal variances.
EQVAR_PVAL = 0.05
# Zero-filled template for the method-by-method result matrices. float64 so the
# double-precision statistics returned by scipy are stored without rounding.
dummy_arr = np.zeros((len(METHODS), len(METHODS)), dtype='float64')
# One result matrix per label for each of the four quantities.
ttest_dfs, ttest_pval_dfs, rnksm_dfs, rnksm_pval_dfs = [[] for x in range(4)]
for lb, scores in enumerate(lb_scores):
    ttest_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
    ttest_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
    rnksm_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
    rnksm_pval_df = pd.DataFrame(np.copy(dummy_arr), index=METHODS, columns=METHODS)
    # Only unordered pairs are visited, so just the cells above the diagonal are
    # filled in; the diagonal and the lower triangle keep their initial zeros.
    for m_pair in itertools.combinations(METHODS, 2):
        m_row, m_col = m_pair
        scores_row, scores_col = scores[m_row], scores[m_col]
        lvn = stats.levene(scores_row, scores_col)
        # Levene's null hypothesis is that the variances are EQUAL, so equality
        # may only be assumed when the test fails to reject it (p >= alpha).
        # Otherwise ttest_ind is asked for Welch's unequal-variance t-test.
        equal_var = lvn.pvalue >= EQVAR_PVAL
        ttest = stats.ttest_ind(scores_row, scores_col, equal_var=equal_var)
        rnksm = stats.ranksums(scores_row, scores_col)
        # `.loc[row, col]` replaces `.ix[...]`, which is gone from current pandas.
        ttest_df.loc[m_row, m_col] = ttest.statistic
        ttest_pval_df.loc[m_row, m_col] = ttest.pvalue
        rnksm_df.loc[m_row, m_col] = rnksm.statistic
        rnksm_pval_df.loc[m_row, m_col] = rnksm.pvalue
    ttest_dfs.append(ttest_df)
    ttest_pval_dfs.append(ttest_pval_df)
    rnksm_dfs.append(rnksm_df)
    rnksm_pval_dfs.append(rnksm_pval_df)
    # Per-label spreadsheets.
    ttest_df.to_excel('ttest_%i.xlsx' % lb)
    ttest_pval_df.to_excel('ttest_pval_%i.xlsx' % lb)
    rnksm_df.to_excel('rnksm_%i.xlsx' % lb)
    rnksm_pval_df.to_excel('rnksm_pval_%i.xlsx' % lb)

# Element-wise mean of each quantity over all labels. `.values` replaces
# `.as_matrix()`, which is gone from current pandas.
ttest_df_avg = pd.DataFrame(np.array([df.values for df in ttest_dfs]).mean(axis=0), index=METHODS, columns=METHODS)
ttest_pval_df_avg = pd.DataFrame(np.array([df.values for df in ttest_pval_dfs]).mean(axis=0), index=METHODS, columns=METHODS)
rnksm_df_avg = pd.DataFrame(np.array([df.values for df in rnksm_dfs]).mean(axis=0), index=METHODS, columns=METHODS)
rnksm_pval_df_avg = pd.DataFrame(np.array([df.values for df in rnksm_pval_dfs]).mean(axis=0), index=METHODS, columns=METHODS)
ttest_df_avg.to_excel('ttest_avg.xlsx')
ttest_pval_df_avg.to_excel('ttest_pval_avg.xlsx')
rnksm_df_avg.to_excel('rnksm_avg.xlsx')
rnksm_pval_df_avg.to_excel('rnksm_pval_avg.xlsx')