Process results of experiments on text data


In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.ex.exglobal as exglo
from freqopttest.ex.ex4_text import load_nips_TSTData
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import freqopttest.plot as plot
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import sys

In [ ]:
# Result files of experiment 4, one per pair of paper categories.
# All file names share one template; they differ only in the category pair and
# in the number that follows "nma". The ME variant tag and the number that
# follows "rs" pick which batch of result files is used.
# (variant, rs) combinations that have been used with this notebook:
#   ('me3', 1000), ('me3', 500), ('me6', 500), ('me6', 200)
result_me_tag = 'me6'
result_rs = 500

result_fname_template = 'ex4-%s_d2000_rnoun-%s_J1_rs%d_nma%d_d2000_a0.010_trp0.50.p'

# (category pair, number following "nma" in the file name)
result_pairs = [
    ('bayes_bayes', 430),     #0
    ('bayes_deep', 433),      #1
    ('bayes_learning', 276),  #2
    ('bayes_neuro', 788),     #3
    ('deep_learning', 299),   #4
    ('neuro_learning', 293),  #5
]

result_fnames = [
    result_fname_template % (pair, result_me_tag, result_rs, nma)
    for pair, nma in result_pairs
]

# Row labels for the printed tables, in the same order as result_fnames.
fname_labels = [
    'Bayes-Bayes',
    'Bayes-Deep',
    'Bayes-Learn',
    'Bayes-Neuro',
    'Learn-Deep',
    'Learn-Neuro'
]

# Single-file alternative, kept for reference:
#result_fnames = ['ex4-bayes_bayes_d2000_rnoun-me4_J1_rs500_nma430_d2000_a0.010_trp0.50.p']

# The file examined in detail by the cells below: entry 3 of result_fnames
# (the bayes_neuro file, labelled 'Bayes-Neuro' in fname_labels).
fname = result_fnames[3]

# Number of repetitions read per method; also the default `reps` of the
# helper functions defined further down.
repeats = 500
print('set #repeats to %d'%repeats)

# Experiment number passed to glo.ex_load_result.
ex = 4
results = glo.ex_load_result(ex, fname)

In [ ]:
data_fname = results['data_fname']
#labels = ['ME-full', 'ME-opt-0.5', 'ME-full', 'ME-gw-opt', 
#        'ME-grid', 'SCF-full', 'SCF-full', 'SCF-gw-opt', 'SCF-grid',
#        'MMD-lin', '$T^2$']

# Method whose per-trial results are examined in the cells below.
method = 'ME-full'
method_job_funcs = results['method_job_funcs']
func_names = [f.__name__ for f in method_job_funcs]
func2labels = exglo.get_func2label_map()
# Display labels of the methods that have an entry in func2labels.
method_labels = [func2labels[f] for f in func_names if f in func2labels]

# Column of `method` in results['results']. That array is indexed with one
# column per entry of method_job_funcs (see methods_powers), so the position
# must be taken from the unfiltered function list: an index into the filtered
# method_labels would be shifted whenever a function has no label.
# Raises ValueError if no function maps to `method`.
labels_by_column = [func2labels[f] if f in func2labels else None
                    for f in func_names]
method_index = labels_by_column.index(method)

results0 = results['results']
# Per-trial result dicts of the selected method.
method_results = results0[:, method_index]

# Significance level used to turn p-values into rejections.
alpha = 0.01
reps = len(method_results)

In [ ]:
def methods_powers(R, reps=repeats):
    """Compute the test power of every method stored in a result dict.

    R: a loaded result dictionary; R['method_job_funcs'] gives the number of
        methods and R['results'] is indexed as [trial, method].
    reps: number of leading trials to use for each method.

    Returns a 1d numpy array with one entry per method: the fraction of the
    first `reps` trials whose p-value is below the notebook-level `alpha`.
    """
    n_methods = len(R['method_job_funcs'])
    trial_results = R['results']

    def power_of(col):
        # p-values of one method over the first `reps` trials
        column = trial_results[:, col]
        pvalues = np.array(
            [column[trial]['test_result']['pvalue'] for trial in range(reps)])
        return np.mean(pvalues < alpha)

    powers = np.zeros(n_methods)
    for col in range(n_methods):
        powers[col] = power_of(col)
    return powers

def methods_runtimes(R, reps=repeats):
    """Compute the mean runtime of every method stored in a result dict.

    R: a loaded result dictionary; R['method_job_funcs'] gives the number of
        methods and R['results'] is indexed as [trial, method].
    reps: number of leading trials to average over for each method.

    Returns a 1d numpy array with one entry per method: the mean of the
    'time_secs' field over the first `reps` trials.
    """
    n_methods = len(R['method_job_funcs'])
    trial_results = R['results']

    def mean_time_of(col):
        # recorded times of one method over the first `reps` trials
        column = trial_results[:, col]
        secs = np.array([column[trial]['time_secs'] for trial in range(reps)])
        return np.mean(secs)

    mean_times = np.zeros(n_methods)
    for col in range(n_methods):
        mean_times[col] = mean_time_of(col)
    return mean_times

In [ ]:
# Powers and mean runtimes of all methods for the single file loaded above.
met_pows, met_times = methods_powers(results), methods_runtimes(results)

print('fname: %s'%fname)
print(method_labels)

print('test powers: ')
print(met_pows)

print('runtimes: ')
print(met_times)

In [ ]:
def table_powers(result_fnames, fname_labels, reps=repeats):
    """Print the test powers of all methods, one table row per result file.

    result_fnames: result file names to load with glo.ex_load_result.
    fname_labels: row labels, paired with result_fnames by position.
    reps: number of trials passed on to methods_powers.

    The 'method_labels' entry of the last loaded file is printed first. Each
    row then holds the label, the test-set size and the powers (to three
    decimals, leading zero stripped) joined by ' & ', followed by a separate
    line carrying a LaTeX row terminator.
    """
    test_sizes = []
    power_rows = []
    for fname in result_fnames:
        results = glo.ex_load_result(ex, fname)
        te_proportion = 1 - results['tr_proportion']
        # Not used below; the lookup is kept from the original flow.
        data_fname = results['data_fname']
        # The sample size comes from the result file itself, so the data set
        # does not have to be loaded.
        n = results['n']
        test_sizes.append(int(te_proportion*n))
        power_rows.append(methods_powers(results, reps))

    # Header: labels as stored in the last file that was loaded.
    print(results['method_labels'])
    for row_label, powers, nte in zip(fname_labels, power_rows, test_sizes):
        cells = [row_label, '%d'%nte]
        cells.extend(('%.3f'%p).lstrip('0') for p in powers)
        print(' & '.join(cells))
        print(' \\\\ \n')

#------------
def table_runtimes(result_fnames, fname_labels, reps=repeats):
    """Print the mean runtimes of all methods, one table row per result file.

    result_fnames: result file names to load with glo.ex_load_result.
    fname_labels: row labels, paired with result_fnames by position.
    reps: number of trials passed on to methods_runtimes.

    The 'method_labels' entry of the last loaded file is printed first. Each
    row then holds the label, the test-set size and the runtimes (four
    significant digits, leading zero stripped) joined by ' & ', followed by a
    separate line carrying a LaTeX row terminator.
    """
    test_sizes = []
    runtime_rows = []
    for fname in result_fnames:
        results = glo.ex_load_result(ex, fname)
        te_proportion = 1 - results['tr_proportion']
        # Not used below; the lookup is kept from the original flow.
        data_fname = results['data_fname']
        # The sample size comes from the result file itself, so the data set
        # does not have to be loaded.
        n = results['n']
        test_sizes.append(int(te_proportion*n))
        runtime_rows.append(methods_runtimes(results, reps))

    # Header: labels as stored in the last file that was loaded.
    print(results['method_labels'])
    for row_label, runtimes, nte in zip(fname_labels, runtime_rows, test_sizes):
        cells = [row_label, '%d'%nte]
        # format mini-language reference: https://pyformat.info/
        cells.extend(('{0:.4g}'.format(t)).lstrip('0') for t in runtimes)
        print(' & '.join(cells))
        print(' \\\\ \n')

In [ ]:
# Power table over every result file, using `repeats` trials per method.
table_powers(result_fnames, fname_labels, reps=repeats)

In [ ]:
# Runtime table over every result file, using `repeats` trials per method.
table_runtimes(result_fnames, fname_labels, reps=repeats)

Examine learned discriminative words


In [ ]:
# load terms
# The loaded object is bound to `text_data` rather than `data`, because the
# import cell already uses `data` for the freqopttest.data module.
text_data = glo.load_data_file(data_fname)
# Vocabulary entries; indexed with an integer array further down.
terms = text_data['words']

In [ ]:
# Gather, for the selected method, the p-value, the test statistic and the
# test object of every trial.
pvals = np.zeros(reps)
test_methods = []
locs = []
test_stats = np.zeros(reps)
# range instead of xrange: xrange does not exist on Python 3, while range
# works on both Python 2 and 3.
for r in range(reps):
    test = method_results[r]['test_method']
    test_methods.append(test)
    test_result = method_results[r]['test_result']
    pvals[r] = test_result['pvalue']
    test_stats[r] = test_result['test_stat']

In [ ]:
# Fraction of trials rejected at level alpha, shown in the title of a plot of
# the per-trial p-values.
test_power = np.mean(pvals < alpha)
plt.plot(pvals)
plt.ylabel('pvalue')
plt.xlabel('trial')
plt.title('test power: %.3g'%(test_power))

In [ ]:
# Histogram of the (non-NaN) test statistics against a chi-square density.
# J is the chi-square degrees of freedom used for the reference curve
# (presumably the number of test locations, cf. "J1" in the result file
# names; TODO confirm).
J = 1
dom = np.linspace(stats.chi2.ppf(0.2, J), stats.chi2.ppf(0.99, J), 300)
#weights = np.ones_like(test_stats)/len(test_stats)
nonan = test_stats[np.logical_not(np.isnan(test_stats))]
# density=True replaces the `normed` keyword, which newer matplotlib releases
# no longer accept.
ns, bins, patches = plt.hist(nonan, bins=30, density=True)
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
plt.plot(dom, stats.chi2.pdf(dom, J), 'r-', linewidth=2, label=r'$\chi^2(%d)$'%J)
plt.xlabel('test stats')
plt.ylabel('normalized frequencies')
plt.legend()

In [ ]:
from __future__ import print_function
# learned test locations from all trials. reps x d
if method == 'ME-full':
    k = 5
    sort_sign = -1
    locs = np.array( [test_methods[r].test_locs[0] for r in range(reps)] )
    scores = np.array([ np.abs(row) for row in locs])
    topk_ind = np.array([ np.argsort(sort_sign*s)[:k] for s in scores])
    ind_count = np.bincount(topk_ind.flatten())
    eff_wind = np.where(ind_count)[0]
    eff_count = ind_count[eff_wind]

    # sort by occurrence frequencies in descending order
    sind = np.argsort(-eff_count)
    seff_wind = eff_wind[sind]
    seff_count = eff_count[sind]
    plt.stem(seff_count)
    for t in terms[seff_wind]:
        #print('"%s", '%t, end='')
        print('%s, '%t, end='')

In [ ]: