In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import os
try:
    # cPickle is the faster C implementation available on Python 2
    import cPickle as pickle
except ImportError:
    import pickle
import scipy.stats as stats
import sys

In [ ]:
# load text data
#fname = 'bayes_neuro_np794_nq788_d300.p'
#fname = 'bayes_neuro_np794_nq788_d300_verb.p'
#fname = 'bayes_neuro_np794_nq788_d300_random_verb.p'
#fname = 'bayes_neuro_np794_nq788_d300_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400_random_verb.p'
#fname = 'bayes_neuro_np794_nq788_d800_random_verb.p'
fname = 'bayes_neuro_np794_nq788_d2000_random_noun.p'

#fname = 'bayes_neuro_np794_nq788_d400.p'

np.random.seed(2984)
fpath = glo.data_file(fname)
with open(fpath, 'rb') as f:
    loaded = pickle.load(f)
    
#X = np.sign(loaded['P']) 
#Y = np.sign(loaded['Q']) 
X = loaded['P']
Y = loaded['Q']
terms = loaded['words']
# Truncate so that X and Y have the same sample size
n_min = min(X.shape[0], Y.shape[0])
X = X[:n_min, :]
Y = Y[:n_min, :]
assert X.shape[0] == Y.shape[0]

# add some noise
#X = X + np.random.randn(X.shape[0], X.shape[1])*1e-2
#Y = Y + np.random.randn(Y.shape[0], Y.shape[1])*1e-2
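
A quick sanity check on the loaded matrices (a minimal sketch using only the arrays defined above): each row is a document and each column is a term.

In [ ]:
# sample sizes and vocabulary size
print('X shape:', X.shape)
print('Y shape:', Y.shape)
print('number of terms:', len(terms))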

In [ ]:
# total absolute weight of each term across all documents in X
np.sum(np.abs(X), 0)

In [ ]:
plt.figure(figsize=(3, 6))
# visualize the document-term matrix of sample X
plt.imshow(X, interpolation='none')
plt.ylabel('Docs')
plt.xlabel('Terms')

In [ ]:
seed = 24

tst_data = data.TSTData(X, Y)
# sample source that resamples from the loaded data
ss = data.SSResample(tst_data)
# split into a training set (for parameter optimization) and a held-out test set
tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=seed+1)
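
To confirm the split, print the training and test sample sizes (a quick sketch; it assumes te exposes xy() just like tr, which is used further below).

In [ ]:
# sizes of the training and test splits
xtr_, ytr_ = tr.xy()
xte_, yte_ = te.xy()
print('train: X %s, Y %s' % (xtr_.shape, ytr_.shape))
print('test:  X %s, Y %s' % (xte_.shape, yte_.shape))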

Parameter optimization


In [ ]:
alpha = 0.01  # significance level

# optimization settings for the ME test: one test location, full-batch gradient steps
op = {'n_test_locs': 1, 'seed': seed+11, 'max_iter': 500,
      'batch_proportion': 1.0, 'locs_step_size': 50.0,
      'gwidth_step_size': 0.1, 'tol_fun': 1e-4}
# optimize the test locations and Gaussian width on the training set
test_locs, gwidth, info = tst.MeanEmbeddingTest.optimize_locs_width(tr, alpha, **op)

In [ ]:
plt.plot(info['obj_values'])
plt.xlabel('iteration')
plt.ylabel('Objective')
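
For reference, the optimized Gaussian width and the objective value at the last iteration can be printed (a minimal sketch relying only on gwidth and info['obj_values'] from the optimization above).

In [ ]:
# report the optimized kernel width and the final objective value
print('optimized Gaussian width: %.3g' % gwidth)
print('final objective: %.3g' % info['obj_values'][-1])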

In [ ]:
xtr, ytr = tr.xy()
# per-feature means of each sample and of the pooled sample
xmean = np.mean(xtr, 0)
ymean = np.mean(ytr, 0)
mean = np.mean(tr.stack_xy(), 0)

# L1-normalized absolute means (relative term weights)
xm_norm = np.abs(xmean)/np.linalg.norm(xmean, ord=1)
ym_norm = np.abs(ymean)/np.linalg.norm(ymean, ord=1)
m_norm = np.abs(mean)/np.linalg.norm(mean, ord=1)

plt.plot(mean)
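
xm_norm and ym_norm are computed above but not plotted; a quick sketch comparing the L1-normalized mean profiles of the two samples.

In [ ]:
# compare the normalized mean term weights of X and Y
plt.plot(xm_norm, label='X mean (L1-normalized)')
plt.plot(ym_norm, label='Y mean (L1-normalized)')
plt.xlabel('Terms')
plt.legend()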

In [ ]:
# examine the learned test location
t = test_locs[0]
#t_norm = t
t_norm = t/np.linalg.norm(t, ord=1)
# score each term by the magnitude of its (L1-normalized) coordinate
score = np.abs(t_norm)
#score = (t_norm)
sind = np.argsort(-score)
plt.plot(score[sind])

In [ ]:
# print the terms sorted by score, 10 per line
for i, w in enumerate(terms[sind]):
    if i % 10 == 0:
        print()
    print(w, end=' ')

Two-sample test


In [ ]:
# run the ME test on the held-out test set with the optimized locations and Gaussian width
#test_locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, 2, seed=28)
#gwidth = 10
met = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
met.perform_test(te)
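
perform_test returns a dictionary summarizing the outcome. A minimal sketch of inspecting it, assuming the dictionary contains 'pvalue' and 'h0_rejected' keys (as in freqopttest's ME test):

In [ ]:
# inspect the test result; the key names are an assumption about freqopttest's return value
result = met.perform_test(te)
print('p-value: %.4g' % result['pvalue'])
print('reject H0 at alpha = %.2g: %s' % (alpha, result['h0_rejected']))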
