In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import os
try:
    # cPickle is the faster pickle implementation on Python 2
    import cPickle as pickle
except ImportError:
    import pickle
import scipy.stats as stats
import sys
In [ ]:
# load text data
#fname = 'bayes_neuro_np794_nq788_d300.p'
#fname = 'bayes_neuro_np794_nq788_d300_verb.p'
#fname = 'bayes_neuro_np794_nq788_d300_random_verb.p'
#fname = 'bayes_neuro_np794_nq788_d300_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400_random_verb.p'
#fname = 'bayes_neuro_np794_nq788_d800_random_verb.p'
fname = 'bayes_neuro_np794_nq788_d2000_random_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400.p'
np.random.seed(2984)
fpath = glo.data_file(fname)
# pickle files should be opened in binary mode
with open(fpath, 'rb') as f:
    loaded = pickle.load(f)
#X = np.sign(loaded['P'])
#Y = np.sign(loaded['Q'])
X = loaded['P']
Y = loaded['Q']
terms = loaded['words']
# Want X, Y to have the same sample size
n_min = min(X.shape[0], Y.shape[0])
X = X[:n_min, :]
Y = Y[:n_min, :]
assert(X.shape[0] == Y.shape[0])
# add some noise
#X = X + np.random.randn(X.shape[0], X.shape[1])*1e-2
#Y = Y + np.random.randn(Y.shape[0], Y.shape[1])*1e-2
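In [ ]:
# Quick sanity check of the loaded samples (a minimal sketch; it assumes the
# pickled arrays store documents as rows and vocabulary terms as columns).
print('X (P sample) shape:', X.shape)
print('Y (Q sample) shape:', Y.shape)
print('number of terms:', len(terms))
print('first few terms:', terms[:5])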
In [ ]:
# total L1 mass of each term (column) across all documents in X
np.sum(np.abs(X), 0)
In [ ]:
# show the document-by-term matrix of the X (P) sample
plt.figure(figsize=(3, 6))
plt.imshow(X, interpolation='none')
plt.ylabel('Docs')
plt.xlabel('Terms')
In [ ]:
# wrap the paired samples and split into training/test sets
seed = 24
tst_data = data.TSTData(X, Y)
ss = data.SSResample(tst_data)  # resampling sample source (not used below)
tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=seed+1)
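In [ ]:
# Report the sizes of the train/test splits. tr and te are TSTData objects;
# xy() returns the two samples as arrays (the same accessor is used below).
xtr_split, ytr_split = tr.xy()
xte_split, yte_split = te.xy()
print('train sample sizes:', xtr_split.shape[0], ytr_split.shape[0])
print('test sample sizes:', xte_split.shape[0], yte_split.shape[0])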
In [ ]:
alpha = 0.01
op = {'n_test_locs': 1, 'seed': seed+11, 'max_iter': 500,
      'batch_proportion': 1.0, 'locs_step_size': 50.0,
      'gwidth_step_size': 0.1, 'tol_fun': 1e-4}
# optimize on the training set
test_locs, gwidth, info = tst.MeanEmbeddingTest.optimize_locs_width(tr, alpha, **op)
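In [ ]:
# Baseline for comparison (a sketch, not part of the original analysis):
# instead of optimizing the test location and Gaussian width, draw locations
# from two fitted Gaussians and set the width by the median heuristic.
# Assumes util.meddistance is available in freqopttest.util and that gwidth
# is the squared bandwidth; the optimized parameters above are left untouched.
rand_locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, op['n_test_locs'], seed=seed+5)
med = util.meddistance(tr.stack_xy(), 1000)
met_rand = tst.MeanEmbeddingTest(rand_locs, med**2, alpha)
met_rand.perform_test(te)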
In [ ]:
plt.plot(info['obj_values'])
plt.xlabel('iteration')
plt.ylabel('Objective')
In [ ]:
# mean term-weight vectors of the two training samples and of the pooled sample
xtr, ytr = tr.xy()
xmean = np.mean(xtr, 0)
ymean = np.mean(ytr, 0)
mean = np.mean(tr.stack_xy(), 0)
xm_norm = np.abs(xmean)/np.linalg.norm(xmean, ord=1)
ym_norm = np.abs(ymean)/np.linalg.norm(ymean, ord=1)
m_norm = np.abs(mean)/np.linalg.norm(mean, ord=1)
plt.plot(mean)
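In [ ]:
# Directly compare the two samples: terms where the L1-normalized mean
# weights of P and Q differ most are natural candidates for what the learned
# test location should emphasize (uses xm_norm and ym_norm from above).
mean_diff = np.abs(xm_norm - ym_norm)
dind = np.argsort(-mean_diff)
plt.plot(mean_diff[dind])
plt.xlabel('Terms (sorted by difference)')
plt.ylabel('|difference of normalized means|')
print(terms[dind][:10])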
In [ ]:
# interpret the learned test location: rank terms by absolute L1-normalized weight
t = test_locs[0]
#t_norm = t
t_norm = t/np.linalg.norm(t, ord=1)
score = np.abs(t_norm)
#score = (t_norm)
sind = np.argsort(-score)
plt.plot(score[sind])
In [ ]:
# print the terms sorted by importance, ten per line
for i, w in enumerate(terms[sind]):
    if i % 10 == 0:
        print()
    print(w, end=' ')
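In [ ]:
# Bar chart of the k most heavily weighted terms of the learned test
# location (k is an arbitrary choice for illustration).
k = 20
plt.figure(figsize=(10, 3))
plt.bar(np.arange(k), score[sind][:k])
plt.xticks(np.arange(k), terms[sind][:k], rotation=70)
plt.ylabel('|normalized weight|')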
In [ ]:
# run the ME test on the held-out test set with the optimized test locations and Gaussian width
#test_locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, 2, seed=28)
#gwidth = 10
met = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
met.perform_test(te)
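In [ ]:
# Sanity check under H0 (a sketch): split the X sample into two halves and
# run the same optimize-then-test pipeline. With no real difference between
# the halves, H0 should be rejected only about a fraction alpha of the time.
nh = X.shape[0] // 2
null_data = data.TSTData(X[:nh], X[nh:2*nh])
null_tr, null_te = null_data.split_tr_te(tr_proportion=0.5, seed=seed+3)
null_locs, null_gwidth, _ = tst.MeanEmbeddingTest.optimize_locs_width(null_tr, alpha, **op)
met_null = tst.MeanEmbeddingTest(null_locs, null_gwidth, alpha)
met_null.perform_test(null_te)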
In [ ]: