In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import freqopttest.util as util
import freqopttest.data as data
import freqopttest.kernel as kernel
import freqopttest.tst as tst
import freqopttest.glo as glo
import os
try:
    # cPickle is the faster pickle implementation on Python 2
    import cPickle as pickle
except ImportError:
    import pickle
import scipy.stats as stats
import sys
In [ ]:
# load text data
#fname = 'bayes_neuro_np794_nq788_d300.p'
#fname = 'bayes_neuro_np794_nq788_d300_verb.p'
#fname = 'bayes_neuro_np794_nq788_d300_random_verb.p'
#fname = 'bayes_neuro_np794_nq788_d300_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400_random_verb.p'
#fname = 'bayes_neuro_np794_nq788_d800_random_verb.p'
fname = 'bayes_neuro_np794_nq788_d2000_random_noun.p'
#fname = 'bayes_neuro_np794_nq788_d400.p'
np.random.seed(2984)
fpath = glo.data_file(fname)
# pickle files should be opened in binary mode
with open(fpath, 'rb') as f:
    loaded = pickle.load(f)
#X = np.sign(loaded['P'])
#Y = np.sign(loaded['Q'])
X = loaded['P']
Y = loaded['Q']
terms = loaded['words']
# Want X, Y to have the same sample size
n_min = min(X.shape[0], Y.shape[0])
X = X[:n_min, :]
Y = Y[:n_min, :]
assert(X.shape[0] == Y.shape[0])
# add some noise
#X = X + np.random.randn(X.shape[0], X.shape[1])*1e-2
#Y = Y + np.random.randn(Y.shape[0], Y.shape[1])*1e-2
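In [ ]:
# Quick sanity check of the loaded samples (a minimal sketch; it assumes the
# pickled arrays store documents as rows and vocabulary terms as columns).
print('X (P sample) shape:', X.shape)
print('Y (Q sample) shape:', Y.shape)
print('number of terms:', len(terms))
print('first few terms:', terms[:5])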
In [ ]:
# total L1 mass of each term (column) across all documents in X
np.sum(np.abs(X), 0)
In [ ]:
# show the document-by-term matrix of the X (P) sample
plt.figure(figsize=(3, 6))
plt.imshow(X, interpolation='none')
plt.ylabel('Docs')
plt.xlabel('Terms')
In [ ]:
# wrap the paired samples and split into training/test sets
seed = 24
tst_data = data.TSTData(X, Y)
ss = data.SSResample(tst_data)  # resampling sample source (not used below)
tr, te = tst_data.split_tr_te(tr_proportion=0.5, seed=seed+1)
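In [ ]:
# Report the sizes of the train/test splits. tr and te are TSTData objects;
# xy() returns the two samples as arrays (the same accessor is used below).
xtr_split, ytr_split = tr.xy()
xte_split, yte_split = te.xy()
print('train sample sizes:', xtr_split.shape[0], ytr_split.shape[0])
print('test sample sizes:', xte_split.shape[0], yte_split.shape[0])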
In [ ]:
alpha = 0.01
op = {'n_test_locs': 1, 'seed': seed+11, 'max_iter': 500,
      'batch_proportion': 1.0, 'locs_step_size': 50.0,
      'gwidth_step_size': 0.1, 'tol_fun': 1e-4}
# optimize on the training set
test_locs, gwidth, info = tst.MeanEmbeddingTest.optimize_locs_width(tr, alpha, **op)
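In [ ]:
# Baseline for comparison (a sketch, not part of the original analysis):
# instead of optimizing the test location and Gaussian width, draw locations
# from two fitted Gaussians and set the width by the median heuristic.
# Assumes util.meddistance is available in freqopttest.util and that gwidth
# is the squared bandwidth; the optimized parameters above are left untouched.
rand_locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, op['n_test_locs'], seed=seed+5)
med = util.meddistance(tr.stack_xy(), 1000)
met_rand = tst.MeanEmbeddingTest(rand_locs, med**2, alpha)
met_rand.perform_test(te)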
In [ ]:
plt.plot(info['obj_values'])
plt.xlabel('iteration')
plt.ylabel('Objective')
In [ ]:
# mean term-weight vectors of the two training samples and of the pooled sample
xtr, ytr = tr.xy()
xmean = np.mean(xtr, 0)
ymean = np.mean(ytr, 0)
mean = np.mean(tr.stack_xy(), 0)
xm_norm = np.abs(xmean)/np.linalg.norm(xmean, ord=1)
ym_norm = np.abs(ymean)/np.linalg.norm(ymean, ord=1)
m_norm = np.abs(mean)/np.linalg.norm(mean, ord=1)
plt.plot(mean)
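In [ ]:
# Directly compare the two samples: terms where the L1-normalized mean
# weights of P and Q differ most are natural candidates for what the learned
# test location should emphasize (uses xm_norm and ym_norm from above).
mean_diff = np.abs(xm_norm - ym_norm)
dind = np.argsort(-mean_diff)
plt.plot(mean_diff[dind])
plt.xlabel('Terms (sorted by difference)')
plt.ylabel('|difference of normalized means|')
print(terms[dind][:10])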
In [ ]:
# interpret the learned test location: rank terms by absolute L1-normalized weight
t = test_locs[0]
#t_norm = t
t_norm = t/np.linalg.norm(t, ord=1)
score = np.abs(t_norm)
#score = (t_norm)
sind = np.argsort(-score)
plt.plot(score[sind])
In [ ]:
# print the terms sorted by importance, ten per line
for i, w in enumerate(terms[sind]):
    if i % 10 == 0:
        print()
    print(w, end=' ')
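In [ ]:
# Bar chart of the k most heavily weighted terms of the learned test
# location (k is an arbitrary choice for illustration).
k = 20
plt.figure(figsize=(10, 3))
plt.bar(np.arange(k), score[sind][:k])
plt.xticks(np.arange(k), terms[sind][:k], rotation=70)
plt.ylabel('|normalized weight|')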
In [ ]:
# run the ME test on the held-out test set with the optimized test locations and Gaussian width
#test_locs = tst.MeanEmbeddingTest.init_locs_2randn(tr, 2, seed=28)
#gwidth = 10
met = tst.MeanEmbeddingTest(test_locs, gwidth, alpha)
met.perform_test(te)
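In [ ]:
# Sanity check under H0 (a sketch): split the X sample into two halves and
# run the same optimize-then-test pipeline. With no real difference between
# the halves, H0 should be rejected only about a fraction alpha of the time.
nh = X.shape[0] // 2
null_data = data.TSTData(X[:nh], X[nh:2*nh])
null_tr, null_te = null_data.split_tr_te(tr_proportion=0.5, seed=seed+3)
null_locs, null_gwidth, _ = tst.MeanEmbeddingTest.optimize_locs_width(null_tr, alpha, **op)
met_null = tst.MeanEmbeddingTest(null_locs, null_gwidth, alpha)
met_null.perform_test(null_te)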
In [ ]: