In [ ]:
#Visualize samples from the model: KL(psi(x)||psi*) diagnostics vs. rare words
import sys, os, glob
from collections import OrderedDict
sys.path.append('../../')  # make the project-root packages (utils, optvae*) importable
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
# Global matplotlib styling for publication-quality (LaTeX-rendered) figures
mpl.rcParams['lines.linewidth']=5
mpl.rcParams['lines.markersize']=15
mpl.rcParams['text.usetex']=True
# NOTE(review): 'text.latex.unicode' was removed in matplotlib >= 3.0 and will
# raise there; this notebook presumably pins an older matplotlib -- confirm.
mpl.rcParams['text.latex.unicode']=True
mpl.rcParams['font.family'] = 'serif' 
mpl.rcParams['font.serif'] = 'Times New Roman'
mpl.rcParams['text.latex.preamble']= ['\\usepackage{amsfonts}','\\usepackage{amsmath}']
mpl.rcParams['font.size'] = 30
mpl.rcParams['axes.labelsize']=30
mpl.rcParams['legend.fontsize']=30
#http://stackoverflow.com/questions/22408237/named-colors-in-matplotlib
from utils.misc import getConfigFile, readPickle, loadHDF5, getUniqueIDFromParams
from optvaeutils.viz import getName
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset

In [ ]:
# Variational-parameter dumps from the evaluateIF experiment on the held-out
# and training splits. Keys used downstream: 'data', 'mu_0', 'logcov_0',
# 'mu_f', 'logcov_f' (presumably initial vs. final/optimized params -- the
# printed keys() confirm what is actually stored).
evalData = loadHDF5('../../expt/evaluateIF_params/wikicorp-pl-2-finopt-if_eval.h5')
trainData = loadHDF5('../../expt/evaluateIF_params/wikicorp-pl-2-finopt-if_train.h5')
print trainData.keys()

In [ ]:
# Load the raw Wikipedia corpus; dset['train'] is used below to compute
# per-word document frequencies.
from optvaedatasets.load import loadDataset as loadDataset_OVAE
dset = loadDataset_OVAE('wikicorp')

In [ ]:
# Per-word occurrence statistic across training documents, as a percentage.
# NOTE(review): mean(0)*100 equals "% of documents containing the word" only
# if dset['train'] is binarized (0/1); with raw counts it is 100*avg count --
# confirm against the dataset loader.
sums_all = np.array(dset['train'].mean(0)).ravel()*100
#Rare if occurs in less than 5% of documents
widx_rare_all= np.where(sums_all<5)[0]
print sums_all.shape, len(widx_rare_all)

In [ ]:
# Sorted per-word occurrence percentages for Wikipedia, with the 5% rarity
# threshold marked; saved as a standalone PDF figure.
plt.figure(figsize=(8,9))
plt.plot(np.sort(sums_all)[::-1])  # descending % occurrence per word
#plt.xticks(np.arange(0,20002,4000), rotation='vertical')
plt.axhline(5,ls='--',lw=2,)  # threshold used above to define "rare" words
plt.ylabel('\\% of occurrence in documents')  # fixed typo: was 'occurence'
plt.title('Wikipedia')
plt.xlabel('Word Indices')
plt.savefig('wiki-sparse.pdf',bbox_inches='tight')

In [ ]:
from scipy.stats import spearmanr, pearsonr
from matplotlib.ticker import NullFormatter
def KL(mu_0, cov_0, mu_f, cov_f):
    """Row-wise KL divergence KL(N(mu_0, cov_0) || N(mu_f, cov_f)) between
    diagonal Gaussians.

    All four arguments are 2-D arrays of shape (n_docs, latent_dim) holding
    means and (positive) variances; returns a 1-D array of length n_docs.
    """
    assert np.all(cov_0>0.),'should be positive'
    assert np.all(cov_f>0.),'should be positive'
    log_var_ratio = np.log(cov_f) - np.log(cov_0)
    mean_term     = (mu_f - mu_0) ** 2 / cov_f
    var_term      = cov_0 / cov_f
    per_dim_kl    = log_var_ratio - 1. + var_term + mean_term
    return 0.5 * per_dim_kl.sum(1)
def normalize(v):
    """Min-max normalize an array into [0, 1].

    Returns (v - min) / (max - min). For a constant input the original code
    divided by zero (yielding NaN/inf); now returns an all-zeros array of the
    same shape instead, which plots sensibly.
    """
    vmx, vmn = v.max(), v.min()
    span = vmx - vmn
    if span == 0:
        return np.zeros_like(v, dtype=float)
    return (v-vmn)/span

In [ ]:
# Per-document KL between the initial variational params (mu_0, exp(logcov_0))
# and the final/optimized ones (mu_f, exp(logcov_f)) on the TRAIN split,
# then the count of rare-vocabulary words per document.
data        = trainData['data']
klvec_train = KL(trainData['mu_0'],np.exp(trainData['logcov_0']),trainData['mu_f'],np.exp(trainData['logcov_f']))
# keep the first 20k documents -- all downstream plots assume 20000 points
data        = data[:20000]
klvec_train = klvec_train[:20000]

#sums        = np.array(data.sum(0)).ravel()
#widx_rare   = np.where(sums<1000)[0]
# rare-word count per document, using the corpus-wide rare index set above
n_rare_words_train = data[:,widx_rare_all].sum(1)
print n_rare_words_train, len(widx_rare_all)

In [ ]:
# Same computation as the train cell above, on the HELD-OUT split: per-document
# KL between initial and optimized variational params, plus rare-word counts.
data              = evalData['data']
klvec_eval        = KL(evalData['mu_0'],np.exp(evalData['logcov_0']),evalData['mu_f'],np.exp(evalData['logcov_f']))

# keep the first 20k documents -- all downstream plots assume 20000 points
data              = data[:20000]
klvec_eval        = klvec_eval[:20000]
#sums              = np.array(data.sum(0)).ravel()
#widx_rare         = np.where(sums<1000)[0]
n_rare_words_eval = data[:,widx_rare_all].sum(1)
print n_rare_words_eval, len(widx_rare_all)

In [ ]:
fig,axlist = plt.subplots(1,2,figsize=(10,4))

ax = axlist.ravel()[0]
idx_s = np.argsort(klvec_train)[::-1]
print idx_s.shape
ax.plot(np.arange(20000),normalize(klvec_train[idx_s]),alpha=0.5,marker='*',lw=0.1,ms=5)
ax.scatter(np.arange(20000),normalize(n_rare_words_train[idx_s]),alpha=0.03,s=5,c='r')
print spearmanr(n_rare_words_train, b=klvec_train)
ax.set_ylabel('Normalized Values')
ax.set_xlabel('Train')
    
ax = axlist.ravel()[1]
idx_s = np.argsort(klvec_eval)[::-1]
ax.plot(np.arange(20000),normalize(klvec_eval[idx_s]),alpha=0.5,marker='*',lw=0.1,ms=5,label='$\\text{KL}(\\psi(x)||\\psi^*)$')
ax.scatter(np.arange(20000),normalize(n_rare_words_eval[idx_s]),alpha=0.03,s=5,c='r',label='Rare Words Counts')
leg = ax.legend(bbox_to_anchor=(0.95, 1.4), columnspacing=0.1, ncol=2, markerscale=4)
for lh in leg.legendHandles: 
    lh.set_alpha(1)
print spearmanr(n_rare_words_eval, b=klvec_eval)
ax.set_xlabel('Held-out')

for ax in axlist:
    ax.set_yticks(np.arange(0,1.1,0.2))
    plt.sca(ax)
    plt.xticks(np.arange(0,20001,4000),[str(k)+'k' for k in np.arange(0,20001,4000)/1000],rotation=45)
#plt.show()
#plt.savefig('normalized_kl_vs_docs.png',bbox_inches='tight')

In [ ]:
fig,ax = plt.subplots(1,1,figsize=(8,8))

#ax = axlist.ravel()[0]
idx_s = np.argsort(klvec_train)[::-1]
print idx_s.shape
ax.plot(np.arange(20000),normalize(klvec_train[idx_s]),alpha=0.5,marker='*',lw=0.1,ms=5)
ax.scatter(np.arange(20000),normalize(n_rare_words_train[idx_s]),alpha=0.03,s=5,c='r')
print spearmanr(n_rare_words_train, b=klvec_train)
ax.set_ylabel('Normalized Values')
ax.set_xlabel('Train')

#ax = plt.gca()
axins = zoomed_inset_axes(ax, 2, loc=1) # zoom = 6
axins.plot(np.arange(20000),normalize(klvec_train[idx_s]),alpha=0.5,marker='*',lw=0.1,ms=5,label='$\\text{KL}(\\psi(x)||\\psi^*)$')
axins.scatter(np.arange(20000),normalize(n_rare_words_train[idx_s]),alpha=0.03,s=5,c='r',label='Rare Words')
leg = axins.legend(bbox_to_anchor=(0.15, 0.5), columnspacing=0.1, ncol=1, markerscale=4)
for lh in leg.legendHandles: 
    lh.set_alpha(1)
axins.set_xlim(0, 8000)
axins.set_ylim(0, 0.4)
axins.set_xticks([])
axins.set_yticks([])
mark_inset(ax, axins, loc1=2, loc2=4, fc="none", ec="k",lw=1)

ax.set_yticks(np.arange(0,1.1,0.2))
plt.sca(ax)
plt.xticks(np.arange(0,20001,4000),[str(k)+'k' for k in np.arange(0,20001,4000)/1000],rotation=45)
#plt.show()
plt.savefig('normalized_kl_vs_docs-train.png',bbox_inches='tight')

In [ ]:
fig,ax = plt.subplots(1,1,figsize=(8,8))

#ax = axlist.ravel()[0]
idx_s = np.argsort(klvec_eval)[::-1]
print idx_s.shape
ax.plot(np.arange(20000),normalize(klvec_eval[idx_s]),alpha=0.5,marker='*',lw=0.1,ms=5)
ax.scatter(np.arange(20000),normalize(n_rare_words_eval[idx_s]),alpha=0.03,s=5,c='r')
print spearmanr(n_rare_words_train, b=klvec_train)
ax.set_ylabel('Normalized Values')
ax.set_xlabel('Held-out')

#ax = plt.gca()
axins = zoomed_inset_axes(ax, 2, loc=1) # zoom = 6
axins.plot(np.arange(20000),normalize(klvec_eval[idx_s]),alpha=0.5,marker='*',lw=0.1,ms=5,label='$\\text{KL}(\\psi(x)||\\psi^*)$')
axins.scatter(np.arange(20000),normalize(n_rare_words_eval[idx_s]),alpha=0.03,s=5,c='r',label='Rare Words')
leg = axins.legend(bbox_to_anchor=(0.15, 0.5), columnspacing=0.1, ncol=1, markerscale=4)
for lh in leg.legendHandles: 
    lh.set_alpha(1)
axins.set_xlim(0, 8000)
axins.set_ylim(0, 0.4)
axins.set_xticks([])
axins.set_yticks([])
mark_inset(ax, axins, loc1=2, loc2=4, fc="none", ec="k",lw=1)

ax.set_yticks(np.arange(0,1.1,0.2))
plt.sca(ax)
plt.xticks(np.arange(0,20001,4000),[str(k)+'k' for k in np.arange(0,20001,4000)/1000],rotation=45)
#plt.show()
plt.savefig('normalized_kl_vs_docs-eval.png',bbox_inches='tight')

In [ ]:
# Scatter of per-document KL vs. number of rare words for both splits, with
# the Spearman rho embedded in the legend labels and a zoomed inset near the
# origin; saved as kl_n_rare_words.png.
evall = 'Held-out\n$\\rho = $ %.2f'%(spearmanr(n_rare_words_eval, b=klvec_eval).correlation)
trainl = 'Train\n$\\rho = $ %.2f'%(spearmanr(n_rare_words_train, b=klvec_train).correlation)

plt.figure(figsize=(8,8))
plt.scatter(n_rare_words_eval,klvec_eval,s=6,c='b',marker='s',alpha=0.5,label=evall)
# NOTE(review): edgecolor=None means "use the default edge color", not "no
# edge"; if edge-free markers were intended, use edgecolor='none' -- confirm.
plt.scatter(n_rare_words_train,klvec_train,s=6,c='darkseagreen',edgecolor = None,alpha=0.5,label=trainl)
# floor division keeps integral 'k' labels under Python 3 too (same in Py2)
plt.xticks(np.arange(0,4001,1000),[str(k//1000)+'k' for k in np.arange(0,4001,1000)],rotation='45')
plt.yticks(np.arange(0,400,100))
plt.xlim([0,4000])
plt.ylim([0,400])
plt.xlabel('Number of Rare Words')
plt.ylabel('$\\text{KL}(\\psi(x)||\\psi_{\\text{opt}})$')
plt.legend(loc='best', bbox_to_anchor=(.45, 0.6),ncol=1,columnspacing=0.01, markerscale=6, prop={'size': 30},frameon=True)

ax = plt.gca()
axins = zoomed_inset_axes(ax, 2, loc=4) # zoom = 2, lower-right corner
# sub region of the original image
axins.scatter(n_rare_words_eval,klvec_eval,s=6,c='b',marker='s',alpha=0.5,label=evall)
axins.scatter(n_rare_words_train,klvec_train,s=6,c='darkseagreen',edgecolor = None,alpha=0.2,label=trainl)
axins.set_xlim(1, 1000)
axins.set_ylim(1, 100)
axins.set_xticks([])
axins.set_yticks([])
mark_inset(ax, axins, loc1=2, loc2=4, fc="none", ec="k",lw=1)

plt.savefig('kl_n_rare_words.png',bbox_inches='tight')

In [ ]: