In [ ]:
import inflect,os,sys
from utils.sparse_utils import saveSparseHDF5
from utils.misc import savePickle, saveHDF5, loadHDF5
p = inflect.engine() #used to map plural nouns to their singular form
suffix = '-split_hyphen'
In [2]:
import numpy as np
from utils.sparse_utils import loadSparseHDF5
In [3]:
#Load wikicorp vocab
assert os.path.exists('./wikicorp/WestburyLab.wikicorp.201004'+suffix+'.feat'),'Feature file not found'
with open('./wikicorp/WestburyLab.wikicorp.201004'+suffix+'.feat','r') as f:
    vocab = [k.strip().split(' ')[0] for k in f.readlines()]
print len(vocab)
vocab_arr = np.array(vocab)
In [4]:
#Map each vocabulary word to its singular form (e.g. flags -> flag); singular_noun returns False when the word is not a plural noun
vlist = []
w2idx = {}
for idx,v in enumerate(vocab):
    sv = p.singular_noun(v)
    if sv:
        vlist.append(sv)
        w2idx[sv] = idx
    else:
        vlist.append(v)
        w2idx[v] = idx
vocab_singular_only = set(vlist)
vocab_singular_list = np.array(vlist)
print vocab_singular_list.shape
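A quick illustrative check (not part of the original run) of how inflect's singular_noun behaves: it returns the singular form for plural nouns and False otherwise, which is why the loop above falls back to the original word.
In [ ]:
#Illustration only: singular_noun returns a string for plurals, False for non-plurals
print p.singular_noun('flags')  # -> 'flag'
print p.singular_noun('flag')   # -> False
print p.singular_noun('words')  # -> 'word'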
In [5]:
data = loadSparseHDF5('dataset','./wikicorp/WestburyLab.wikicorp.201004'+suffix+'.h5')
#Total count of each vocabulary word summed over all documents
counts = np.array(data.sum(0)).squeeze().astype(int)
print counts.shape
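A toy illustration (not from the original data) of why the np.array(...).squeeze() wrapping is needed: scipy's sparse .sum(0) returns a 1 x V numpy matrix rather than a flat array.
In [ ]:
from scipy.sparse import csr_matrix
toy = csr_matrix(np.array([[1,0,2],[0,3,0]]))
print toy.sum(0)                            # matrix([[1, 3, 2]])
print np.array(toy.sum(0)).squeeze().shape  # (3,)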
In [6]:
MAXVOCAB = 20000
#Indices of the MAXVOCAB most frequent words
sorted_idx = list(set(np.argsort(counts)[-MAXVOCAB:].tolist()))
print np.sort(counts[sorted_idx])
print len(sorted_idx),np.max(sorted_idx),np.min(sorted_idx),len(vocab)
#Add vectors corresponding to embedding words
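A minimal toy check (illustrative) of the top-K selection idiom used above: argsort returns indices in ascending order of count, so the last K entries index the K most frequent words.
In [ ]:
toy_counts = np.array([5, 1, 9, 3])
print np.argsort(toy_counts)[-2:]  # indices of the two largest counts -> [0 2]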
In [7]:
#Count the words that get double counted, i.e. subset words whose singular form is also in the subset; ~4k of these should be OK
subset_w = [vocab[i] for i in sorted_idx]
dblct = []
for w in subset_w:
    if p.singular_noun(w) in subset_w:
        dblct.append(w)
print len(dblct)
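The membership test above scans a 20,000-element Python list for every word; a set makes the same check effectively constant time. A sketch under the assumption that only the count is needed:
In [ ]:
subset_set = set(subset_w)  # O(1) membership tests instead of scanning the list
dblct_fast = [w for w in subset_w if p.singular_noun(w) in subset_set]
print len(dblct_fast)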
In [8]:
#CSC format gives efficient column slicing; convert back to CSR for row-wise operations downstream
data_subset = data.tocsc()[:,sorted_idx].tocsr()
In [9]:
features_subset = [vocab[k] for k in sorted_idx]
In [10]:
#Singular form of each retained feature (falls back to the word itself if it is not a plural noun)
features_subset_singular = []
for w in features_subset:
    kk = p.singular_noun(w)
    if kk:
        features_subset_singular.append(kk)
    else:
        features_subset_singular.append(w)
In [11]:
features_subset = np.array(features_subset)
features_subset_singular = np.array(features_subset_singular)
In [12]:
#Keep only documents whose most frequent retained word occurs more than 5 times
print data_subset.max(),data_subset.shape
doccts = data_subset.max(1).toarray().squeeze()
docs_keep_idx = np.where(doccts>5)[0]
print docs_keep_idx.shape
data_subset_minlen = data_subset[docs_keep_idx]
print data_subset_minlen.shape
In [13]:
#Sanity check: sorted document lengths (total word counts) after filtering
np.sort(np.array(data_subset_minlen.sum(1)).squeeze())
In [14]:
rm -rf wikicorp/idx-learning.h5
In [ ]:
pwd
In [15]:
#Shuffle the filtered documents and split them into train/valid/test index sets
if not os.path.exists('wikicorp/idx-learning.h5'):
    np.random.seed(1)
    shufidx = np.random.permutation(data_subset_minlen.shape[0])
    idx = {}
    idx['test'] = shufidx[:100000]
    idx['valid'] = shufidx[:100000] #NOTE: valid and test share the same 100k documents as written
    idx['train'] = shufidx[100000:]
    saveHDF5('wikicorp/idx-learning.h5',idx)
else:
    idx = loadHDF5('wikicorp/idx-learning.h5')
train_idx, valid_idx, test_idx = idx['train'], idx['valid'], idx['test']
TRAIN = data_subset_minlen[train_idx]
VALID = data_subset_minlen[valid_idx]
TEST = data_subset_minlen[test_idx]
print TRAIN.shape, VALID.shape, TEST.shape
print np.sort(np.array(TRAIN.sum(1)).squeeze()).astype(int), np.sort(np.array(VALID.sum(1)).squeeze()), np.sort(np.array(TEST.sum(1)).squeeze())
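An optional sanity check (illustrative, not in the original run) that the training indices are disjoint from the held-out indices; note that valid and test are identical as constructed above.
In [ ]:
print len(set(train_idx.tolist()) & set(test_idx.tolist()))  # expect 0
print np.array_equal(np.sort(valid_idx), np.sort(test_idx))  # True: valid and test share indices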
In [16]:
#Remove any previous outputs, then save the splits and the feature metadata
os.system('rm -rf ./wikicorp/data-learning.h5 ./wikicorp/misc-learning.pkl')
saveSparseHDF5(TRAIN, 'train', './wikicorp/data-learning.h5')
saveSparseHDF5(VALID, 'valid', './wikicorp/data-learning.h5')
saveSparseHDF5(TEST, 'test' , './wikicorp/data-learning.h5')
savePickle([{},features_subset,features_subset_singular],'./wikicorp/misc-learning.pkl')
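A quick reload check (illustrative; not in the original notebook) using the same loadSparseHDF5 helper imported above, to confirm that the saved splits round-trip with the expected shapes.
In [ ]:
TRAIN_chk = loadSparseHDF5('train','./wikicorp/data-learning.h5')
VALID_chk = loadSparseHDF5('valid','./wikicorp/data-learning.h5')
TEST_chk  = loadSparseHDF5('test' ,'./wikicorp/data-learning.h5')
print TRAIN_chk.shape, VALID_chk.shape, TEST_chk.shape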