Process WikiCorp Dataset

* Use this notebook to build the dataset used for learning [features are chosen based on frequency only]

In [ ]:
import inflect,os,sys 
from utils.sparse_utils import saveSparseHDF5
from utils.misc import savePickle, saveHDF5, loadHDF5
p = inflect.engine()
suffix = '-split_hyphen'

In [2]:
import numpy as np
from utils.sparse_utils import loadSparseHDF5
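
The sparse HDF5 helpers from `utils` are not shown in this notebook. Below is a minimal sketch of what saveSparseHDF5/loadSparseHDF5 might look like, assuming CSR matrices stored as data/indices/indptr/shape datasets via h5py (the real implementation may differ):

In [ ]:
import h5py
import numpy as np
from scipy.sparse import csr_matrix

def saveSparseHDF5_sketch(matrix, name, fname):
    #Append one CSR matrix to an HDF5 file under group <name>
    matrix = matrix.tocsr()
    with h5py.File(fname, 'a') as f:
        grp = f.create_group(name)
        grp.create_dataset('data',    data=matrix.data)
        grp.create_dataset('indices', data=matrix.indices)
        grp.create_dataset('indptr',  data=matrix.indptr)
        grp.create_dataset('shape',   data=np.array(matrix.shape))

def loadSparseHDF5_sketch(name, fname):
    #Rebuild the CSR matrix from its stored components
    with h5py.File(fname, 'r') as f:
        grp = f[name]
        return csr_matrix((grp['data'][:], grp['indices'][:], grp['indptr'][:]),
                          shape=tuple(grp['shape'][:]))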

In [3]:
#Load wikicorp vocab
assert os.path.exists('./wikicorp/WestburyLab.wikicorp.201004'+suffix+'.feat'),'Feature file not found'
with open('./wikicorp/WestburyLab.wikicorp.201004'+suffix+'.feat','r') as f:
    vocab = [k.strip().split(' ')[0] for k in f.readlines()]
print len(vocab)
vocab_arr = np.array(vocab)


601328

In [4]:
#Map each word to its singular form (flags->flag), keeping non-plural words as-is
vlist  = []
w2idx  = {}

for idx,v in enumerate(vocab):
    sv = p.singular_noun(v)
    if sv:
        vlist.append(sv)
        w2idx[sv] = idx
    else:
        vlist.append(v)
        w2idx[v]  = idx
    
vocab_singular_only = set(vlist)
vocab_singular_list = np.array(vlist)
print vocab_singular_list.shape


(601328,)
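
For reference, inflect's singular_noun returns the singular form of a plural noun and False otherwise, which is why the else branch above keeps the word unchanged. A small illustrative check:

In [ ]:
print p.singular_noun('flags')   #-> 'flag'
print p.singular_noun('flag')    #-> False, so the word is kept as-is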

In [5]:
data = loadSparseHDF5('dataset','./wikicorp/WestburyLab.wikicorp.201004'+suffix+'.h5')
counts = np.array(data.sum(0)).squeeze().astype(int)
print counts.shape


(601328,)

In [6]:
MAXVOCAB   = 20000
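#argsort is ascending, so the last MAXVOCAB indices are the most frequent words
#set() only deduplicates indices (argsort output is already unique)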
sorted_idx = list(set(np.argsort(counts)[-MAXVOCAB:].tolist()))
print np.sort(counts[sorted_idx])

print  len(sorted_idx),np.max(sorted_idx),np.min(sorted_idx),len(vocab)
#Add vectors corresponding to embedding words


[   2104    2104    2104 ..., 1111533 1170157 1341684]
20000 443398 0 601328

In [7]:
#Count words that are double counted (both plural and singular forms retained); ~4k should be OK
subset_w = [vocab[i] for i in sorted_idx]
dblct    = []
for w in subset_w:
    if p.singular_noun(w) in subset_w:
        dblct.append(w)
print len(dblct)


3436
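
The membership test above scans the 20,000-element list once per word; an equivalent set-based check (a sketch of an alternative, not what was run above) is much faster:

In [ ]:
#Same double-count check with O(1) membership tests
subset_set = set(subset_w)
dblct_fast = [w for w in subset_w if p.singular_noun(w) in subset_set]
print len(dblct_fast)   #should match the count above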

In [8]:
data_subset = data.tocsc()[:,sorted_idx].tocsr()

In [9]:
features_subset = [vocab[k] for k in sorted_idx]

In [10]:
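#Singularize the retained feature names (parallel to features_subset)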
features_subset_singular = []
for w in features_subset:
    kk = p.singular_noun(w)
    if kk:
        features_subset_singular.append(kk)
    else:
        features_subset_singular.append(w)

In [11]:
features_subset          = np.array(features_subset)
features_subset_singular = np.array(features_subset_singular)

In [12]:
#Keep documents whose most frequent retained word appears more than 5 times
print data_subset.max(),data_subset.shape
doccts = data_subset.max(1).toarray().squeeze()
docs_keep_idx = np.where(doccts>5)[0]
print docs_keep_idx.shape

data_subset_minlen = data_subset[docs_keep_idx]
print data_subset_minlen.shape


2702.0 (3035070, 20000)
(1204937,)
(1204937, 20000)

In [13]:
np.sort(np.array(data_subset_minlen.sum(1)).squeeze())


Out[13]:
array([  7.00000000e+00,   7.00000000e+00,   7.00000000e+00, ...,
         1.77470000e+04,   1.78530000e+04,   2.51730000e+04])

In [14]:
rm -rf wikicorp/idx-learning.h5

In [ ]:
pwd

In [15]:
#Restrict the documents and split into train/valid/test

if not os.path.exists('wikicorp/idx-learning.h5'):
    np.random.seed(1)
    shufidx = np.random.permutation(data_subset_minlen.shape[0])
    idx = {}
    #Note: as written, valid and test use the same 100k-document slice of shufidx
    idx['test']    = shufidx[:100000]
    idx['valid']   = shufidx[:100000]
    idx['train']   = shufidx[100000:]
    saveHDF5('wikicorp/idx-learning.h5',idx)
else:
    idx = loadHDF5('wikicorp/idx-learning.h5')
train_idx, valid_idx, test_idx = idx['train'], idx['valid'], idx['test']


TRAIN = data_subset_minlen[train_idx]
VALID = data_subset_minlen[valid_idx]
TEST  = data_subset_minlen[test_idx]
print TRAIN.shape, VALID.shape, TEST.shape
print np.sort(np.array(TRAIN.sum(1)).squeeze()).astype(int), np.sort(np.array(VALID.sum(1)).squeeze()), np.sort(np.array(TEST.sum(1)).squeeze())


(1104937, 20000) (100000, 20000) (100000, 20000)
[    7     7     8 ..., 17632 17747 17853] [  7.00000000e+00   1.10000000e+01   1.10000000e+01 ...,   9.14100000e+03
   9.52600000e+03   2.51730000e+04] [  7.00000000e+00   1.10000000e+01   1.10000000e+01 ...,   9.14100000e+03
   9.52600000e+03   2.51730000e+04]
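
A quick sanity check on the index splits can be run here; given the slices above, train is disjoint from valid and test, while valid and test coincide (a sketch):

In [ ]:
#Check overlap between the splits (sets of document indices)
print len(set(train_idx.tolist()) & set(valid_idx.tolist()))   #expect 0
print len(set(valid_idx.tolist()) & set(test_idx.tolist()))    #expect 100000 with the slices above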

In [16]:
os.system('rm -rf ./wikicorp/data-learning.h5 ./wikicorp/misc-learning.pkl')
saveSparseHDF5(TRAIN, 'train', './wikicorp/data-learning.h5')
saveSparseHDF5(VALID, 'valid', './wikicorp/data-learning.h5')
saveSparseHDF5(TEST,  'test' , './wikicorp/data-learning.h5')
savePickle([{},features_subset,features_subset_singular],'./wikicorp/misc-learning.pkl')


Saved  3  objects
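
To use the saved files downstream, the splits and feature arrays can be read back; a sketch assuming the pickle holds the three objects as a single list and using the loadSparseHDF5 helper imported above (the actual utils loaders may differ):

In [ ]:
import cPickle as pickle
TRAIN = loadSparseHDF5('train', './wikicorp/data-learning.h5')
VALID = loadSparseHDF5('valid', './wikicorp/data-learning.h5')
TEST  = loadSparseHDF5('test',  './wikicorp/data-learning.h5')
with open('./wikicorp/misc-learning.pkl', 'rb') as f:
    misc, features_subset, features_subset_singular = pickle.load(f)
print TRAIN.shape, VALID.shape, TEST.shape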