Local data prep


In [1]:
import os
import sys
from pathlib import Path

import json
import boto3

from mtest.data.core import prepare, make_ft_fvecs
from mtest.data.sift import generate_gtIP

import logging
# Show every record from DEBUG upwards in the cell output; the imported
# data-prep helpers report their progress through the logging module.
logging.basicConfig(level=logging.DEBUG)
# No name is passed, so this is the root logger.
logger = logging.getLogger()


Failed to load GPU Faiss: No module named 'swigfaiss_gpu'
Faiss falling back to CPU-only.

Rename raw dataset files


In [2]:
# Root directory that holds every dataset used in this notebook.
# The location is machine specific, so it can be overridden with the
# MIPS_DATA_ROOT environment variable; when the variable is not set the
# original hard-coded location is used.
ROOT = Path(os.environ.get('MIPS_DATA_ROOT', '/home/elan/Samsung/storage/mips'))
WIKI = ROOT / 'WikiLSHTC'
AMZN = ROOT / 'Amazon-3M'
SIFT = ROOT / 'sift'
SML  = ROOT / 'siftsmall'

# Fail early and name the directories that are missing, instead of a bare
# AssertionError that does not say which dataset is the problem.
missing_dirs = [str(d) for d in (WIKI, AMZN, SIFT, SML) if not d.exists()]
assert not missing_dirs, f'Missing dataset directories: {missing_dirs}'

In [ ]:
# Rename the dataset files to the shorter names used further down in this
# notebook. Each entry is (dataset directory, current file name, new file name).
raw_renames = [
    (WIKI, 'wikiLSHTC_train.txt',         'train.txt'),
    (WIKI, 'wikiLSHTC_test.txt',          'test.txt'),
    (AMZN, 'amazon-3M_train.txt',         'train.txt'),
    (AMZN, 'amazon-3M_test.txt',          'test.txt'),
    (SML,  'siftsmall_base.fvecs',        'sift_base.fvecs'),
    (SML,  'siftsmall_groundtruth.ivecs', 'sift_groundtruth.ivecs'),
    (SML,  'siftsmall_learn.fvecs',       'sift_learn.fvecs'),
    (SML,  'siftsmall_query.fvecs',       'sift_query.fvecs'),
]

for dataset_dir, old_name, new_name in raw_renames:
    src = dataset_dir / old_name
    dst = dataset_dir / new_name
    if src.exists():
        os.rename(src, dst)
    elif dst.exists():
        # The file was already renamed by an earlier run of this cell;
        # skipping keeps the cell safe to execute more than once.
        logger.debug('%s already renamed to %s', src, dst)
    else:
        # Neither name is present: still an error, but with a message that
        # names both candidates.
        raise FileNotFoundError(f'Neither {src} nor {dst} exists')

Generate ground truth for SIFT


In [ ]:
# Run generate_gtIP for both SIFT variants, full set first and then the small
# one. The same directory is passed as both positional arguments, and
# skip_tests=True is forwarded unchanged.
for sift_dir in (SIFT, SML):
    generate_gtIP(sift_dir, sift_dir, skip_tests=True)

Rename SIFT files to the common `data.*` names


In [25]:
# Rename the SIFT vector files to the `data.*` names that the S3 upload cell
# picks up with its 'data*' glob. Each entry is (current name, new name).
sift_renames = [
    ('sift_base.fvecs',  'data.base.fvecs'),
    ('sift_learn.fvecs', 'data.learn.fvecs'),
    ('sift_query.fvecs', 'data.query.fvecs'),
]

for sift_dir in [SIFT, SML]:
    for old_name, new_name in sift_renames:
        src = sift_dir / old_name
        dst = sift_dir / new_name
        if src.exists():
            os.rename(src, dst)
        elif dst.exists():
            # Already renamed by an earlier run of this cell; skipping keeps
            # the cell safe to execute more than once.
            logger.debug('%s already renamed to %s', src, dst)
        else:
            # Neither name is present: still an error, but with a message
            # that names both candidates.
            raise FileNotFoundError(f'Neither {src} nor {dst} exists')

Generate data for fasttext


In [3]:
# Run prepare for both text datasets, WikiLSHTC first and then Amazon-3M.
# The same directory is passed as both positional arguments, and force=True
# is forwarded unchanged.
for text_dir in (WIKI, AMZN):
    prepare(text_dir, text_dir, force=True)


DEBUG:mtest.data.core:Preparing train csr
INFO:mtest.data.core:Data not found or `force` flag was passed.
DEBUG:mtest.data.core:I'm going to prepare it and store at /home/elan/Samsung/storage/mips/WikiLSHTC/X_train.csr.npz.
libsvm to csr for /home/elan/Samsung/storage/mips/WikiLSHTC/train.txt: 100%|██████████| 1778351/1778351 [01:06<00:00, 26917.04it/s]
DEBUG:mtest.data.core:# compute masks to get rid of examples with too little words or labels
DEBUG:mtest.data.core:# discard unwanted columns
DEBUG:mtest.data.core:# make sure each example has at leas one nonzero feature and one label
DEBUG:mtest.data.core:# fix csr matrices
DEBUG:mtest.data.core:# save the result
DEBUG:mtest.data.core:Preparing test csr
INFO:mtest.data.core:Data not found or `force` flag was passed.
DEBUG:mtest.data.core:I'm going to prepare it and store at /home/elan/Samsung/storage/mips/WikiLSHTC/X_test.csr.npz.
libsvm to csr for /home/elan/Samsung/storage/mips/WikiLSHTC/test.txt: 100%|██████████| 587084/587084 [00:23<00:00, 24687.09it/s]
DEBUG:mtest.data.core:# compute masks to get rid of examples with too little words or labels
DEBUG:mtest.data.core:# discard unwanted columns
DEBUG:mtest.data.core:# make sure each example has at leas one nonzero feature and one label
DEBUG:mtest.data.core:# fix csr matrices
DEBUG:mtest.data.core:# save the result
DEBUG:mtest.data.core:Preparing ft
to_ft (/home/elan/Samsung/storage/mips/WikiLSHTC/train.ft.txt): 100%|██████████| 1762443/1762443 [04:20<00:00, 6764.36it/s]
to_ft (/home/elan/Samsung/storage/mips/WikiLSHTC/test.ft.txt): 100%|██████████| 576246/576246 [01:27<00:00, 6575.64it/s]

Generate fasttext features


In [3]:
# Path handed to make_ft_fvecs as its first argument (relative to the
# notebook's working directory).
fasttext_path = '../../mips/fastText/fasttext'
# Third positional argument of make_ft_fvecs; its meaning is defined by that
# helper -- TODO confirm against mtest.data.core.
ft_third_arg = 8

# Same call for both text datasets, WikiLSHTC first and then Amazon-3M.
for text_dir in (WIKI, AMZN):
    make_ft_fvecs(fasttext_path, text_dir, ft_third_arg)

Upload data to S3


In [28]:
# Raise the AWS client libraries' loggers to WARNING so that, with the root
# logger at DEBUG, only this cell's own messages appear in the output.
for noisy_name in ('botocore', 's3transfer', 'boto3'):
    logging.getLogger(noisy_name).setLevel('WARNING')
logger = logging.getLogger('S3Upload')


# Read the target bucket name from the config file. The `with` block closes
# the file handle, which a bare json.load(open(...)) would leave open.
with open('../aws/config.json') as cfg_file:
    cfg = json.load(cfg_file)
s3     = boto3.resource('s3')
bucket = s3.Bucket(cfg['bucket'])

# Upload every entry matching 'data*' in each dataset directory.
# The object key is '<dataset directory name>/<file name>'.
for path in [WIKI, AMZN, SIFT, SML]:
    for item in path.glob('data*'):

        key = f'{path.name}/{item.name}'

        logger.debug(key)
        # The Path is converted to a plain string only for this call, so
        # `item` is not rebound to a different type inside the loop.
        bucket.upload_file(str(item), key)


DEBUG:S3Upload:siftsmall/data.base.fvecs
DEBUG:S3Upload:siftsmall/data.labels.txt
DEBUG:S3Upload:siftsmall/data.learn.fvecs
DEBUG:S3Upload:siftsmall/data.query.fvecs

In [27]:



Out[27]:
'/home/elan/Mine/ml-mine/code/mips-tests/notebooks'