In [1]:
import os
import sys
from pathlib import Path
import json
import boto3
from mtest.data.core import prepare, make_ft_fvecs
from mtest.data.sift import generate_gtIP
import logging
# Configure the root logger at DEBUG level so that messages from every
# library used in this notebook are shown.
logging.basicConfig(level='DEBUG')
# Module-level handle to the root logger (no name is passed to getLogger).
logger = logging.getLogger()
Rename raw dataset files
In [2]:
# Root directory holding every dataset used by this notebook. It can be
# overridden with the MIPS_DATA_ROOT environment variable so the notebook is
# not tied to one machine; the default is the path it was written against.
ROOT = Path(os.environ.get('MIPS_DATA_ROOT', '/home/elan/Samsung/storage/mips'))

# One sub-directory per dataset.
WIKI = ROOT / 'WikiLSHTC'
AMZN = ROOT / 'Amazon-3M'
SIFT = ROOT / 'sift'
SML = ROOT / 'siftsmall'

# Fail fast, and say which directories are missing instead of raising a bare
# AssertionError with no message.
missing_dirs = [str(d) for d in (WIKI, AMZN, SIFT, SML) if not d.exists()]
assert not missing_dirs, f'dataset directories not found: {missing_dirs}'
In [ ]:
# Strip the dataset-specific prefixes from the raw file names.
# Each entry is (dataset directory, current file name, new file name).
RAW_RENAMES = [
    (WIKI, 'wikiLSHTC_train.txt', 'train.txt'),
    (WIKI, 'wikiLSHTC_test.txt', 'test.txt'),
    (AMZN, 'amazon-3M_train.txt', 'train.txt'),
    (AMZN, 'amazon-3M_test.txt', 'test.txt'),
    (SML, 'siftsmall_base.fvecs', 'sift_base.fvecs'),
    (SML, 'siftsmall_groundtruth.ivecs', 'sift_groundtruth.ivecs'),
    (SML, 'siftsmall_learn.fvecs', 'sift_learn.fvecs'),
    (SML, 'siftsmall_query.fvecs', 'sift_query.fvecs'),
]

for folder, old_name, new_name in RAW_RENAMES:
    src = folder / old_name
    dst = folder / new_name
    if not src.exists() and dst.exists():
        # The rename was already done by an earlier run of this cell; skipping
        # keeps the cell re-runnable instead of raising FileNotFoundError.
        # NOTE: the sift_*.fvecs files are renamed once more by a later cell
        # (to data.*.fvecs), so after a full run those entries have neither
        # source nor destination and os.rename below still raises for them.
        logging.debug('already renamed: %s', dst)
        continue
    # Any other state (including a missing source with no destination) is left
    # to os.rename so that a genuinely missing dataset file is still an error.
    os.rename(src, dst)
Generate ground truth for SIFT
In [ ]:
# Run the ground-truth generator once per SIFT dataset. The same directory is
# passed as both the first and the second positional argument.
# NOTE(review): skip_tests=True presumably disables the helper's own
# self-checks -- confirm against mtest.data.sift.
for sift_dir in (SIFT, SML):
    generate_gtIP(sift_dir, sift_dir, skip_tests=True)
Rename SIFT files
In [25]:
# Give the SIFT vector files a `data.` prefix in both SIFT directories; the
# upload cell at the end of the notebook selects files with the glob `data*`.
# Each entry is (current file name, new file name).
SIFT_RENAMES = [
    ('sift_base.fvecs', 'data.base.fvecs'),
    ('sift_learn.fvecs', 'data.learn.fvecs'),
    ('sift_query.fvecs', 'data.query.fvecs'),
]

for sift_dir in (SIFT, SML):
    for old_name, new_name in SIFT_RENAMES:
        src = sift_dir / old_name
        dst = sift_dir / new_name
        if not src.exists() and dst.exists():
            # Already renamed by an earlier run; skipping keeps the cell
            # re-runnable instead of raising FileNotFoundError.
            logging.debug('already renamed: %s', dst)
            continue
        # A missing source with no destination is still an error.
        os.rename(src, dst)
Generate data for fastText
In [3]:
# Run the preparation helper for each text dataset, using the dataset
# directory as both the first and the second positional argument.
# NOTE(review): force=True presumably regenerates outputs that already
# exist -- confirm against mtest.data.core.prepare.
for dataset_dir in (WIKI, AMZN):
    prepare(dataset_dir, dataset_dir, force=True)
Generate fastText features
In [3]:
# First argument handed to make_ft_fvecs for every dataset.
# NOTE(review): presumably the path to the fastText executable, relative to
# the notebook's working directory -- confirm.
fasttext_path = '../../mips/fastText/fasttext'

# The meaning of the third positional argument (8) is not visible from this
# notebook -- TODO confirm against mtest.data.core.make_ft_fvecs.
for dataset_dir in (WIKI, AMZN):
    make_ft_fvecs(fasttext_path, dataset_dir, 8)
Upload data to S3
In [28]:
# The root logger is configured at DEBUG in the first cell; raise the
# threshold for the AWS client libraries so only their warnings are shown.
for noisy_logger in ('botocore', 's3transfer', 'boto3'):
    logging.getLogger(noisy_logger).setLevel('WARNING')

# Rebinds the notebook-level `logger` name to a dedicated upload logger.
logger = logging.getLogger('S3Upload')

# Read the AWS settings; the context manager closes the file handle, which a
# bare json.load(open(...)) would leave open.
with open('../aws/config.json') as config_file:
    cfg = json.load(config_file)

s3 = boto3.resource('s3')
bucket = s3.Bucket(cfg['bucket'])

# Upload every entry whose name starts with `data` from each dataset
# directory, using "<directory name>/<file name>" as the object key.
for path in [WIKI, AMZN, SIFT, SML]:
    for item in path.glob('data*'):
        key = f'{path.name}/{item.name}'
        logger.debug(key)
        # The Path is converted to str for upload_file, as in the original,
        # but without rebinding the loop variable to a different type.
        bucket.upload_file(str(item), key)
In [27]:
Out[27]: