Use the extract_test_feats notebook if you need to redo the full DB extraction on train or test.


In [1]:
import plyvel
import os
import numpy as np

TRAIN_FULL_PATH = '/media/raid_arr/tmp/normed_lvl'    # DB to split
TRAIN_FOLD_PATH = '/media/raid_arr/tmp/train0_norm_lvl'    # Output train
TEST_FOLD_PATH = '/media/raid_arr/tmp/test0_norm_lvl'    # Output test

TRAIN_FOLD_TXT = '/media/raid_arr/data/ndsb/folds/train0.txt'
TEST_FOLD_TXT = '/media/raid_arr/data/ndsb/folds/test0.txt'

In [2]:
# Grab the image paths from the folds that were already generated
train_fold_paths = np.loadtxt(TRAIN_FOLD_TXT, delimiter='\t', dtype=str)[:, 0]
test_fold_paths = np.loadtxt(TEST_FOLD_TXT, delimiter='\t', dtype=str)[:, 0]
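
Each fold file is assumed to be tab-separated with the image path in column 0 and its label in column 1; the `[:, 0]` slice keeps only the paths.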

In [10]:
# Get image name from path and shuffle order
train_fold_names = np.array([os.path.basename(p) for p in train_fold_paths])
test_fold_names = np.array([os.path.basename(p) for p in test_fold_paths])
np.random.shuffle(train_fold_names)
np.random.shuffle(test_fold_names)
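
np.random.shuffle permutes in place, so the fold DB insertion order below changes on every run; seeding the generator before the two shuffles pins it down (the seed value here is arbitrary):

In [ ]:
# Optional, run before the shuffles above: fix the RNG so the
# shuffled insertion order is reproducible across runs
np.random.seed(1234)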

In [77]:
# Open the new DBs and fill them via write batches
db_train_full = plyvel.DB(TRAIN_FULL_PATH)
db_train_fold = plyvel.DB(TRAIN_FOLD_PATH, create_if_missing=True)
wb_train_fold = db_train_fold.write_batch()
db_test_fold = plyvel.DB(TEST_FOLD_PATH, create_if_missing=True)
wb_test_fold = db_test_fold.write_batch()

# Insert in shuffled order; put on the batches (not the DBs directly),
# otherwise the write() calls below flush empty batches
for im_name in train_fold_names:
    wb_train_fold.put(str(im_name), db_train_full.get(str(im_name)))
for im_name in test_fold_names:
    wb_test_fold.put(str(im_name), db_train_full.get(str(im_name)))

# Flush both batches to disk
wb_train_fold.write()
wb_test_fold.write()

db_train_full.close()
db_train_fold.close()
db_test_fold.close()
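
A quick sanity check, assuming the fold DBs above: count the keys that actually landed in each fold and compare against the fold lists (plyvel's get returns None for a missing key, so a short count catches silent misses).

In [ ]:
# Sanity check: key counts in the fold DBs should match the fold lists
db_train_fold = plyvel.DB(TRAIN_FOLD_PATH)
db_test_fold = plyvel.DB(TEST_FOLD_PATH)
n_train = sum(1 for _ in db_train_fold.iterator(include_value=False))
n_test = sum(1 for _ in db_test_fold.iterator(include_value=False))
print 'train fold: %d/%d keys' % (n_train, len(train_fold_names))
print 'test fold: %d/%d keys' % (n_test, len(test_fold_names))
db_train_fold.close()
db_test_fold.close()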

In [24]:
# Re-shuffle the raw fold paths for the direct-to-LMDB extraction below
from random import shuffle
import my_io
reload(my_io)
shuffle(train_fold_paths)
shuffle(test_fold_paths)

In [25]:
TRAIN_FOLD_PATH = '/media/raid_arr/tmp/train0_norm_lmdb'    # Output train
TEST_FOLD_PATH = '/media/raid_arr/tmp/test0_norm_lmdb'    # Output test

# multi_extract hangs here: the queue feeder thread cannot pickle
# my_io.ExtractionTask (see the traceback below), so tasks.join()
# never returns and the cell had to be interrupted by hand.
my_io.multi_extract(train_fold_paths, TRAIN_FOLD_PATH, backend='lmdb',
                    perturb=False, verbose=True)
# my_io.single_extract(train_fold_paths, TRAIN_FOLD_PATH, backend='lmdb',
#                     perturb=False, verbose=True)

my_io.single_extract(test_fold_paths, TEST_FOLD_PATH, backend='lmdb',
                    perturb=False, verbose=True)


Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 266, in _feed
    send(obj)
PicklingError: Can't pickle <class 'my_io.ExtractionTask'>: attribute lookup my_io.ExtractionTask failed
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-25-827e774e81e1> in <module>()
      3 
      4 my_io.multi_extract(train_fold_paths, TRAIN_FOLD_PATH, backend='lmdb',
----> 5                     perturb=False, verbose=True)
      6 # my_io.single_extract(train_fold_paths, TRAIN_FOLD_PATH, backend='lmdb',
      7 #                     perturb=False, verbose=True)

/afs/ee.cooper.edu/user/t/a/tam8/documents/ndsb2015/tools/my_io.pyc in multi_extract(im_files, db_path, backend, perturb, out_shape, verbose)
    300 
    301     # Wait for all of the tasks to finish
--> 302     tasks.join()
    303 
    304     # Combine Results

/usr/lib/python2.7/multiprocessing/queues.pyc in join(self)
    338         try:
    339             if not self._unfinished_tasks._semlock._is_zero():
--> 340                 self._cond.wait()
    341         finally:
    342             self._cond.release()

/usr/lib/python2.7/multiprocessing/synchronize.pyc in wait(self, timeout)
    244         try:
    245             # wait for notification or timeout
--> 246             self._wait_semaphore.acquire(True, timeout)
    247         finally:
    248             # indicate that this thread has woken

KeyboardInterrupt: 
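
The PicklingError above is a standard multiprocessing failure mode: the task queue pickles each task by reference, so the task's class must be resolvable as a top-level attribute of its module inside the worker. A class defined inside a function, or one rebound by reload(my_io) in the notebook, fails that lookup. A minimal sketch of the pattern that works (the class body here is hypothetical):

In [ ]:
# Hypothetical sketch: in my_io.py the task class must live at module
# top level so workers can re-import it as my_io.ExtractionTask when
# unpickling items off the queue.
class ExtractionTask(object):
    def __init__(self, im_file):
        self.im_file = im_file

    def __call__(self):
        pass  # extract features for self.im_file here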

In [ ]:
# Create the augmented (perturbed) training DB
TRAIN_FOLD_PATH = '/media/raid_arr/tmp/train0_normaug_lvl'    # Output train

my_io.multi_extract(train_fold_paths, TRAIN_FOLD_PATH, backend='leveldb',
                    perturb=True, verbose=True)

# Single-process fallback; note it must use the same leveldb backend
# and perturb=True as the multi_extract call above:
# my_io.single_extract(train_fold_paths, TRAIN_FOLD_PATH, backend='leveldb',
#                      perturb=True, verbose=True)

In [4]:
my_io.single_extract(test_fold_paths, TEST_FOLD_PATH, perturb=False, verbose=True)


Extraction to db done in 156 s
/usr/lib/python2.7/site-packages/skimage/util/dtype.py:107: UserWarning: Possible precision loss when converting from float64 to uint8
  "%s to %s" % (dtypeobj_in, dtypeobj))

Moving extracted features to another DB, keeping the same key order


In [1]:
from caffe.proto import caffe_pb2
from time import time
import lmdb
import numpy as np

In [80]:
# DB = '/media/raid_arr/tmp/train0_norm_lmdb/'
# DB_FEATS = '/media/raid_arr/tmp/train0_norm_feats_lmdb'
DB = '/media/raid_arr/tmp/test0_norm_lmdb/'
DB_FEATS = '/media/raid_arr/tmp/test0_norm_feats_lmdb'

# DB = '/media/raid_arr/tmp/train0_norm_lmdb/'
# DB_FEATS = '/media/raid_arr/tmp/train0_norm_feats_simp_lmdb'
# DB = '/media/raid_arr/tmp/test0_norm_lmdb/'
# DB_FEATS = '/media/raid_arr/tmp/test0_norm_feats_simp_lmdb'

def make_feats_db(core_db=DB, feats_db=DB_FEATS, verbose=False):
    db = lmdb.open(core_db)
    db_feats = lmdb.open(feats_db, map_size=int(1e12))  # map_size as an int, to be safe
    txn = db.begin()
    c = txn.cursor()
    txn_feats = db_feats.begin(write=True)

    std_scale = 2.  # features assumed roughly within 2 std of zero
    tic = time()
    for k, v in c:
        datum = caffe_pb2.Datum()
        datum.ParseFromString(v)
        # Collect the extra per-image features into a 1x1x12 array
        extra_feats = np.array([
            datum.orig_space,
            datum.orig_height,
            datum.orig_width,
            datum.extent,
            datum.hu1,
            datum.hu2,
            datum.hu3,
            datum.hu4,
            datum.hu5,
            datum.hu6,
            datum.hu7,
            datum.solidity,
        ])[None, None, :]
        datum.channels, datum.height, datum.width = extra_feats.shape
        # Map [-2 std, 2 std] onto [0, 255] and store as uint8
        scale_map = ((extra_feats + std_scale) * 128./std_scale).clip(0, 255).astype('uint8')
        datum.data = scale_map.tobytes()
        # datum.float_data.extend(extra_feats.flat)    # float alternative
        v_feats = datum.SerializeToString()
        txn_feats.put(k, v_feats)

    txn_feats.commit()
    db.close()
    db_feats.close()

    if verbose:
        print 'Feat transfer done:', time() - tic
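
make_feats_db packs the 12 float features into uint8 by mapping ±2 standard deviations onto [0, 255]; anything beyond 2 std is clipped. The inverse mapping recovers the floats, which is handy when reading the DB back. A small round-trip sketch using the same scaling:

In [ ]:
# Round trip for the uint8 packing above: x in [-2, 2] <-> [0, 255]
std_scale = 2.
x = np.array([-3.0, -1.0, 0.0, 1.5])
packed = ((x + std_scale) * 128./std_scale).clip(0, 255).astype('uint8')
recovered = packed * std_scale/128. - std_scale
print packed      # [  0  64 128 224]
print recovered   # [-2.  -1.   0.   1.5]  (-3.0 was clipped to -2.0)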

In [79]:
make_feats_db(verbose=True)


1.65424895287

In [10]:
import my_io
reload(my_io)

DB = '/media/raid_arr/tmp/train0_norm_lmdb/'
DB_FEATS = '/media/raid_arr/tmp/train0_norm_feats_lmdb'

# DB = '/media/raid_arr/tmp/test0_norm_lmdb/'
# DB_FEATS = '/media/raid_arr/tmp/test0_norm_feats_lmdb'

my_io.transfer_feats_db(core_db=DB, 
                        feats_db=DB_FEATS,
                        backend='lmdb',
                        verbose=True)


Feat transfer done: 6.59817099571

In [11]:
import my_io
reload(my_io)

DB = '/media/raid_arr/tmp/train0_norm_lmdb/'
DB_LBLS = '/media/raid_arr/tmp/train0_norm_lbls_lmdb'

# DB = '/media/raid_arr/tmp/test0_norm_lmdb/'
# DB_LBLS = '/media/raid_arr/tmp/test0_norm_lbls_lmdb'
my_io.transfer_parentlbls_db(core_db=DB,
                             feats_db=DB_LBLS,
                             backend='lmdb',
                             verbose=True)


Parent labels transfer done: 5.60691714287
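
Since everything downstream assumes the feats/labels DBs line up with the core DB, a cheap final check is to walk the two LMDBs in parallel and assert the keys match (LMDB iterates in sorted key order, so identical key sets iterate identically). A sketch, assuming the paths above:

In [ ]:
# Verify core and label DBs hold identical keys in identical order
from itertools import izip

env_core = lmdb.open(DB, readonly=True)
env_lbls = lmdb.open(DB_LBLS, readonly=True)
with env_core.begin() as txn_core, env_lbls.begin() as txn_lbls:
    for (k_core, _), (k_lbls, _) in izip(txn_core.cursor(), txn_lbls.cursor()):
        assert k_core == k_lbls, (k_core, k_lbls)
print 'key order matches'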