Prepares data for Caffe. Constructs stratified k=5 folds and shoves data into LMDB.


In [13]:
import os
import glob
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import pickle

# Selects which listing the cells below generate. The cells test for these
# values: 'train_folds', 'test_final', 'aug_train' and 'aug_test'.
mode = 'aug_train'
# Alternative setting, kept for quick switching:
# mode = 'test_final'

In [13]:
# Grabbing all image paths and labels
if mode == 'train_folds':
    path_data = '../../data/ndsb/train'
    # Every entry directly under path_data is treated as a class name.
    # NOTE(review): os.listdir also returns plain files, so any stray file in
    # this directory would become a class -- confirm it holds only class folders.
    labels = os.listdir(path_data)
    le = LabelEncoder().fit(labels)
    # Absolute paths of all .jpg images located one directory below path_data.
    files_all = np.array([os.path.abspath(f) for f in glob.glob(os.path.join(path_data, '*', '*.jpg'))])
    # The string label of an image is the name of its parent directory.
    y_str = [os.path.basename(os.path.dirname(f)) for f in files_all]
    y_enc = le.transform(y_str)
    # Persist the fitted encoder. The `with` block guarantees the file is
    # flushed and closed even if pickling raises.
    with open('./tools/le.p', 'wb') as le_file:
        pickle.dump(le, le_file)

In [19]:
# Write one 'train<i>.txt' / 'test<i>.txt' pair per stratified fold (k=5),
# plus 'train_all.txt' listing every image.
# NOTE(review): path_save is not assigned in this branch; elsewhere in this
# notebook it is only set by the augmentation cell, so it has to exist already
# in the session before this cell runs.
if mode == 'train_folds':
    k = 5
    skf = StratifiedKFold(y_enc, n_folds=k)

    def save_fn(name, f_mode, y_mode):
        """Save tab-separated (path, label) rows to '<name>.txt' under path_save."""
        target = os.path.join(path_save, str(name) + '.txt')
        rows = np.c_[f_mode, y_mode]
        np.savetxt(target, rows, fmt='%s', delimiter='\t')

    save_fn('train_all', files_all, y_enc)
    for fold_ii, (train_ind, test_ind) in enumerate(skf):
        # Training part of this fold.
        f_train = files_all[train_ind]
        y_train = y_enc[train_ind]
        save_fn('train%d' % fold_ii, f_train, y_train)
        # Held-out part of this fold.
        f_test = files_all[test_ind]
        y_test = y_enc[test_ind]
        save_fn('test%d' % fold_ii, f_test, y_test)

In [7]:
# List every test image with a placeholder label of -1.
# NOTE(review): path_save is not assigned in this branch; it has to exist
# already in the session before this cell runs.
if mode == 'test_final':
    save_name = 'test-1.txt'
    # A relative location ('../../data/ndsb/test') was used previously.
    path_data = '/data/ndsb/test'
    # Test images are matched directly under path_data (no class sub-folders).
    jpg_pattern = os.path.join(path_data, '*.jpg')
    files_all = np.array(list(map(os.path.abspath, glob.glob(jpg_pattern))))
    # Every row gets the integer label -1.
    dummy_labels = np.repeat(-1, len(files_all))
    save_arr = np.c_[files_all, dummy_labels]
    out_file = os.path.join(path_save, save_name)
    np.savetxt(out_file, save_arr, fmt='%s')

In [17]:
# Build the (path, label) listing for the augmented train or test images.
# NOTE(review): the encoder is imported from the `tools.le` module here, while
# the train_folds cell pickles its encoder to './tools/le.p' -- confirm both
# refer to the same fitted LabelEncoder.
from tools.le import le
if 'aug' in mode:
    if mode == 'aug_train':
        save_name = 'train_aug.txt'
        path_data = '/data/ndsb/augment/train'
    elif mode == 'aug_test':
        save_name = 'test_aug.txt'
        path_data = '/data/ndsb/augment/test'
    else:
        # Fail with a clear message instead of a NameError on path_data below.
        raise ValueError("unsupported augmentation mode: %r" % (mode,))

    path_save = '/data/ndsb/augment'
    # In both modes the images are matched one directory below path_data.
    files_all = np.array([os.path.abspath(f) for f in glob.glob(os.path.join(path_data, '*', '*.jpg'))])

    if mode == 'aug_train':
        # The string label of an image is the name of its parent directory.
        lbl_str = [os.path.basename(os.path.dirname(f)) for f in files_all]
        lbl_n = le.transform(lbl_str)
    else:
        # aug_test: no labels are known, so every row gets the placeholder -1.
        lbl_n = -np.ones(len(files_all)).astype(int)
    save_arr = np.c_[files_all, lbl_n]
    out_path = os.path.join(path_save, save_name)
    np.savetxt(out_path, save_arr, fmt='%s')
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print(out_path)


/data/ndsb/augment/train_aug.txt

In [16]:
# Sanity check: look up the integer code that the imported encoder assigns to
# a single class name.
from tools.le import le
le.transform('copepod_calanoid_eggs')


Out[16]:
15