In [10]:
# Augmented training data
# Just augment the folds
import numpy as np
import os

# for mode in ['test', 'train']:
for mode in ['train']:
#     for n in range(5):
    for n in ['_all']:
        # text_file = 'test0.txt'
        text_file = mode + str(n) + '.txt'
        print 'Converting:', text_file
        
        f_path = os.path.join('./data', text_file)
        aug_train_path = '../../data/ndsb/augment/train'
        save_path = os.path.join('./data/augmented', text_file)
#         save_path = os.path.join('./data/augmented', 'unlabeled' + text_file)
        angs = range(0, 360, 45) 
        suffixes = np.array(['_rot' + str(ang) for ang in angs])

        arr = np.loadtxt(f_path, dtype=str, delimiter='\t')

        # Could do some weird thing with meshgrid, but I am too tired
        with open(save_path, 'wb') as f_save:
            for ii, (entry, label) in enumerate(arr):
                d, b = os.path.split(entry)
                l_str = os.path.basename(d)
                b = os.path.splitext(b)[0]
                for suf in suffixes:
                    new_path = os.path.abspath(os.path.join(aug_train_path, l_str, b + suf + '.jpg'))
                    line = new_path + '\t' + label + '\n'
#                     line = new_path + '\t' + '-1' + '\n'
                    f_save.write(line)

                if (ii%1000 == 0):
                    print ii, 'written'


Converting: train_all.txt
0 written
1000 written
2000 written
3000 written
4000 written
5000 written
6000 written
7000 written
8000 written
9000 written
10000 written
11000 written
12000 written
13000 written
14000 written
15000 written
16000 written
17000 written
18000 written
19000 written
20000 written
21000 written
22000 written
23000 written
24000 written
25000 written
26000 written
27000 written
28000 written
29000 written
30000 written

In [39]:
os.path.basename(d)


Out[39]:
'unknown_unclassified'

In [2]:
import numpy as np
import os
import glob

aug_test_path = '/data/ndsb/augment/test'
orig_test_path = '/data/ndsb/test'
save_path = os.path.join('/data/ndsb/augment/test_aug.txt')
if 1:
    files_aug = [os.path.abspath(f) for f in glob.glob(os.path.join(aug_test_path, '*', '*.jpg'))]
    files_orig = [os.path.abspath(f) for f in glob.glob(os.path.join(orig_test_path, '*.jpg'))]
    files_all = np.r_[files_aug, files_orig]
    y = -np.ones(len(files_all), dtype=int)
    np.savetxt(save_path, np.c_[files_all, y], fmt='%s', delimiter='\t')
    
    print save_path, 'saved'


/data/ndsb/augment/test/test_aug.txt saved

In [7]:
print len(files_aug)
print len(files_orig)
print len(files_all)


451045
64435
515480

In [39]:
#######################################
import numpy as np
import os
import glob
from sklearn.cross_validation import StratifiedKFold
from tools.le import le

TRAIN_PATH = '/media/raid_arr/data/ndsb/train/'
TRAIN_AUG_PATH = '/media/raid_arr/data/ndsb/augment/train/'

labels = os.listdir(TRAIN_PATH)
files_all = np.array([os.path.abspath(f) for f in glob.glob(os.path.join(TRAIN_PATH, '*', '*.jpg'))])
y_str = [os.path.basename(os.path.dirname(f)) for f in files_all]
y = le.transform(y_str)

In [51]:
SAVE_PATH = '/media/raid_arr/data/ndsb/folds'
k = 5
skf = StratifiedKFold(y, n_folds=k, shuffle=True, random_state=0)

save_fn = lambda name, f_mode, y_mode: np.savetxt(
        os.path.join(SAVE_PATH, str(name) + '.txt'), 
        np.c_[f_mode, y_mode], 
        fmt='%s', delimiter='\t')

angs = range(0, 360, 45) 
suffixes = np.array(['_rot' + str(ang) for ang in angs])
aug_name = lambda f_i, suff: os.path.join(TRAIN_AUG_PATH, os.path.basename(os.path.dirname(f_i)),
                                          os.path.splitext(os.path.basename(f_i))[0] + 
                                          suff + os.path.splitext(os.path.basename(f_i))[1])


save_fn('train_all', files_all, y)
for fold_ii, (train_ind, test_ind) in enumerate(skf):
    f_train, f_test = files_all[train_ind], files_all[test_ind]
    y_train, y_test = y[train_ind], y[test_ind]
    save_fn('train' + str(fold_ii), f_train, y_train)
    save_fn('test' + str(fold_ii) , f_test, y_test)
    
    # Aug
    f_train_aug = np.array([aug_name(f_i, suff) for f_i in f_train for suff in suffixes])
    y_train_aug = np.array([y_i for y_i in y_train for suff in suffixes])
    save_fn('train_aug' + str(fold_ii), f_train_aug, y_train_aug)

In [50]:
os.path.basename(os.path.dirname(f_i))


Out[50]:
'copepod_calanoid_eggs'