In [1]:
# Mapping of SHA1 value to the path of the original image file
sha2path = dict((r[1],r[0]) for r in csv.reader(open('../positive.cmd.processed.csv')))
sha2path.update(dict((r[1],r[0]) for r in csv.reader(open('../negative.cmd.processed.csv'))))
In [2]:
# Use train-1/2/3 set of images to train-test the network. Split 3-1 (train - test)
train1_pos_shas = cPickle.load(open('../train1_pos_shas.pickle'))
train2_pos_shas = cPickle.load(open('../train2_pos_shas.pickle'))
train3_pos_shas = cPickle.load(open('../train3_pos_shas.pickle'))
test_pos_shas = cPickle.load(open('../test_pos_shas.pickle'))
train1_neg_shas = cPickle.load(open('../train1_neg_shas.pickle'))
train2_neg_shas = cPickle.load(open('../train2_neg_shas.pickle'))
train3_neg_shas = cPickle.load(open('../train3_neg_shas.pickle'))
test_neg_shas = cPickle.load(open('../test_neg_shas.pickle'))
train_pos_shas = train1_pos_shas | train2_pos_shas | train3_pos_shas
train_neg_shas = train1_neg_shas | train2_neg_shas | train3_neg_shas
In [11]:
# For each sha, find where it exists between the two download directories:
# /home/purg/data/memex/ht/hackathon_201607_cp1/training_positives/data/CP1_imageset/
# /home/purg/data/memex/ht/dan_bootystor/images/
import os
def find_sha1_filepath(sha1):
pre = sha1[:3]
t1 = os.path.join(
'/home/purg/data/memex/ht/hackathon_201607_cp1/training_positives/data/CP1_imageset',
pre, sha1)
t2 = os.path.join(
'/home/purg/data/memex/ht/dan_bootystor/images',
pre, sha1)
if os.path.isfile(t1):
return t1
elif os.path.isfile(t2):
return t2
else:
raise ValueError("No path for sha: %s" % sha1)
In [13]:
# Remember:
# 0 == negative
# 1 == positive
with open('train.image_truth.txt', 'w') as f:
for sha in train_pos_shas:
fp = find_sha1_filepath(sha)
f.write(fp + ' 1\n')
for sha in train_neg_shas:
fp = find_sha1_filepath(sha)
f.write(fp + ' 0\n')
with open('test.image_truth.txt', 'w') as f:
for sha in test_pos_shas:
fp = find_sha1_filepath(sha)
f.write(fp + ' 1\n')
for sha in test_neg_shas:
fp = find_sha1_filepath(sha)
f.write(fp + ' 0\n')
In [18]:
# Output test and train sets with equal balance, randomly sub-sampling where needed
even_train_size = min([len(train_pos_shas), len(train_neg_shas)])
even_test_size = min([len(test_pos_shas), len(test_neg_shas)])
import random
random.seed(0)
even_train_pos = random.sample(train_pos_shas, even_train_size)
even_train_neg = random.sample(train_neg_shas, even_train_size)
even_test_pos = random.sample(test_pos_shas, even_test_size)
even_test_neg = random.sample(test_neg_shas, even_test_size)
with open('train.image_truth.even.txt', 'w') as f:
for sha in even_train_pos:
fp = find_sha1_filepath(sha)
f.write(fp + ' 1\n')
for sha in even_train_neg:
fp = find_sha1_filepath(sha)
f.write(fp + ' 0\n')
with open('test.image_truth.even.txt', 'w') as f:
for sha in even_test_pos:
fp = find_sha1_filepath(sha)
f.write(fp + ' 1\n')
for sha in even_test_neg:
fp = find_sha1_filepath(sha)
f.write(fp + ' 0\n')
To start model fine-tuning:
/home/purg/dev/caffe/build-master/tools/caffe train -sigint_effect snapshot -solver solver.prototxt -weights <base_model>
If training was already started and you want to resume from a snapshot:
/home/purg/dev/caffe/build-master/tools/caffe train -sigint_effect snapshot -solver solver.prototxt -snapshot <snapshot_file>