In [18]:
import os
import subprocess
import my_io
import threading
NDSB_DIR = '/media/raid_arr/data/ndsb/config'
# TRAIN_SCRIPT = os.path.join(NDSB_DIR, 'train_pl.sh')
# RESUME_SCRIPT = os.path.join(NDSB_DIR, 'resume_training_pl.sh')
SOLVER = os.path.join(NDSB_DIR, 'solver.prototxt')
NET = os.path.join(NDSB_DIR, 'train_val.prototxt')
CAFFE = '/afs/ee.cooper.edu/user/t/a/tam8/documents/caffe/build/tools/caffe'
MODELS_DIR = '/media/raid_arr/data/ndsb/models'
snapshot_prefix = 'alexmod_bruteaug_fold0_iter_'
MAX_ITER = 100000 # global max (not per step)
STEP = 100
In [19]:
def write_max_iter_to_solver(max_iter, f_path=SOLVER):
with open(f_path, 'r') as f:
# read a list of lines into data
f_data = f.readlines()
# Change the line with the PL loss weight
line_n = 8 # The line number we're going to replace
new_line = 'max_iter: ' + str(max_iter) + '\n'
f_data[line_n] = new_line
# and write everything back
with open(f_path, 'w') as f:
f.writelines(f_data)
return f_path
In [20]:
snap_name = lambda n_iter: os.path.join(MODELS_DIR,
snapshot_prefix + str(n_iter) + '.solverstate')
call_start = lambda sol=SOLVER: subprocess.call(
[CAFFE, 'train', '--solver=' + sol])
call_resume = lambda snap, sol=SOLVER: subprocess.call(
[CAFFE, 'train', '--solver=' + sol, '--snapshot=' + snap])
In [22]:
last_saved_iter = max({int(os.path.splitext(f)[0].rsplit('_', 1)[1])
for f in next(os.walk(MODELS_DIR))[2]}.union({0}))
for ii in range(last_saved_iter, MAX_ITER+1, STEP):
write_max_iter_to_solver(ii + STEP) # make caffe stop at the next step
print 'ITER:\t', ii
subprocess.call(['rm', '-rf', '/dev/shm/train0_aug_lvl'])
subprocess.call(['cp', '-rf',
'/media/raid_arr/tmp/train0_aug_lvl',
'/dev/shm/train0_aug_lvl'])
# Start augmenting in another thread while caffe runs
aug_thread = threading.Thread(target=my_io.create_aug_lvl)
aug_thread.start()
if ii == 0:
print 'Starting new train'
call_start()
subprocess.call(['cp', '/tmp/caffe.INFO',
'/tmp/my_caffe_log.txt'])
else:
call_resume(snap_name(ii))
subprocess.call(['cat /tmp/caffe.INFO >> /tmp/my_caffe_log.txt'], shell=True)
print 'DONE'
ITER: 400
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 23.3951058388 sec
ITER: 500
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 31.0545511246 sec
ITER: 600
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.6482019424 sec
ITER: 700
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.3256220818 sec
ITER: 800
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 22.3219180107 sec
ITER: 900
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 20.3777880669 sec
ITER: 1000
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.5033640862 sec
ITER: 1100
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.4882810116 sec
ITER: 1200
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 20.5696358681 sec
ITER: 1300
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.5119621754 sec
ITER: 1400
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.4803161621 sec
ITER: 1500
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 22.8650701046 sec
ITER: 1600
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 22.8902490139 sec
ITER: 1700
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.2877149582 sec
ITER: 1800
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.519190073 sec
ITER: 1900
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.2639980316 sec
ITER: 2000
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.2895431519 sec
ITER: 2100
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.1144919395 sec
ITER: 2200
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.0925729275 sec
ITER: 2300
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.2371640205 sec
ITER: 2400
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.35729599 sec
ITER: 2500
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 32.4463670254 sec
ITER: 2600
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.6218709946 sec
ITER: 2700
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.2813549042 sec
ITER: 2800
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 22.3609278202 sec
ITER: 2900
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.2305321693 sec
ITER: 3000
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 20.8340039253 sec
ITER: 3100
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.6067421436 sec
ITER: 3200
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.650067091 sec
ITER: 3300
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 21.7929639816 sec
ITER: 3400
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.1101200581 sec
ITER: 3500
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 32.6959319115 sec
ITER: 3600
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.6990809441 sec
ITER: 3700
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 21.8797810078 sec
ITER: 3800
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.5075337887 sec
ITER: 3900
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 15.3633520603 sec
ITER: 4000
Processed: 0
Processed: 5000
Processed: 10000
Processed: 15000
Processed: 20000
Conversion of images from /dev/shm/train0_lmdb to /media/raid_arr/tmp/train0_aug_lvl took 21.4040081501 sec
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-22-c1286cac557a> in <module>()
20 '/tmp/my_caffe_log.txt'])
21 else:
---> 22 call_resume(snap_name(ii))
23 subprocess.call(['cat /tmp/caffe.INFO >> /tmp/my_caffe_log.txt'], shell=True)
24 print 'DONE'
<ipython-input-20-9ce95f5e1a4b> in <lambda>(snap, sol)
5 [CAFFE, 'train', '--solver=' + sol])
6 call_resume = lambda snap, sol=SOLVER: subprocess.call(
----> 7 [CAFFE, 'train', '--solver=' + sol, '--snapshot=' + snap])
/usr/lib/python2.7/subprocess.pyc in call(*popenargs, **kwargs)
520 retcode = call(["ls", "-l"])
521 """
--> 522 return Popen(*popenargs, **kwargs).wait()
523
524
/usr/lib/python2.7/subprocess.pyc in wait(self)
1374 while self.returncode is None:
1375 try:
-> 1376 pid, sts = _eintr_retry_call(os.waitpid, self.pid, 0)
1377 except OSError as e:
1378 if e.errno != errno.ECHILD:
/usr/lib/python2.7/subprocess.pyc in _eintr_retry_call(func, *args)
474 while True:
475 try:
--> 476 return func(*args)
477 except (OSError, IOError) as e:
478 if e.errno == errno.EINTR:
KeyboardInterrupt:
In [9]:
range(last_saved_iter, MAX_ITER+1, STEP)
Out[9]:
[0,
100,
200,
300,
400,
500,
600,
700,
800,
900,
1000,
1100,
1200,
1300,
1400,
1500,
1600,
1700,
1800,
1900,
2000,
2100,
2200,
2300,
2400,
2500,
2600,
2700,
2800,
2900,
3000,
3100,
3200,
3300,
3400,
3500,
3600,
3700,
3800,
3900,
4000,
4100,
4200,
4300,
4400,
4500,
4600,
4700,
4800,
4900,
5000,
5100,
5200,
5300,
5400,
5500,
5600,
5700,
5800,
5900,
6000,
6100,
6200,
6300,
6400,
6500,
6600,
6700,
6800,
6900,
7000,
7100,
7200,
7300,
7400,
7500,
7600,
7700,
7800,
7900,
8000,
8100,
8200,
8300,
8400,
8500,
8600,
8700,
8800,
8900,
9000,
9100,
9200,
9300,
9400,
9500,
9600,
9700,
9800,
9900,
10000,
10100,
10200,
10300,
10400,
10500,
10600,
10700,
10800,
10900,
11000,
11100,
11200,
11300,
11400,
11500,
11600,
11700,
11800,
11900,
12000,
12100,
12200,
12300,
12400,
12500,
12600,
12700,
12800,
12900,
13000,
13100,
13200,
13300,
13400,
13500,
13600,
13700,
13800,
13900,
14000,
14100,
14200,
14300,
14400,
14500,
14600,
14700,
14800,
14900,
15000,
15100,
15200,
15300,
15400,
15500,
15600,
15700,
15800,
15900,
16000,
16100,
16200,
16300,
16400,
16500,
16600,
16700,
16800,
16900,
17000,
17100,
17200,
17300,
17400,
17500,
17600,
17700,
17800,
17900,
18000,
18100,
18200,
18300,
18400,
18500,
18600,
18700,
18800,
18900,
19000,
19100,
19200,
19300,
19400,
19500,
19600,
19700,
19800,
19900,
20000,
20100,
20200,
20300,
20400,
20500,
20600,
20700,
20800,
20900,
21000,
21100,
21200,
21300,
21400,
21500,
21600,
21700,
21800,
21900,
22000,
22100,
22200,
22300,
22400,
22500,
22600,
22700,
22800,
22900,
23000,
23100,
23200,
23300,
23400,
23500,
23600,
23700,
23800,
23900,
24000,
24100,
24200,
24300,
24400,
24500,
24600,
24700,
24800,
24900,
25000,
25100,
25200,
25300,
25400,
25500,
25600,
25700,
25800,
25900,
26000,
26100,
26200,
26300,
26400,
26500,
26600,
26700,
26800,
26900,
27000,
27100,
27200,
27300,
27400,
27500,
27600,
27700,
27800,
27900,
28000,
28100,
28200,
28300,
28400,
28500,
28600,
28700,
28800,
28900,
29000,
29100,
29200,
29300,
29400,
29500,
29600,
29700,
29800,
29900,
30000,
30100,
30200,
30300,
30400,
30500,
30600,
30700,
30800,
30900,
31000,
31100,
31200,
31300,
31400,
31500,
31600,
31700,
31800,
31900,
32000,
32100,
32200,
32300,
32400,
32500,
32600,
32700,
32800,
32900,
33000,
33100,
33200,
33300,
33400,
33500,
33600,
33700,
33800,
33900,
34000,
34100,
34200,
34300,
34400,
34500,
34600,
34700,
34800,
34900,
35000,
35100,
35200,
35300,
35400,
35500,
35600,
35700,
35800,
35900,
36000,
36100,
36200,
36300,
36400,
36500,
36600,
36700,
36800,
36900,
37000,
37100,
37200,
37300,
37400,
37500,
37600,
37700,
37800,
37900,
38000,
38100,
38200,
38300,
38400,
38500,
38600,
38700,
38800,
38900,
39000,
39100,
39200,
39300,
39400,
39500,
39600,
39700,
39800,
39900,
40000,
40100,
40200,
40300,
40400,
40500,
40600,
40700,
40800,
40900,
41000,
41100,
41200,
41300,
41400,
41500,
41600,
41700,
41800,
41900,
42000,
42100,
42200,
42300,
42400,
42500,
42600,
42700,
42800,
42900,
43000,
43100,
43200,
43300,
43400,
43500,
43600,
43700,
43800,
43900,
44000,
44100,
44200,
44300,
44400,
44500,
44600,
44700,
44800,
44900,
45000,
45100,
45200,
45300,
45400,
45500,
45600,
45700,
45800,
45900,
46000,
46100,
46200,
46300,
46400,
46500,
46600,
46700,
46800,
46900,
47000,
47100,
47200,
47300,
47400,
47500,
47600,
47700,
47800,
47900,
48000,
48100,
48200,
48300,
48400,
48500,
48600,
48700,
48800,
48900,
49000,
49100,
49200,
49300,
49400,
49500,
49600,
49700,
49800,
49900,
50000,
50100,
50200,
50300,
50400,
50500,
50600,
50700,
50800,
50900,
51000,
51100,
51200,
51300,
51400,
51500,
51600,
51700,
51800,
51900,
52000,
52100,
52200,
52300,
52400,
52500,
52600,
52700,
52800,
52900,
53000,
53100,
53200,
53300,
53400,
53500,
53600,
53700,
53800,
53900,
54000,
54100,
54200,
54300,
54400,
54500,
54600,
54700,
54800,
54900,
55000,
55100,
55200,
55300,
55400,
55500,
55600,
55700,
55800,
55900,
56000,
56100,
56200,
56300,
56400,
56500,
56600,
56700,
56800,
56900,
57000,
57100,
57200,
57300,
57400,
57500,
57600,
57700,
57800,
57900,
58000,
58100,
58200,
58300,
58400,
58500,
58600,
58700,
58800,
58900,
59000,
59100,
59200,
59300,
59400,
59500,
59600,
59700,
59800,
59900,
60000,
60100,
60200,
60300,
60400,
60500,
60600,
60700,
60800,
60900,
61000,
61100,
61200,
61300,
61400,
61500,
61600,
61700,
61800,
61900,
62000,
62100,
62200,
62300,
62400,
62500,
62600,
62700,
62800,
62900,
63000,
63100,
63200,
63300,
63400,
63500,
63600,
63700,
63800,
63900,
64000,
64100,
64200,
64300,
64400,
64500,
64600,
64700,
64800,
64900,
65000,
65100,
65200,
65300,
65400,
65500,
65600,
65700,
65800,
65900,
66000,
66100,
66200,
66300,
66400,
66500,
66600,
66700,
66800,
66900,
67000,
67100,
67200,
67300,
67400,
67500,
67600,
67700,
67800,
67900,
68000,
68100,
68200,
68300,
68400,
68500,
68600,
68700,
68800,
68900,
69000,
69100,
69200,
69300,
69400,
69500,
69600,
69700,
69800,
69900,
70000,
70100,
70200,
70300,
70400,
70500,
70600,
70700,
70800,
70900,
71000,
71100,
71200,
71300,
71400,
71500,
71600,
71700,
71800,
71900,
72000,
72100,
72200,
72300,
72400,
72500,
72600,
72700,
72800,
72900,
73000,
73100,
73200,
73300,
73400,
73500,
73600,
73700,
73800,
73900,
74000,
74100,
74200,
74300,
74400,
74500,
74600,
74700,
74800,
74900,
75000,
75100,
75200,
75300,
75400,
75500,
75600,
75700,
75800,
75900,
76000,
76100,
76200,
76300,
76400,
76500,
76600,
76700,
76800,
76900,
77000,
77100,
77200,
77300,
77400,
77500,
77600,
77700,
77800,
77900,
78000,
78100,
78200,
78300,
78400,
78500,
78600,
78700,
78800,
78900,
79000,
79100,
79200,
79300,
79400,
79500,
79600,
79700,
79800,
79900,
80000,
80100,
80200,
80300,
80400,
80500,
80600,
80700,
80800,
80900,
81000,
81100,
81200,
81300,
81400,
81500,
81600,
81700,
81800,
81900,
82000,
82100,
82200,
82300,
82400,
82500,
82600,
82700,
82800,
82900,
83000,
83100,
83200,
83300,
83400,
83500,
83600,
83700,
83800,
83900,
84000,
84100,
84200,
84300,
84400,
84500,
84600,
84700,
84800,
84900,
85000,
85100,
85200,
85300,
85400,
85500,
85600,
85700,
85800,
85900,
86000,
86100,
86200,
86300,
86400,
86500,
86600,
86700,
86800,
86900,
87000,
87100,
87200,
87300,
87400,
87500,
87600,
87700,
87800,
87900,
88000,
88100,
88200,
88300,
88400,
88500,
88600,
88700,
88800,
88900,
89000,
89100,
89200,
89300,
89400,
89500,
89600,
89700,
89800,
89900,
90000,
90100,
90200,
90300,
90400,
90500,
90600,
90700,
90800,
90900,
91000,
91100,
91200,
91300,
91400,
91500,
91600,
91700,
91800,
91900,
92000,
92100,
92200,
92300,
92400,
92500,
92600,
92700,
92800,
92900,
93000,
93100,
93200,
93300,
93400,
93500,
93600,
93700,
93800,
93900,
94000,
94100,
94200,
94300,
94400,
94500,
94600,
94700,
94800,
94900,
95000,
95100,
95200,
95300,
95400,
95500,
95600,
95700,
95800,
95900,
96000,
96100,
96200,
96300,
96400,
96500,
96600,
96700,
96800,
96900,
97000,
97100,
97200,
97300,
97400,
97500,
97600,
97700,
97800,
97900,
98000,
98100,
98200,
98300,
98400,
98500,
98600,
98700,
98800,
98900,
99000,
99100,
99200,
99300,
99400,
99500,
99600,
99700,
99800,
99900,
...]
Content source: JasonTam/ndsb2015
Similar notebooks: