In [3]:
import numpy as np
# Circle constants used throughout the notebook: angles are wrapped into
# [0, 2pi] with twopi and mapped to [0, 1] by multiplying with oneOver2Pi.
twopi = 2.*np.pi
oneOver2Pi = 1./twopi
In [4]:
def edgeLenCalc(block):
    """Infer the (maxX, maxY) box edge lengths from one block of rod lines.

    Each entry of ``block`` is a whitespace-separated line of floats where
    columns 5/7 hold x vertex coordinates and columns 6/8 hold y vertex
    coordinates.  The edge length along each axis is twice the largest
    coordinate seen, rounded up to a whole number.

    Returns a (maxX, maxY) pair of floats; (0.0, 0.0) for an empty block.
    """
    maxX = 0.
    maxY = 0.
    for raw in block:
        vals = [float(tok) for tok in raw.split()]
        maxX = max(maxX, vals[5], vals[7])
        maxY = max(maxY, vals[6], vals[8])
    return float(np.ceil(2*maxX)), float(np.ceil(2*maxY))
In [7]:
import time
def time_usage(func):
    """Decorator that prints the wall-clock duration of each call to ``func``.

    The wrapped function's return value is passed through unchanged.
    """
    import functools  # local import keeps this notebook cell self-contained

    # functools.wraps preserves func's __name__/__doc__ on the wrapper;
    # the original wrapper hid them behind "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        beg_ts = time.time()
        retval = func(*args, **kwargs)
        end_ts = time.time()
        print("elapsed time: %f" % (end_ts - beg_ts))
        return retval
    return wrapper
# @time_usage
# def test():
# for i in xrange(0, 10000):
# pass
In [23]:
#
# Processing parameters (run "edgevar", cluster paths)
#
# NOTE(review): this cell and the "XT" parameter cell below assign the same
# variables; whichever cell runs last wins.  Run exactly one of them.
run = "edgevar"
in_dir = "/home/walterms/project/walterms/mcmd/output/"+run+"/"
trn_dir = "/home/walterms/project/walterms/mcmd/nn/fnn/data/"+run+"/train/"
test_dir = "/home/walterms/project/walterms/mcmd/nn/fnn/data/"+run+"/test/"
unlabeled_dir = "/home/walterms/project/walterms/mcmd/nn/fnn/data/"+run+"/unlabeled/"
# For trnfnames, dict of run: label
trnfnames = {"edge40": 1., "edge10": 0., 'edge15.25': 0.}
nblTrn2Test = 300 # How many train blocks channelled to test
nblPerTrnFile = 200 # Number of blocks per training file PER type (used to parse training blocks)
# NOTE(review): "edge16.25" is listed twice below, and "edge15.25" also
# appears in trnfnames above -- confirm both are intentional.
unlabeledfnames = ["edge20", "edge25", "edge30", "edge35", "edge15",
"edge15.25", "edge15.5", "edge15.75", "edge16", "edge16.25",
"edge16.25", "edge16.5", "edge16.75", "edge17", "edge17.5",
"edge18", "edge18.5", "edge19", "edge19.5", "edge21",
"edge22", "edge23"]
In [6]:
#
# Processing parameters (run "XT", local machine paths)
#
# NOTE(review): overrides the variables set by the "edgevar" parameter cell
# above -- run exactly one of the two parameter cells before processing.
run = "XT"
in_dir = "/home/michael/msc/mcmd/output/"+run+"/"
trn_dir = "/home/michael/msc/mcmd/nn/fnn/data/"+run+"/train/"
test_dir = "/home/michael/msc/mcmd/nn/fnn/data/"+run+"/test/"
unlabeled_dir = "/home/michael/msc/mcmd/nn/fnn/data/"+run+"/unlabeled/"
# For trnfnames, dict of run: label
# where label is 0-5 for [D, X, T, U, L, iso] respectively
trnfnames = {"X": 1., "T": 2.}
nblTrn2Test = 300 # How many train blocks channelled to test
nblPerTrnFile = 200 # Number of blocks per training file PER type (used to parse training blocks)
unlabeledfnames = []
Something that might be interesting: compare NN performance when trained on two types of data — one with perfectly balanced training data, and one where each batch contains only one type of configuration.
Also, for runs that have fewer blocks than others, loop over the file and add these blocks to the subsequent training files, to maintain an even ratio of configurations per file.
In [27]:
# Quick inspection of per-run block counts.  NOTE(review): this cell (In[27])
# was executed AFTER the counting cell below (In[26]); it fails on a fresh
# Restart & Run All because nblList is defined later in file order.
nblList
Out[27]:
In [26]:
# Count the blocks in each training file.  Blocks are separated by blank
# lines, so the number of blank lines equals the number of blocks.
nblList = {}    # run name -> number of blocks in that run's file
nblTotal = 0    # total blocks across all training runs
nblMaxRun = 0   # block count of the biggest run
for f in trnfnames:
    # Iterate the file lazily instead of readlines() so the whole
    # trajectory file is never held in memory; `with` guarantees close.
    with open(in_dir+f, 'r') as fin:
        n = sum(1 for line in fin if line == "\n")
    nblList[f] = n
    if n > nblMaxRun:
        nblMaxRun = n
    nblTotal += n
# How many training files?  Integer ceiling of usable blocks / blocks-per-file.
# `//` keeps integer semantics under both Python 2 and 3 (plain `/` would
# yield a float on Python 3 and break the +1 remainder adjustment).
nTrnf = (nblMaxRun - nblTrn2Test) // nblPerTrnFile
if (nblMaxRun - nblTrn2Test) % nblPerTrnFile != 0:
    nTrnf += 1
In [32]:
# Run the training-data export.  NOTE(review): this cell (In[32]) was executed
# after the definition cell below (In[31]); on a fresh run-all, move the call
# after the definition.
processTrain()
In [31]:
@time_usage
def processTrain():
    """Export training / test / unlabeled data files for each labeled run.

    For every run in ``trnfnames`` the raw trajectory file is re-read; each
    rod line is rotated by a per-block multiple of pi/2 (data augmentation),
    shifted into the positive quadrant and normalized to [0, 1].  When a run
    is large enough, blocks 501..(nblTrn2Test+500) are diverted to the test
    and unlabeled directories; everything else (up to ``maxtrn`` blocks)
    goes to the training directory, one 'label %f' line per block.

    Relies on module-level config: trnfnames, nblList, nblTrn2Test,
    nblPerTrnFile, in_dir, trn_dir, test_dir, unlabeled_dir, twopi,
    oneOver2Pi, and the helper edgeLenCalc.
    """
    maxtrn = 5000       # hard cap on training blocks written per run
    nTestOffset = 500   # the first 500 blocks always stay in training
    for f in trnfnames:
        print("processing " + f + " for training data")
        fin = open(in_dir+f, 'r')
        fout = open(trn_dir+f, 'w')
        # Only divert test/unlabeled blocks when the run has enough blocks
        # left over for at least one full training file.
        bAddTest = False
        if nblList[f] > (nblTrn2Test + nblPerTrnFile):
            fout_test = open(test_dir+f, 'w')
            fout_unlbl = open(unlabeled_dir+f, 'w')
            bAddTest = True

        # Calculate edge length from the vertices of the first block.
        block = []
        for line in fin.readlines():
            if line == "\n": break
            if line[0].isalpha(): continue
            block.append(line)
        maxX, maxY = edgeLenCalc(block)
        normX, normY = 1./maxX, 1./maxY  # normalize x and y into [0, 1]
        fin.seek(0)
        # Skip a single alphabetic header line if the file starts with one.
        if not (fin.readline()[0].isalpha()): fin.seek(0)

        def transform(spt, fRot):
            """Rotate one rod by fRot*pi/2, then shift and normalize.

            spt: parsed float columns; spt[2], spt[3], spt[4] are x, y,
            theta.  Thetas are assumed to lie in [0, 2pi] on input.
            """
            th = spt[4] + fRot*np.pi*0.5
            if th > twopi: th -= twopi
            th *= oneOver2Pi
            th_ = fRot*np.pi*0.5
            # Proper 2D rotation.  BUGFIX: the original computed the
            # y-component as cos*y + cos*x instead of sin*x + cos*y.
            x = np.cos(th_)*spt[2] - np.sin(th_)*spt[3]
            y = np.sin(th_)*spt[2] + np.cos(th_)*spt[3]
            # Shift into the positive quadrant and normalize.
            x = normX*(x + maxX/2.)
            y = normY*(y + maxY/2.)
            return x, y, th

        nbl = 0   # blocks completed so far (blank line terminates a block)
        fRot = 0  # rotation factor: 0,1,2,3; multiplied by pi/2
        for line in fin.readlines():
            if (nbl > nTestOffset) and (nbl < nblTrn2Test+nTestOffset) and bAddTest:
                # This window of blocks is diverted to test/unlabeled sets.
                if line == "\n":
                    nbl += 1
                    fRot = (fRot+1) % 4
                    fout_test.write('label %f\n\n' % (trnfnames[f]))
                    fout_unlbl.write("\n")
                    if (nbl == nblTrn2Test+nTestOffset) and bAddTest:
                        fout_test.close()
                        fout_unlbl.close()
                    continue
                spt = [float(x) for x in line.split()]
                x, y, th = transform(spt, fRot)
                fout_test.write('%f %f %f\n' % (x, y, th))
                fout_unlbl.write('%f %f %f\n' % (x, y, th))
            else:
                if line == "\n":
                    nbl += 1
                    fRot = (fRot+1) % 4
                    fout.write('label %f\n\n' % (trnfnames[f]))
                    if nbl > maxtrn: break
                    continue
                spt = [float(x) for x in line.split()]
                x, y, th = transform(spt, fRot)
                fout.write('%f %f %f\n' % (x, y, th))
        # Release all handles even when the test quota was never reached
        # (the original leaked fout_test/fout_unlbl in that case; close()
        # is idempotent, so a second close here is harmless).
        if bAddTest:
            fout_test.close()
            fout_unlbl.close()
        fout.close()
        fin.close()
    print("Done processing training files")
In [48]:
# Keep a snapshot of the unlabeled run names before any later mutation.
# NOTE(review): the original bound a second name to the SAME list object,
# so it was not actually a saved copy; list() makes a shallow copy.
# Confirm the "_s" suffix indeed means "saved".
unlabeledfnames_s = list(unlabeledfnames)
In [13]:
# Export unlabeled runs: the same rotate / shift / normalize transform as
# the training export, but with plain blank-line block separators (no labels).
maxunlbl = 2000  # cap on blocks written per unlabeled run
for f in unlabeledfnames:
    nbl = 0
    print("processing " + f + " for unlabeled data")
    fin = open(in_dir+f, 'r')
    fout = open(unlabeled_dir+f, 'w')
    # Calculate edge length from the vertices of the first block.
    block = []
    for line in fin.readlines():
        if line == "\n": break
        if line[0].isalpha(): continue
        block.append(line)
    fin.seek(0)
    maxX, maxY = edgeLenCalc(block)
    normX, normY = 1./maxX, 1./maxY  # normalize x and y into [0, 1]
    # Skip a single alphabetic header line if the file starts with one.
    if not (fin.readline()[0].isalpha()): fin.seek(0)
    fRot = 0  # rotation factor: 0,1,2,3; multiplied by pi/2 (augmentation)
    for line in fin.readlines():
        if line == "\n":
            fout.write("\n")
            nbl += 1
            fRot = (fRot+1) % 4
            if nbl == maxunlbl: break
            continue
        spt = [float(x) for x in line.split()]
        x, y, th = spt[2], spt[3], spt[4]
        # Rotate block; note thetas are assumed in [0, 2pi] on input.
        th_ = fRot*np.pi*0.5
        th += th_
        if th > twopi: th -= twopi
        # Proper 2D rotation.  BUGFIX: the original computed the y-component
        # as cos*y + cos*x instead of sin*x + cos*y.
        x = np.cos(th_)*spt[2] - np.sin(th_)*spt[3]
        y = np.sin(th_)*spt[2] + np.cos(th_)*spt[3]
        th *= oneOver2Pi
        # Shift into the positive quadrant and normalize.  BUGFIX: the
        # original used '*=' here, multiplying the rotated coordinate by
        # the normalized one; the training export correctly uses '='.
        x = normX*(x + maxX/2.)
        y = normY*(y + maxY/2.)
        fout.write('%f %f %f\n' % (x, y, th))
    fout.close()
    fin.close()
print("Done processing unlabeled data")