In [3]:
import numpy as np
twopi = 2.*np.pi
oneOver2Pi = 1./twopi

In [4]:
def edgeLenCalc(block):
    # Estimate the box edge lengths from one block of coordinate lines.
    # Columns 5-8 appear to hold vertex x/y coordinates; the box is assumed to be
    # centred on the origin, so each edge is (roughly) twice the largest coordinate.
    maxX, maxY = 0., 0.
    for line in block:
        spt = [float(x) for x in line.split()]
        if spt[5] > maxX: maxX = spt[5]
        if spt[7] > maxX: maxX = spt[7]
        if spt[6] > maxY: maxY = spt[6]
        if spt[8] > maxY: maxY = spt[8]
    maxX = float(np.ceil(2*maxX))
    maxY = float(np.ceil(2*maxY))
    return maxX, maxY

In [7]:
import time

def time_usage(func):
    def wrapper(*args, **kwargs):
        beg_ts = time.time()
        retval = func(*args, **kwargs)
        end_ts = time.time()
        print("elapsed time: %f" % (end_ts - beg_ts))
        return retval
    return wrapper

# @time_usage
# def test():
#     for i in range(0, 10000):
#         pass

In [23]:
#
# Processing parameters
#
run = "edgevar"
in_dir = "/home/walterms/project/walterms/mcmd/output/"+run+"/"
trn_dir = "/home/walterms/project/walterms/mcmd/nn/fnn/data/"+run+"/train/"
test_dir = "/home/walterms/project/walterms/mcmd/nn/fnn/data/"+run+"/test/"
unlabeled_dir = "/home/walterms/project/walterms/mcmd/nn/fnn/data/"+run+"/unlabeled/"

# For trnfnames, dict of run: label
trnfnames = {"edge40": 1., "edge10": 0., 'edge15.25': 0.}
nblTrn2Test = 300 # How many train blocks channelled to test
nblPerTrnFile = 200 # Number of blocks per training file PER type (used to parse training blocks)

unlabeledfnames = ["edge20", "edge25", "edge30", "edge35", "edge15", 
                   "edge15.25", "edge15.5", "edge15.75", "edge16", "edge16.25", 
                   "edge16.25", "edge16.5", "edge16.75", "edge17", "edge17.5",
                   "edge18", "edge18.5", "edge19", "edge19.5", "edge21", 
                   "edge22", "edge23"]

In [6]:
#
# Processing parameters
#
run = "XT"
in_dir = "/home/michael/msc/mcmd/output/"+run+"/"
trn_dir = "/home/michael/msc/mcmd/nn/fnn/data/"+run+"/train/"
test_dir = "/home/michael/msc/mcmd/nn/fnn/data/"+run+"/test/"
unlabeled_dir = "/home/michael/msc/mcmd/nn/fnn/data/"+run+"/unlabeled/"

# For trnfnames, dict of run: label
# where label is 0-5 for [D, X, T, U, L, iso] respectively
trnfnames = {"X": 1., "T": 2.}

nblTrn2Test = 300 # How many train blocks channelled to test
nblPerTrnFile = 200 # Number of blocks per training file PER type (used to parse training blocks)

unlabeledfnames = []

Something that might be interesting: compare NN performance when trained on two types of data:
-> one with perfectly balanced training data
-> one where each batch contains only one type of config
A rough sketch of both batching schemes is given below.
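
A minimal sketch of the two batching schemes, assuming the blocks have already been parsed into a hypothetical dict blocks_by_label = {label: [block, ...]}; nothing in this notebook builds that dict, and both function names below are illustrative only.

import itertools
import random

def balanced_batches(blocks_by_label, batch_size):
    # Interleave the labels so every batch holds a roughly equal share of each type
    pools = {lbl: itertools.cycle(blks) for lbl, blks in blocks_by_label.items()}
    labels = sorted(blocks_by_label)
    while True:  # endless generator; stop it from the training loop
        picks = list(itertools.islice(itertools.cycle(labels), batch_size))
        batch = [(lbl, next(pools[lbl])) for lbl in picks]
        random.shuffle(batch)
        yield batch

def single_type_batches(blocks_by_label, batch_size):
    # Every batch is drawn from a single label only
    for lbl, blks in blocks_by_label.items():
        for i in range(0, len(blks), batch_size):
            yield [(lbl, b) for b in blks[i:i+batch_size]]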

Also, for runs that have fewer blocks than others, loop back over the file and add those blocks to the subsequent training files, to maintain an even ratio of configurations per file (see the sketch below).
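
A rough sketch of that wrap-around idea, assuming the blocks of one run are already held in a list; blocks_for_file and edge10_blocks are illustrative names only (the actual processing cells stream lines from disk instead).

def blocks_for_file(blocks, file_idx, nblPerTrnFile):
    # Wrap indices around the run so a short run is re-used from its start
    # instead of leaving later training files short on that configuration type
    start = file_idx * nblPerTrnFile
    return [blocks[(start + i) % len(blocks)] for i in range(nblPerTrnFile)]

# e.g. training file 3 of a short run such as edge10 just revisits earlier blocks:
# chunk = blocks_for_file(edge10_blocks, 3, nblPerTrnFile)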


In [27]:
nblList


Out[27]:
{'edge10': 4140, 'edge15.25': 10001, 'edge40': 9971}

In [26]:
# First count blocks of each file
nblList = {}
nblTotal = 0
nblMaxRun = 0 # nbl of biggest file
for f in trnfnames:
    fin = open(in_dir+f, 'r')
    n = 0
    for line in fin.readlines():
        if line == "\n":
            n+=1
    nblList.update({f: n})
    if n > nblMaxRun: nblMaxRun = n
    nblTotal+=n
    fin.close()

# How many training files? (integer count, rounded up)
nTrnf = (nblMaxRun - nblTrn2Test) // nblPerTrnFile
if (nblMaxRun - nblTrn2Test) % nblPerTrnFile != 0: nTrnf += 1

In [32]:
processTrain()


processing edge10 for training data
processing edge15.25 for training data
processing edge40 for training data
Done processing training files
elapsed time: 76.426756

In [31]:
@time_usage
def processTrain():
    maxtrn = 5000
    for f in trnfnames:
        print "processing " + f + " for training data"
        fin = open(in_dir+f,'r')
        fout = open(trn_dir+f,'w')

        bAddTest = False
        if nblList[f] > (nblTrn2Test + nblPerTrnFile):
            fout_test = open(test_dir+f, 'w')
            fout_unlbl = open(unlabeled_dir+f,'w')        
            bAddTest = True
        # calculate edge length based on vertices of first block
        block = []
        for line in fin.readlines():
            if line == "\n": break
            if line[0].isalpha(): continue
            block.append(line)

        maxX, maxY = edgeLenCalc(block)
        normX, normY = 1./maxX, 1./maxY # normalize x and y
        fin.seek(0)
        if not (fin.readline()[0].isalpha()): fin.seek(0)  # skip a possible text header line; rewind if the first line is already data

        nbl = 0
        fRot = 0 # rotation factor: 0,1,2,3. Multiplied by pi/2
        for line in fin.readlines():
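            # Divert roughly nblTrn2Test blocks (those after the first 500) of each
            # run to the test/unlabeled files; everything else goes to training.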
            if (nbl>500) and (nbl < nblTrn2Test+500) and bAddTest:
                if line == "\n":
                    nbl+=1
                    fRot = (fRot+1)%4
                    fout_test.write('label %f\n\n' % (trnfnames[f]))
                    fout_unlbl.write("\n")
                    if (nbl == nblTrn2Test+500) and bAddTest:
                        fout_test.close()
                        fout_unlbl.close()
                    continue
                spt = [float(x) for x in line.split()]
                x,y,th = spt[2],spt[3],spt[4]
                # Rotate block
                # note thetas should be [0,2pi] initially
                th_ = fRot*np.pi*0.5
                th += th_
                if th > twopi: th-=twopi
                th *= oneOver2Pi
                # standard 2D rotation of the centre coordinates by th_
                x = np.cos(th_)*spt[2] - np.sin(th_)*spt[3]
                y = np.sin(th_)*spt[2] + np.cos(th_)*spt[3]
                # shift and normalize
                x = normX*(x+maxX/2.)
                y = normY*(y+maxY/2.)
                fout_test.write('%f %f %f\n' % (x, y, th))
                fout_unlbl.write('%f %f %f\n' % (x, y, th))
            else:
                if line == "\n":
                    nbl+=1
                    fRot = (fRot+1)%4
                    fout.write('label %f\n\n' % (trnfnames[f]))
                    if nbl > maxtrn: break
                    continue
                spt = [float(x) for x in line.split()]
                x,y,th = spt[2],spt[3],spt[4]
                # Rotate block
                # note thetas should be [0,2pi]
                th_ = fRot*np.pi*0.5
                th += th_
                if th > twopi: th-=twopi
                th *= oneOver2Pi
                # standard 2D rotation of the centre coordinates by th_
                x = np.cos(th_)*spt[2] - np.sin(th_)*spt[3]
                y = np.sin(th_)*spt[2] + np.cos(th_)*spt[3]
                # shift and normalize
                x = normX*(x+maxX/2.)
                y = normY*(y+maxY/2.)
                # write to file
                fout.write('%f %f %f\n' % (x, y, th))
        fout.close()
        fin.close()
    print "Done processing training files"

In [48]:
unlabeledfnames_s = unlabeledfnames  # note: this binds a reference, not a copy

In [13]:
maxunlbl = 2000
for f in unlabeledfnames:
    nbl = 0
    print "processing " + f + " for unlabeled data"
    fin = open(in_dir+f, 'r')
    fout = open(unlabeled_dir+f,'w')
    
    # calculate edge length based on vertices of first block
    block = []
    for line in fin.readlines():
        if line == "\n": break
        if line[0].isalpha(): continue
        block.append(line)
    fin.seek(0)
    maxX, maxY = edgeLenCalc(block)
    normX, normY = 1./maxX, 1./maxY # normalize x and y

    if not (fin.readline()[0].isalpha()): fin.seek(0)  # skip a possible text header line; rewind if the first line is already data
    fRot = 0
    for line in fin.readlines():
        if line == "\n":
            fout.write("\n")
            nbl+=1
            fRot = (fRot+1)%4
            if nbl==maxunlbl: break
            continue
        spt = [float(x) for x in line.split()]
        x,y,th = spt[2],spt[3],spt[4]
        # Rotate block
        # note thetas should be [0,2pi]
        th_ = fRot*np.pi*0.5
        th += th_
        if th > twopi: th-=twopi
        # standard 2D rotation of the centre coordinates by th_
        x = np.cos(th_)*spt[2] - np.sin(th_)*spt[3]
        y = np.sin(th_)*spt[2] + np.cos(th_)*spt[3]
        th *= oneOver2Pi
        # shift and normalize
        x = normX*(x+maxX/2.)
        y = normY*(y+maxY/2.)
        fout.write('%f %f %f\n' % (x, y, th))
    fout.close()
    fin.close()
print "Done processing unlabeled data"


processing edge20 for unlabeled data
processing edge25 for unlabeled data
processing edge30 for unlabeled data
processing edge35 for unlabeled data
processing edge15 for unlabeled data
processing edge15.25 for unlabeled data
processing edge15.5 for unlabeled data
processing edge15.75 for unlabeled data
processing edge16 for unlabeled data
processing edge16.25 for unlabeled data
processing edge16.25 for unlabeled data
processing edge16.5 for unlabeled data
processing edge16.75 for unlabeled data
processing edge17 for unlabeled data
processing edge17.5 for unlabeled data
processing edge18 for unlabeled data
processing edge18.5 for unlabeled data
processing edge19 for unlabeled data
processing edge19.5 for unlabeled data
processing edge21 for unlabeled data
processing edge22 for unlabeled data
processing edge23 for unlabeled data
Done processing unlabeled data