In [1]:
import numpy as np
import random
# Angular constants shared by the processing cells:
# a full turn in radians, and the factor that maps an angle in [0, 2*pi] onto [0, 1].
twopi = np.pi * 2.0
oneOver2Pi = 1.0 / twopi

In [2]:
import time

def time_usage(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    The wrapped function is called with the arguments it was given and its
    return value is passed through unchanged.  After a successful call one
    line of the form ``elapsed time: <seconds>`` is printed.  If ``func``
    raises, the exception propagates and nothing is printed.

    Parameters
    ----------
    func : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper with the same name, docstring and call signature behaviour
        as ``func``.
    """
    # Local import so the decorator works without touching the import cell.
    import functools

    # functools.wraps keeps __name__/__doc__ of the decorated function, so
    # help() and tracebacks still refer to the real function, not "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        beg_ts = time.time()
        retval = func(*args, **kwargs)
        end_ts = time.time()
        print("elapsed time: %f" % (end_ts - beg_ts))
        return retval
    return wrapper

In [3]:
#
# Processing parameters
#
# Name of the run; it selects the sub-directory of the output tree read as input.
run = "bigbox"
# Input directory (raw run files) and the three output directories written by
# the processing cells. All are hard-coded absolute paths on the author's machine.
in_dir = "/home/walterms/project/walterms/mcmd/output/"+run+"/"
trn_dir = "/home/walterms/project/walterms/mcmd/nn/data/train/"
test_dir = "/home/walterms/project/walterms/mcmd/nn/data/test/"
unlabeled_dir = "/home/walterms/project/walterms/mcmd/nn/data/unlbl/"

# For trnfnames, dict of run: label
# [iso, D, T, X, U, L]   (presumably the label classes -- TODO confirm)
# NOTE(review): the assignment below is commented out, but the block-counting
# cell and processTrain() both iterate over trnfnames, so a fresh kernel
# raises NameError until it is defined.
# trnfnames = {"edge15.00": 1, "edge30.00": 0}
# File names (relative to in_dir) processed by the unlabeled-data cell.
unlblfnames = ["bigbox1", "bigbox2"]

# nbl parameters ("nbl" = number of blocks; a block ends at a blank line)
# Use -1 to mean all
nblSkip = 1 # Skip first few images
# NOTE(review): this -1 is used directly in arithmetic by the block-counting
# cell and by processTrain(); neither special-cases it.
nblPerTrnFile = -1
nblTrn2Test = 500 # How many train blocks channelled to test
# Number of blocks written to the unlabeled set; the unlabeled-processing cell
# reassigns this to 5000 before it runs.
nblUnlbl = 1000

In [4]:
# First count blocks of each file.
#
# Reads every file named in `trnfnames` from `in_dir` and records, per file,
# how many blocks it contains (a block is terminated by a line that is exactly
# "\n").  Results:
#   nblList   -- dict: file name -> number of blocks
#   nblTotal  -- sum of blocks over all files
#   nblMaxRun -- number of blocks in the biggest file
#   nTrnf     -- number of training files needed for the biggest run
#
# NOTE: `trnfnames` must already be defined (its assignment in the parameters
# cell is commented out); otherwise this cell raises NameError.
nblList = {}
nblTotal = 0
nblMaxRun = 0 # nbl of biggest file
for f in trnfnames:
    n = 0
    # `with` closes the file even if reading fails; iterating the handle
    # avoids loading the whole file into memory just to count blank lines.
    with open(in_dir+f, 'r') as fin:
        for line in fin:
            if line == "\n":
                n+=1
    nblList[f] = n
    if n > nblMaxRun: nblMaxRun = n
    nblTotal+=n

# How many training files?
# Blocks left for training once the test share has been removed (never negative).
nblTrnAvail = max(nblMaxRun - nblTrn2Test, 0)
if nblPerTrnFile <= 0:
    # -1 means "all blocks in one file": dividing by it would give a negative
    # count, so treat any non-positive value as a single file.
    nTrnf = 1 if nblTrnAvail > 0 else 0
else:
    # Ceiling division; `//` keeps the result an int on Python 2 and 3 alike.
    nTrnf = -(-nblTrnAvail // nblPerTrnFile)

print(nblList)



NameErrorTraceback (most recent call last)
<ipython-input-4-72adaf421bef> in <module>()
      3 nblTotal = 0
      4 nblMaxRun = 0 # nbl of biggest file
----> 5 for f in trnfnames:
      6     fin = open(in_dir+f, 'r')
      7     n = 0

NameError: name 'trnfnames' is not defined

In [11]:
# Run the train/test/unlabeled split.  processTrain is defined in a later cell
# of this listing and reads trnfnames and nblList, so those must already exist
# in the kernel when this cell is executed.
processTrain()


processing edge30.00 for training data
processing edge15.00 for training data
Done processing training files

In [5]:
def _readBoxSize(fin):
    """Return (width, height) of the box described by the open run file ``fin``.

    The first line is split on "|" and searched for a "boxEdge <value>" field;
    when found, height is set equal to width.  If no non-zero value is found,
    the size comes from edgeLenCalc() applied to the raw lines of the first
    block (lines starting with a letter are left out).

    On return ``fin`` is positioned at the first data line: just past the
    first line when that line starts with a letter, otherwise at offset 0.
    """
    width = 0.
    for field in fin.readline().split("|"):
        if "boxEdge" in field:
            width = float(field.split()[1])
    height = width
    fin.seek(0)

    if width == 0.:
        # calculate edge length based on vertices of first block
        block = []
        # readline() instead of readlines(): only the first block is needed.
        for line in iter(fin.readline, ""):
            if line == "\n": break
            if line[0].isalpha(): continue
            block.append(line)
        fin.seek(0)
        width, height = edgeLenCalc(block)

    # Skip the header line if there is one.  Slicing ([:1]) instead of
    # indexing keeps an empty file from raising IndexError here.
    if not fin.readline()[:1].isalpha(): fin.seek(0)
    return width, height


def _writeBlock(fh, block):
    """Write every [x, y, theta] entry of ``block`` to ``fh``, one per line."""
    for p in block:
        fh.write('%f %f %f\n' % (p[0], p[1], p[2]))


def processTrain():
    """Split every labelled run file into training, test and unlabeled files.

    For each file name ``f`` in ``trnfnames`` the file ``in_dir + f`` is read
    block by block (a block ends at a line that is exactly "\\n").  Each data
    line contributes columns 2, 3 and 4 as x, y and theta.  Every block is
    rotated about the origin by a random multiple of pi/2, x and y are scaled
    by 1/width and 1/height, and theta by 1/(2*pi).

    Routing by block index ``nbl`` (0-based):
      * nbl < nblSkip                         -> discarded
      * 500 <= nbl < 500 + nblTrn2Test        -> test_dir file (with label)
      * 500 <= nbl < 500 + nblUnlbl           -> unlabeled_dir file (no label)
      * nbl < 500, or nbl > both upper bounds -> trn_dir file (with label)
    Test/unlabeled files are only produced when
    nblList[f] > nblTrn2Test + nblPerTrnFile.  Processing of a file stops
    after the first block whose index exceeds ``maxtrn``.

    Reads the module-level names trnfnames, nblList, in_dir, trn_dir, test_dir,
    unlabeled_dir, nblSkip, nblTrn2Test, nblUnlbl, nblPerTrnFile, twopi and
    oneOver2Pi.  Returns None; all results are written to disk.
    """
    maxtrn = 10000
    holdoutStart = 500                     # first block index that may be diverted
    testEnd = holdoutStart + nblTrn2Test   # one past the last test block
    unlblEnd = holdoutStart + nblUnlbl     # one past the last unlabeled block

    for f in trnfnames:
        print("processing " + f + " for training data")

        outfname = f
        label = trnfnames[f]
        bAddTest = nblList[f] > (nblTrn2Test + nblPerTrnFile)

        # All handles are closed in `finally`, so nothing leaks when a file is
        # shorter than the holdout range, when the loop breaks early, or when
        # a malformed line raises.
        fin = fout = fout_test = fout_unlbl = None
        try:
            fin = open(in_dir+f,'r')
            fout = open(trn_dir+outfname,'w')
            if bAddTest:
                fout_test = open(test_dir+outfname, 'w')
                fout_unlbl = open(unlabeled_dir+outfname,'w')

            # find width from file header (or from the first block)
            width, height = _readBoxSize(fin)

            thNorm = oneOver2Pi
            normX, normY = 1./width, 1./height # normalize x and y

            nbl = 0
            fRot = 0. # rotation factor: 0,1,2,3. Multiplied by pi/2
            block = []
            for line in iter(fin.readline, ""):
                if line == "\n":
                    # Done a block.  The rotation drawn here applies to the
                    # NEXT block; it is drawn even for skipped blocks.
                    fRot = random.randint(0,3)
                    if nbl < nblSkip: # skip the first few imgs
                        nbl += 1
                        continue

                    # Channel some images to the test and unlabeled sets
                    if bAddTest and holdoutStart <= nbl < testEnd:
                        _writeBlock(fout_test, block)
                        fout_test.write('label %f\n\n' % label)
                    if bAddTest and holdoutStart <= nbl < unlblEnd:
                        _writeBlock(fout_unlbl, block)
                        fout_unlbl.write("\n")

                    # NOTE(review): the block whose index equals the larger of
                    # testEnd/unlblEnd is written nowhere, and when bAddTest is
                    # False blocks in the holdout range are dropped as well.
                    # Kept as in the original -- confirm whether intended.
                    if nbl < holdoutStart or (nbl > testEnd and nbl > unlblEnd):
                        _writeBlock(fout, block)
                        fout.write('label %f\n\n' % label)

                    if nbl > maxtrn: break
                    block = []
                    nbl += 1
                    continue

                if nbl < nblSkip: continue
                spt = [float(v) for v in line.split()]
                x0, y0, th = spt[2], spt[3], spt[4]
                # Rotate block
                # note thetas should be [0,2pi] initially
                th_ = fRot*twopi*0.25
                th += th_
                if th > twopi: th -= twopi
                th *= thNorm

                cosR, sinR = np.cos(th_), np.sin(th_)
                # rotate about the origin, then normalize by the box size
                x = (cosR*x0 - sinR*y0) * normX
                y = (sinR*x0 + cosR*y0) * normY

                block.append([x,y,th])
        finally:
            for fh in (fin, fout, fout_test, fout_unlbl):
                if fh is not None:
                    fh.close()
    print("Done processing training files")

In [11]:
# Build the list of unlabeled file names from the edge list file.
# Each non-blank line of the file is one edge value (kept as a string in
# `edges`); the matching run file is named "edge" + that value.
# Note: this replaces the `unlblfnames` set in the parameters cell.
edges = []
unlblfnames = []
# `with` guarantees the handle is closed (the original never closed it).
with open("/home/walterms/mcmd/nn/edgelist","r") as edgefile:
    for e in edgefile:
        edge = e.strip()
        if not edge:
            # a blank line would otherwise produce the bogus file name "edge"
            continue
        edges.append(edge)
        unlblfnames.append("edge"+edge)

In [12]:
# Show the file names built from the edge list.
unlblfnames


Out[12]:
['edge30.00',
 'edge29.12',
 'edge28.32',
 'edge27.57',
 'edge26.89',
 'edge26.25',
 'edge25.66',
 'edge25.10',
 'edge24.58',
 'edge24.09',
 'edge23.63',
 'edge23.19',
 'edge22.78',
 'edge22.39',
 'edge22.01',
 'edge21.66',
 'edge21.32',
 'edge21.00',
 'edge20.69',
 'edge20.40',
 'edge20.11',
 'edge19.84',
 'edge19.58',
 'edge19.33',
 'edge19.09',
 'edge18.86',
 'edge18.63',
 'edge18.42',
 'edge18.21',
 'edge18.01',
 'edge17.81',
 'edge17.62',
 'edge17.44',
 'edge17.26',
 'edge17.09',
 'edge16.92',
 'edge16.76',
 'edge16.60',
 'edge16.45',
 'edge16.30',
 'edge16.15',
 'edge16.01',
 'edge15.87',
 'edge15.74',
 'edge15.61',
 'edge15.48',
 'edge15.36',
 'edge15.24',
 'edge15.12',
 'edge15.00']

In [5]:
# Write the unlabeled data set.
#
# For every name in `unlblfnames` the file in_dir+name is read block by block
# (a block ends at a line that is exactly "\n") and at most `nblUnlbl` blocks
# are written to unlabeled_dir+name.  Each output line is "x y theta ID":
# columns 2 and 3 rotated about the origin by a random multiple of pi/2 and
# scaled by 1/width and 1/height, column 4 (theta) rotated likewise and scaled
# by 1/(2*pi), and column 0 as the ID.  The first block of a file is written
# unrotated because the rotation is drawn at the end of the preceding block.
nblUnlbl = 5000

for f in unlblfnames:
    nbl = 0
    print("processing " + f + " for unlabeled data")
    # `with` closes both files even if a malformed line raises.
    with open(in_dir+f, 'r') as fin, open(unlabeled_dir+f, 'w') as fout:
        # find width from file header
        width, height = 0., 0.
        for ll in fin.readline().split("|"):
            if "boxEdge" in ll:
                width = float(ll.split()[1])
        height = width
        fin.seek(0)

        if width == 0.:
            # calculate edge length based on vertices of first block
            block = []
            for line in iter(fin.readline, ""):
                if line == "\n": break
                if line[0].isalpha(): continue
                block.append(line)
            fin.seek(0)
            # Fixed typo: the result used to be stored in `xheight`, leaving
            # height at 0 and making 1./height below raise ZeroDivisionError.
            width, height = edgeLenCalc(block)

        # Skip the header line if present; [:1] avoids IndexError on an empty file.
        if not fin.readline()[:1].isalpha(): fin.seek(0)

        normX, normY = 1./width, 1./height # normalize x and y
        thNorm = oneOver2Pi

        fRot = 0
        # -1 means "all blocks of this file".  Use a per-file limit instead of
        # overwriting nblUnlbl, which made only the first file get "all".
        nblLimit = nblList[f] if nblUnlbl == -1 else nblUnlbl
        for line in iter(fin.readline, ""):
            if line == "\n":
                fout.write("\n")
                nbl+=1
                fRot = random.randint(0,3)
                # nbl now counts completed blocks; `>=` stops after exactly
                # nblLimit blocks (the old `>` wrote one block too many).
                if nbl >= nblLimit:
                    break
                continue
            spt = [float(v) for v in line.split()]
            x0, y0, th, ID = spt[2], spt[3], spt[4], spt[0]
            # Rotate block
            # note thetas should be [0,2pi]
            th_ = fRot*twopi*0.25
            th += th_
            if th > twopi: th-=twopi
            th *= thNorm

            cosR, sinR = np.cos(th_), np.sin(th_)
            x = (cosR*x0 - sinR*y0) * normX
            y = (sinR*x0 + cosR*y0) * normY
            fout.write('%f %f %f %f\n' % (x, y, th, ID))
print("Done")


processing bigbox1 for unlabeled data
processing bigbox2 for unlabeled data
Done