In [1]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import os
import sys
import tarfile
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves import cPickle as pickle
#import pickle
import h5py
import skimage
from skimage.io import imread, imsave
from skimage.transform import resize, rotate

%matplotlib inline


/home/josh/anaconda2/envs/tensorflow/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

preprocessing train and test data


In [3]:
# Archive names for the training and testing splits.
train_files, test_files = 'train.tar.gz', 'test.tar.gz'

# Strip '.gz' and then '.tar' from each archive name; the remainder is used
# below as the name of the extracted folder.
train_root, test_root = (
    os.path.splitext(os.path.splitext(archive)[0])[0]
    for archive in (train_files, test_files)
)

In [4]:
# Archive holding the additional ("extra") split.
extra_files = 'extra.tar.gz'

# Two passes of splitext: the first drops '.gz', the second drops '.tar'.
extra_root = os.path.splitext(extra_files)[0]
extra_root = os.path.splitext(extra_root)[0]

decompress training data to folder 'train' and testing data to folder 'test'


In [3]:
# Unpack the training and testing archives into the current working directory.
# NOTE(review): extractall() writes whatever member paths the archive holds,
# so only run this on archives obtained from a trusted source.
for archive in (train_files, test_files):
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(archive) as tar:
        tar.extractall()

In [3]:
# Unpack the extra archive into the current working directory.
# NOTE(review): extractall() writes whatever member paths the archive holds,
# so only run this on archives obtained from a trusted source.
with tarfile.open(extra_files) as tar:  # closed even if extraction fails
    tar.extractall()

In [4]:
# '<folder>/<entry>' paths for everything inside each extracted folder.
train_list = ['{}/{}'.format(train_root, entry) for entry in os.listdir(train_root)]
test_list = ['{}/{}'.format(test_root, entry) for entry in os.listdir(test_root)]

In [5]:
# '<folder>/<entry>' paths for everything inside the extracted extra folder.
extra_list = ['{}/{}'.format(extra_root, entry) for entry in os.listdir(extra_root)]

info_file is 'digit_struct.mat'


In [5]:
# Pick out the MATLAB annotation file(s) by extension; endswith() avoids
# matching paths that merely contain '.mat' somewhere in the middle.
train_info = [s for s in train_list if s.endswith('.mat')]
test_info = [s for s in test_list if s.endswith('.mat')]

# Open read-only. The handles are left open on purpose: the metadata
# extraction cells further down read from them.
train_f = h5py.File(train_info[0], 'r')
test_f = h5py.File(test_info[0], 'r')

# List the top-level entries of the training annotation file.
for name in train_f:
    print(name)


#refs#
digitStruct

In [7]:
# Pick out the MATLAB annotation file(s) by extension; endswith() avoids
# matching paths that merely contain '.mat' somewhere in the middle.
extra_info = [s for s in extra_list if s.endswith('.mat')]

# Open read-only; left open because the extraction cell below reads from it.
extra_f = h5py.File(extra_info[0], 'r')

# List the top-level entries of the extra annotation file.
for name in extra_f:
    print(name)

# Peek at the first bounding-box row (displayed as the cell output).
extra_f['/digitStruct/bbox'][0]


#refs#
digitStruct
Out[7]:
array([<HDF5 object reference>], dtype=object)

extract information from metadata


In [10]:
class metadataExtractor(object):
    """Collect per-image bounding-box fields from an opened annotation file.

    ``f`` must support ``f['/digitStruct/bbox']`` (an iterable of rows whose
    first element is a reference) and ``f[reference]`` lookups, as the
    ``h5py.File`` objects opened in this notebook do.

    The result is a dict with the keys 'height', 'label', 'left', 'top' and
    'width'; each key maps to a list that holds one list of values per image.
    """

    _FIELDS = ('height', 'label', 'left', 'top', 'width')

    def __init__(self, f):
        # One accumulator list per field; extract() appends to them.
        self._metadata = dict((field, []) for field in self._FIELDS)
        self._f = f

    def _print_attrs(self, name, obj):
        """Callback for ``visititems``: record the values stored under ``name``.

        When ``obj`` has a single row the value is read directly from
        ``obj[0][0]``. Otherwise every row holds a reference that is looked up
        in the file, and the ``[0][0]`` element of the target is taken.
        Returns None.
        """
        if obj.shape[0] == 1:
            vals = [obj[0][0]]
        else:
            vals = [self._f[obj[k][0]][0][0] for k in range(obj.shape[0])]
        self._metadata[name].append(vals)

    def extract(self, report_every=200):
        """Visit every bbox entry and return the accumulated metadata dict.

        Args:
            report_every: print a progress line each time this many entries
                have been processed; a falsy value disables the reporting.

        Returns:
            The dict described in the class docstring.
        """
        for count, item in enumerate(self._f['/digitStruct/bbox'], 1):
            self._f[item[0]].visititems(self._print_attrs)
            # Compare by value: identity ('is') on integers only works by
            # accident of small-integer caching.
            if report_every and count % report_every == 0:
                print('extract {}th iterm'.format(count))

        return self._metadata

In [7]:
# Walk both annotation files; each call returns a dict of per-image lists.
train_metadata = metadataExtractor(train_f).extract()
print('Train metadata extracted')
test_metadata = metadataExtractor(test_f).extract()
print('Test metadata extracted')


Train metadata extracted
Test metadata extracted

In [11]:
# Walk the extra annotation file; returns a dict of per-image lists.
extra_metadata = metadataExtractor(extra_f).extract()
print('Extra metadata extracted')


extract 200th iterm
extract 400th iterm
extract 600th iterm
extract 800th iterm
extract 1000th iterm
extract 1200th iterm
extract 1400th iterm
extract 1600th iterm
extract 1800th iterm
extract 2000th iterm
extract 2200th iterm
extract 2400th iterm
extract 2600th iterm
extract 2800th iterm
extract 3000th iterm
extract 3200th iterm
extract 3400th iterm
extract 3600th iterm
extract 3800th iterm
extract 4000th iterm
extract 4200th iterm
extract 4400th iterm
extract 4600th iterm
extract 4800th iterm
extract 5000th iterm
extract 5200th iterm
extract 5400th iterm
extract 5600th iterm
extract 5800th iterm
extract 6000th iterm
extract 6200th iterm
extract 6400th iterm
extract 6600th iterm
extract 6800th iterm
extract 7000th iterm
extract 7200th iterm
extract 7400th iterm
extract 7600th iterm
extract 7800th iterm
extract 8000th iterm
extract 8200th iterm
extract 8400th iterm
extract 8600th iterm
extract 8800th iterm
extract 9000th iterm
extract 9200th iterm
extract 9400th iterm
extract 9600th iterm
extract 9800th iterm
extract 10000th iterm
extract 10200th iterm
extract 10400th iterm
extract 10600th iterm
extract 10800th iterm
extract 11000th iterm
extract 11200th iterm
extract 11400th iterm
extract 11600th iterm
extract 11800th iterm
extract 12000th iterm
extract 12200th iterm
extract 12400th iterm
extract 12600th iterm
extract 12800th iterm
extract 13000th iterm
extract 13200th iterm
extract 13400th iterm
extract 13600th iterm
extract 13800th iterm
extract 14000th iterm
extract 14200th iterm
extract 14400th iterm
extract 14600th iterm
extract 14800th iterm
extract 15000th iterm
extract 15200th iterm
extract 15400th iterm
extract 15600th iterm
extract 15800th iterm
extract 16000th iterm
extract 16200th iterm
extract 16400th iterm
extract 16600th iterm
extract 16800th iterm
extract 17000th iterm
extract 17200th iterm
extract 17400th iterm
extract 17600th iterm
extract 17800th iterm
extract 18000th iterm
extract 18200th iterm
extract 18400th iterm
extract 18600th iterm
extract 18800th iterm
extract 19000th iterm
extract 19200th iterm
extract 19400th iterm
extract 19600th iterm
extract 19800th iterm
extract 20000th iterm
extract 20200th iterm
extract 20400th iterm
extract 20600th iterm
extract 20800th iterm
extract 21000th iterm
extract 21200th iterm
extract 21400th iterm
extract 21600th iterm
extract 21800th iterm
extract 22000th iterm
extract 22200th iterm
extract 22400th iterm
extract 22600th iterm
extract 22800th iterm
extract 23000th iterm
extract 23200th iterm
extract 23400th iterm
extract 23600th iterm
extract 23800th iterm
extract 24000th iterm
extract 24200th iterm
extract 24400th iterm
extract 24600th iterm
extract 24800th iterm
extract 25000th iterm
extract 25200th iterm
extract 25400th iterm
extract 25600th iterm
extract 25800th iterm
extract 26000th iterm
extract 26200th iterm
extract 26400th iterm
extract 26600th iterm
extract 26800th iterm
extract 27000th iterm
extract 27200th iterm
extract 27400th iterm
extract 27600th iterm
extract 27800th iterm
extract 28000th iterm
extract 28200th iterm
extract 28400th iterm
extract 28600th iterm
extract 28800th iterm
extract 29000th iterm
extract 29200th iterm
extract 29400th iterm
extract 29600th iterm
extract 29800th iterm
extract 30000th iterm
extract 30200th iterm
extract 30400th iterm
extract 30600th iterm
extract 30800th iterm
extract 31000th iterm
extract 31200th iterm
extract 31400th iterm
extract 31600th iterm
extract 31800th iterm
extract 32000th iterm
extract 32200th iterm
extract 32400th iterm
extract 32600th iterm
extract 32800th iterm
extract 33000th iterm
extract 33200th iterm
extract 33400th iterm
extract 33600th iterm
extract 33800th iterm
extract 34000th iterm
extract 34200th iterm
extract 34400th iterm
extract 34600th iterm
extract 34800th iterm
extract 35000th iterm
extract 35200th iterm
extract 35400th iterm
extract 35600th iterm
extract 35800th iterm
extract 36000th iterm
extract 36200th iterm
extract 36400th iterm
extract 36600th iterm
extract 36800th iterm
extract 37000th iterm
extract 37200th iterm
extract 37400th iterm
extract 37600th iterm
extract 37800th iterm
extract 38000th iterm
extract 38200th iterm
extract 38400th iterm
extract 38600th iterm
extract 38800th iterm
extract 39000th iterm
extract 39200th iterm
extract 39400th iterm
extract 39600th iterm
extract 39800th iterm
extract 40000th iterm
extract 40200th iterm
extract 40400th iterm
extract 40600th iterm
extract 40800th iterm
extract 41000th iterm
extract 41200th iterm
extract 41400th iterm
extract 41600th iterm
extract 41800th iterm
extract 42000th iterm
extract 42200th iterm
extract 42400th iterm
extract 42600th iterm
extract 42800th iterm
extract 43000th iterm
extract 43200th iterm
extract 43400th iterm
extract 43600th iterm
extract 43800th iterm
extract 44000th iterm
extract 44200th iterm
extract 44400th iterm
extract 44600th iterm
extract 44800th iterm
extract 45000th iterm
extract 45200th iterm
extract 45400th iterm
extract 45600th iterm
extract 45800th iterm
extract 46000th iterm
extract 46200th iterm
extract 46400th iterm
extract 46600th iterm
extract 46800th iterm
extract 47000th iterm
extract 47200th iterm
extract 47400th iterm
extract 47600th iterm
extract 47800th iterm
extract 48000th iterm
extract 48200th iterm
extract 48400th iterm
extract 48600th iterm
extract 48800th iterm
extract 49000th iterm
extract 49200th iterm
extract 49400th iterm
extract 49600th iterm
extract 49800th iterm
extract 50000th iterm
extract 50200th iterm
extract 50400th iterm
extract 50600th iterm
extract 50800th iterm
extract 51000th iterm
extract 51200th iterm
extract 51400th iterm
extract 51600th iterm
extract 51800th iterm
extract 52000th iterm
extract 52200th iterm
extract 52400th iterm
extract 52600th iterm
extract 52800th iterm
extract 53000th iterm
extract 53200th iterm
extract 53400th iterm
extract 53600th iterm
extract 53800th iterm
extract 54000th iterm
extract 54200th iterm
extract 54400th iterm
extract 54600th iterm
extract 54800th iterm
extract 55000th iterm
extract 55200th iterm
extract 55400th iterm
extract 55600th iterm
extract 55800th iterm
extract 56000th iterm
extract 56200th iterm
extract 56400th iterm
extract 56600th iterm
extract 56800th iterm
extract 57000th iterm
extract 57200th iterm
extract 57400th iterm
extract 57600th iterm
extract 57800th iterm
extract 58000th iterm
extract 58200th iterm
extract 58400th iterm
extract 58600th iterm
extract 58800th iterm
extract 59000th iterm
extract 59200th iterm
extract 59400th iterm
extract 59600th iterm
extract 59800th iterm
extract 60000th iterm
extract 60200th iterm
extract 60400th iterm
extract 60600th iterm
extract 60800th iterm
extract 61000th iterm
extract 61200th iterm
extract 61400th iterm
extract 61600th iterm
extract 61800th iterm
extract 62000th iterm
extract 62200th iterm
extract 62400th iterm
extract 62600th iterm
extract 62800th iterm
extract 63000th iterm
extract 63200th iterm
extract 63400th iterm
extract 63600th iterm
extract 63800th iterm
extract 64000th iterm
extract 64200th iterm
extract 64400th iterm
extract 64600th iterm
extract 64800th iterm
extract 65000th iterm
extract 65200th iterm
extract 65400th iterm
extract 65600th iterm
extract 65800th iterm
extract 66000th iterm
extract 66200th iterm
extract 66400th iterm
extract 66600th iterm
extract 66800th iterm
extract 67000th iterm
extract 67200th iterm
extract 67400th iterm
extract 67600th iterm
extract 67800th iterm
extract 68000th iterm
extract 68200th iterm
extract 68400th iterm
extract 68600th iterm
extract 68800th iterm
extract 69000th iterm
extract 69200th iterm
extract 69400th iterm
extract 69600th iterm
extract 69800th iterm
extract 70000th iterm
extract 70200th iterm
extract 70400th iterm
extract 70600th iterm
extract 70800th iterm
extract 71000th iterm
extract 71200th iterm
extract 71400th iterm
extract 71600th iterm
extract 71800th iterm
extract 72000th iterm
extract 72200th iterm
extract 72400th iterm
extract 72600th iterm
extract 72800th iterm
extract 73000th iterm
extract 73200th iterm
extract 73400th iterm
extract 73600th iterm
extract 73800th iterm
extract 74000th iterm
extract 74200th iterm
extract 74400th iterm
extract 74600th iterm
extract 74800th iterm
extract 75000th iterm
extract 75200th iterm
extract 75400th iterm
extract 75600th iterm
extract 75800th iterm
extract 76000th iterm
extract 76200th iterm
extract 76400th iterm
extract 76600th iterm
extract 76800th iterm
extract 77000th iterm
extract 77200th iterm
extract 77400th iterm
extract 77600th iterm
extract 77800th iterm
extract 78000th iterm
extract 78200th iterm
extract 78400th iterm
extract 78600th iterm
extract 78800th iterm
extract 79000th iterm
extract 79200th iterm
extract 79400th iterm
extract 79600th iterm
extract 79800th iterm
extract 80000th iterm
extract 80200th iterm
extract 80400th iterm
extract 80600th iterm
extract 80800th iterm
extract 81000th iterm
extract 81200th iterm
extract 81400th iterm
extract 81600th iterm
extract 81800th iterm
extract 82000th iterm
extract 82200th iterm
extract 82400th iterm
extract 82600th iterm
extract 82800th iterm
extract 83000th iterm
extract 83200th iterm
extract 83400th iterm
extract 83600th iterm
extract 83800th iterm
extract 84000th iterm
extract 84200th iterm
extract 84400th iterm
extract 84600th iterm
extract 84800th iterm
extract 85000th iterm
extract 85200th iterm
extract 85400th iterm
extract 85600th iterm
extract 85800th iterm
extract 86000th iterm
extract 86200th iterm
extract 86400th iterm
extract 86600th iterm
extract 86800th iterm
extract 87000th iterm
extract 87200th iterm
extract 87400th iterm
extract 87600th iterm
extract 87800th iterm
extract 88000th iterm
extract 88200th iterm
extract 88400th iterm
extract 88600th iterm
extract 88800th iterm
extract 89000th iterm
extract 89200th iterm
extract 89400th iterm
extract 89600th iterm
extract 89800th iterm
extract 90000th iterm
extract 90200th iterm
extract 90400th iterm
extract 90600th iterm
extract 90800th iterm
extract 91000th iterm
extract 91200th iterm
extract 91400th iterm
extract 91600th iterm
extract 91800th iterm
extract 92000th iterm
extract 92200th iterm
extract 92400th iterm
extract 92600th iterm
extract 92800th iterm
extract 93000th iterm
extract 93200th iterm
extract 93400th iterm
extract 93600th iterm
extract 93800th iterm
extract 94000th iterm
extract 94200th iterm
extract 94400th iterm
extract 94600th iterm
extract 94800th iterm
extract 95000th iterm
extract 95200th iterm
extract 95400th iterm
extract 95600th iterm
extract 95800th iterm
extract 96000th iterm
extract 96200th iterm
extract 96400th iterm
extract 96600th iterm
extract 96800th iterm
extract 97000th iterm
extract 97200th iterm
extract 97400th iterm
extract 97600th iterm
extract 97800th iterm
extract 98000th iterm
extract 98200th iterm
extract 98400th iterm
extract 98600th iterm
extract 98800th iterm
extract 99000th iterm
extract 99200th iterm
extract 99400th iterm
extract 99600th iterm
extract 99800th iterm
extract 100000th iterm
extract 100200th iterm
extract 100400th iterm
extract 100600th iterm
extract 100800th iterm
extract 101000th iterm
extract 101200th iterm
extract 101400th iterm
extract 101600th iterm
extract 101800th iterm
extract 102000th iterm
extract 102200th iterm
extract 102400th iterm
extract 102600th iterm
extract 102800th iterm
extract 103000th iterm
extract 103200th iterm
extract 103400th iterm
extract 103600th iterm
extract 103800th iterm
extract 104000th iterm
extract 104200th iterm
extract 104400th iterm
extract 104600th iterm
extract 104800th iterm
extract 105000th iterm
extract 105200th iterm
extract 105400th iterm
extract 105600th iterm
extract 105800th iterm
extract 106000th iterm
extract 106200th iterm
extract 106400th iterm
extract 106600th iterm
extract 106800th iterm
extract 107000th iterm
extract 107200th iterm
extract 107400th iterm
extract 107600th iterm
extract 107800th iterm
extract 108000th iterm
extract 108200th iterm
extract 108400th iterm
extract 108600th iterm
extract 108800th iterm
extract 109000th iterm
extract 109200th iterm
extract 109400th iterm
extract 109600th iterm
extract 109800th iterm
extract 110000th iterm
extract 110200th iterm
extract 110400th iterm
extract 110600th iterm
extract 110800th iterm
extract 111000th iterm
extract 111200th iterm
extract 111400th iterm
extract 111600th iterm
extract 111800th iterm
extract 112000th iterm
extract 112200th iterm
extract 112400th iterm
extract 112600th iterm
extract 112800th iterm
extract 113000th iterm
extract 113200th iterm
extract 113400th iterm
extract 113600th iterm
extract 113800th iterm
extract 114000th iterm
extract 114200th iterm
extract 114400th iterm
extract 114600th iterm
extract 114800th iterm
extract 115000th iterm
extract 115200th iterm
extract 115400th iterm
extract 115600th iterm
extract 115800th iterm
extract 116000th iterm
extract 116200th iterm
extract 116400th iterm
extract 116600th iterm
extract 116800th iterm
extract 117000th iterm
extract 117200th iterm
extract 117400th iterm
extract 117600th iterm
extract 117800th iterm
extract 118000th iterm
extract 118200th iterm
extract 118400th iterm
extract 118600th iterm
extract 118800th iterm
extract 119000th iterm
extract 119200th iterm
extract 119400th iterm
extract 119600th iterm
extract 119800th iterm
extract 120000th iterm
extract 120200th iterm
extract 120400th iterm
extract 120600th iterm
extract 120800th iterm
extract 121000th iterm
extract 121200th iterm
extract 121400th iterm
extract 121600th iterm
extract 121800th iterm
extract 122000th iterm
extract 122200th iterm
extract 122400th iterm
extract 122600th iterm
extract 122800th iterm
extract 123000th iterm
extract 123200th iterm
extract 123400th iterm
extract 123600th iterm
extract 123800th iterm
extract 124000th iterm
extract 124200th iterm
extract 124400th iterm
extract 124600th iterm
extract 124800th iterm
extract 125000th iterm
extract 125200th iterm
extract 125400th iterm
extract 125600th iterm
extract 125800th iterm
extract 126000th iterm
extract 126200th iterm
extract 126400th iterm
extract 126600th iterm
extract 126800th iterm
extract 127000th iterm
extract 127200th iterm
extract 127400th iterm
extract 127600th iterm
extract 127800th iterm
extract 128000th iterm
extract 128200th iterm
extract 128400th iterm
extract 128600th iterm
extract 128800th iterm
extract 129000th iterm
extract 129200th iterm
extract 129400th iterm
extract 129600th iterm
extract 129800th iterm
extract 130000th iterm
extract 130200th iterm
extract 130400th iterm
extract 130600th iterm
extract 130800th iterm
extract 131000th iterm
extract 131200th iterm
extract 131400th iterm
extract 131600th iterm
extract 131800th iterm
extract 132000th iterm
extract 132200th iterm
extract 132400th iterm
extract 132600th iterm
extract 132800th iterm
extract 133000th iterm
extract 133200th iterm
extract 133400th iterm
extract 133600th iterm
extract 133800th iterm
extract 134000th iterm
extract 134200th iterm
extract 134400th iterm
extract 134600th iterm
extract 134800th iterm
extract 135000th iterm
extract 135200th iterm
extract 135400th iterm
extract 135600th iterm
extract 135800th iterm
extract 136000th iterm
extract 136200th iterm
extract 136400th iterm
extract 136600th iterm
extract 136800th iterm
extract 137000th iterm
extract 137200th iterm
extract 137400th iterm
extract 137600th iterm
extract 137800th iterm
extract 138000th iterm
extract 138200th iterm
extract 138400th iterm
extract 138600th iterm
extract 138800th iterm
extract 139000th iterm
extract 139200th iterm
extract 139400th iterm
extract 139600th iterm
extract 139800th iterm
extract 140000th iterm
extract 140200th iterm
extract 140400th iterm
extract 140600th iterm
extract 140800th iterm
extract 141000th iterm
extract 141200th iterm
extract 141400th iterm
extract 141600th iterm
extract 141800th iterm
extract 142000th iterm
extract 142200th iterm
extract 142400th iterm
extract 142600th iterm
extract 142800th iterm
extract 143000th iterm
extract 143200th iterm
extract 143400th iterm
extract 143600th iterm
extract 143800th iterm
extract 144000th iterm
extract 144200th iterm
extract 144400th iterm
extract 144600th iterm
extract 144800th iterm
extract 145000th iterm
extract 145200th iterm
extract 145400th iterm
extract 145600th iterm
extract 145800th iterm
extract 146000th iterm
extract 146200th iterm
extract 146400th iterm
extract 146600th iterm
extract 146800th iterm
extract 147000th iterm
extract 147200th iterm
extract 147400th iterm
extract 147600th iterm
extract 147800th iterm
extract 148000th iterm
extract 148200th iterm
extract 148400th iterm
extract 148600th iterm
extract 148800th iterm
extract 149000th iterm
extract 149200th iterm
extract 149400th iterm
extract 149600th iterm
extract 149800th iterm
extract 150000th iterm
extract 150200th iterm
extract 150400th iterm
extract 150600th iterm
extract 150800th iterm
extract 151000th iterm
extract 151200th iterm
extract 151400th iterm
extract 151600th iterm
extract 151800th iterm
extract 152000th iterm
extract 152200th iterm
extract 152400th iterm
extract 152600th iterm
extract 152800th iterm
extract 153000th iterm
extract 153200th iterm
extract 153400th iterm
extract 153600th iterm
extract 153800th iterm
extract 154000th iterm
extract 154200th iterm
extract 154400th iterm
extract 154600th iterm
extract 154800th iterm
extract 155000th iterm
extract 155200th iterm
extract 155400th iterm
extract 155600th iterm
extract 155800th iterm
extract 156000th iterm
extract 156200th iterm
extract 156400th iterm
extract 156600th iterm
extract 156800th iterm
extract 157000th iterm
extract 157200th iterm
extract 157400th iterm
extract 157600th iterm
extract 157800th iterm
extract 158000th iterm
extract 158200th iterm
extract 158400th iterm
extract 158600th iterm
extract 158800th iterm
extract 159000th iterm
extract 159200th iterm
extract 159400th iterm
extract 159600th iterm
extract 159800th iterm
extract 160000th iterm
extract 160200th iterm
extract 160400th iterm
extract 160600th iterm
extract 160800th iterm
extract 161000th iterm
extract 161200th iterm
extract 161400th iterm
extract 161600th iterm
extract 161800th iterm
extract 162000th iterm
extract 162200th iterm
extract 162400th iterm
extract 162600th iterm
extract 162800th iterm
extract 163000th iterm
extract 163200th iterm
extract 163400th iterm
extract 163600th iterm
extract 163800th iterm
extract 164000th iterm
extract 164200th iterm
extract 164400th iterm
extract 164600th iterm
extract 164800th iterm
extract 165000th iterm
extract 165200th iterm
extract 165400th iterm
extract 165600th iterm
extract 165800th iterm
extract 166000th iterm
extract 166200th iterm
extract 166400th iterm
extract 166600th iterm
extract 166800th iterm
extract 167000th iterm
extract 167200th iterm
extract 167400th iterm
extract 167600th iterm
extract 167800th iterm
extract 168000th iterm
extract 168200th iterm
extract 168400th iterm
extract 168600th iterm
extract 168800th iterm
extract 169000th iterm
extract 169200th iterm
extract 169400th iterm
extract 169600th iterm
extract 169800th iterm
extract 170000th iterm
extract 170200th iterm
extract 170400th iterm
extract 170600th iterm
extract 170800th iterm
extract 171000th iterm
extract 171200th iterm
extract 171400th iterm
extract 171600th iterm
extract 171800th iterm
extract 172000th iterm
extract 172200th iterm
extract 172400th iterm
extract 172600th iterm
extract 172800th iterm
extract 173000th iterm
extract 173200th iterm
extract 173400th iterm
extract 173600th iterm
extract 173800th iterm
extract 174000th iterm
extract 174200th iterm
extract 174400th iterm
extract 174600th iterm
extract 174800th iterm
extract 175000th iterm
extract 175200th iterm
extract 175400th iterm
extract 175600th iterm
extract 175800th iterm
extract 176000th iterm
extract 176200th iterm
extract 176400th iterm
extract 176600th iterm
extract 176800th iterm
extract 177000th iterm
extract 177200th iterm
extract 177400th iterm
extract 177600th iterm
extract 177800th iterm
extract 178000th iterm
extract 178200th iterm
extract 178400th iterm
extract 178600th iterm
extract 178800th iterm
extract 179000th iterm
extract 179200th iterm
extract 179400th iterm
extract 179600th iterm
extract 179800th iterm
extract 180000th iterm
extract 180200th iterm
extract 180400th iterm
extract 180600th iterm
extract 180800th iterm
extract 181000th iterm
extract 181200th iterm
extract 181400th iterm
extract 181600th iterm
extract 181800th iterm
extract 182000th iterm
extract 182200th iterm
extract 182400th iterm
extract 182600th iterm
extract 182800th iterm
extract 183000th iterm
extract 183200th iterm
extract 183400th iterm
extract 183600th iterm
extract 183800th iterm
extract 184000th iterm
extract 184200th iterm
extract 184400th iterm
extract 184600th iterm
extract 184800th iterm
extract 185000th iterm
extract 185200th iterm
extract 185400th iterm
extract 185600th iterm
extract 185800th iterm
extract 186000th iterm
extract 186200th iterm
extract 186400th iterm
extract 186600th iterm
extract 186800th iterm
extract 187000th iterm
extract 187200th iterm
extract 187400th iterm
extract 187600th iterm
extract 187800th iterm
extract 188000th iterm
extract 188200th iterm
extract 188400th iterm
extract 188600th iterm
extract 188800th iterm
extract 189000th iterm
extract 189200th iterm
extract 189400th iterm
extract 189600th iterm
extract 189800th iterm
extract 190000th iterm
extract 190200th iterm
extract 190400th iterm
extract 190600th iterm
extract 190800th iterm
extract 191000th iterm
extract 191200th iterm
extract 191400th iterm
extract 191600th iterm
extract 191800th iterm
extract 192000th iterm
extract 192200th iterm
extract 192400th iterm
extract 192600th iterm
extract 192800th iterm
extract 193000th iterm
extract 193200th iterm
extract 193400th iterm
extract 193600th iterm
extract 193800th iterm
extract 194000th iterm
extract 194200th iterm
extract 194400th iterm
extract 194600th iterm
extract 194800th iterm
extract 195000th iterm
extract 195200th iterm
extract 195400th iterm
extract 195600th iterm
extract 195800th iterm
extract 196000th iterm
extract 196200th iterm
extract 196400th iterm
extract 196600th iterm
extract 196800th iterm
extract 197000th iterm
extract 197200th iterm
extract 197400th iterm
extract 197600th iterm
extract 197800th iterm
extract 198000th iterm
extract 198200th iterm
extract 198400th iterm
extract 198600th iterm
extract 198800th iterm
extract 199000th iterm
extract 199200th iterm
extract 199400th iterm
extract 199600th iterm
extract 199800th iterm
extract 200000th iterm
extract 200200th iterm
extract 200400th iterm
extract 200600th iterm
extract 200800th iterm
extract 201000th iterm
extract 201200th iterm
extract 201400th iterm
extract 201600th iterm
extract 201800th iterm
extract 202000th iterm
extract 202200th iterm
Extra metadata extracted

In [8]:
# Field names gathered for each training image.
train_metadata.keys()


Out[8]:
['width', 'top', 'label', 'left', 'height']

pickle the extracted metadata


In [9]:
# Save the train and test metadata to disk, one pickle file per split.
for pickle_file, metadata, split in (('train_metadata.pickle', train_metadata, 'Train'),
                                     ('test_metadata.pickle', test_metadata, 'Test')):
    try:
        # 'with' closes the file even when pickle.dump raises.
        with open(pickle_file, 'wb') as pickleData:
            pickle.dump(metadata, pickleData, pickle.HIGHEST_PROTOCOL)
        print('{} metadata pickled'.format(split))
    except Exception as e:
        # Report which file failed, then let the error stop the cell.
        print('Unable to save data to {} : {}'.format(pickle_file, e))
        raise


Train metadata pickled
Test metadata pickled

In [12]:
# Save the extra-split metadata to disk.
pickle_file = 'extra_metadata.pickle'
try:
    # 'with' closes the file even when pickle.dump raises.
    with open(pickle_file, 'wb') as pickleData:
        pickle.dump(extra_metadata, pickleData, pickle.HIGHEST_PROTOCOL)
    print('Extra metadata pickled')
except Exception as e:
    # Report which file failed, then let the error stop the cell.
    print('Unable to save data to {} : {}'.format(pickle_file, e))
    raise


Extra metadata pickled

In [10]:
# Load one training image for inspection.
img = imread('train/1.png')

In [11]:
# Display columns 200-499 of the image (all rows, all channels).
plt.imshow(img[:,200:500,:])
plt.show()



In [12]:
# Smallest value after shifting the image by 128 and dividing by 128.
np.min((img-128.0)/128.0)


Out[12]:
-1.0

read metadata from pickle file


In [2]:
# Reload the training metadata from its pickle file.
# NOTE(review): unpickling can run code stored in the file; only load pickle
# files that this notebook produced.
pickle_file = 'train_metadata.pickle'
train_metadata = {}
with open(pickle_file, 'rb') as f:
    # Bound directly to its final name, so there is no temporary to delete.
    train_metadata = pickle.load(f)

# Same for the testing metadata.
pickle_file = 'test_metadata.pickle'
test_metadata = {}
with open(pickle_file, 'rb') as f:
    test_metadata = pickle.load(f)

In [2]:
# Reload the extra-split metadata from its pickle file.
# NOTE(review): unpickling can run code stored in the file; only load pickle
# files that this notebook produced.
pickle_file = 'extra_metadata.pickle'
extra_metadata = {}
with open(pickle_file, 'rb') as f:
    # Bound directly to its final name, so there is no temporary to delete.
    extra_metadata = pickle.load(f)

In [3]:
# Label lists of the first two training images.
train_metadata['label'][:2]


Out[3]:
[[1.0, 9.0], [2.0, 3.0]]

Resize image and change corresponding metadata


In [4]:
def resize_and_boxes(img, new_size=[32,32], top=[5,5], left=[5,5], height=[30,30],width=[30,30]):
    """Resize ``img`` to ``new_size`` and rescale its boxes to match.

    Args:
        img: image array; ``img.shape[0]`` and ``img.shape[1]`` are used as
            the original number of rows and columns.
        new_size: ``[rows, cols]`` of the resized image.
        top, left, height, width: per-box values, in pixels of ``img``.

    Returns:
        ``(img_resize, new_box)``. ``new_box`` is a dict with the keys 'top',
        'left', 'height' and 'width', each a list in pixels of the resized
        image. Boxes are grown outward to whole pixels: the near edge is
        floored and the extent is the ceiling of far edge minus floored edge.
    """
    def _rescale(start, extent, factor):
        # Scale one axis of the boxes, then snap outward to whole pixels.
        near = factor * np.asarray(start)
        far = near + factor * np.asarray(extent)
        near = np.floor(near)
        return near, np.ceil(far - near)

    img_resize = skimage.transform.resize(img, new_size)

    # NOTE: applying (img_resize - 128.0) / 128.0 here was tried and training
    # accuracy stopped improving. Presumably resize() already returns floats
    # on a much smaller scale than 0..255, which would make that expression
    # nearly constant -- TODO confirm.

    rows_ratio = 1.0 * new_size[0] / img.shape[0]
    cols_ratio = 1.0 * new_size[1] / img.shape[1]

    box_top, box_height = _rescale(top, height, rows_ratio)
    box_left, box_width = _rescale(left, width, cols_ratio)

    new_box = dict(top=box_top.tolist(), left=box_left.tolist(),
                   height=box_height.tolist(), width=box_width.tolist())

    return img_resize, new_box

In [5]:
def crop_and_resize(img, new_size=[32,32], top=[5,5], left=[5,5], height=[30,30],width=[30,30], margin =0.1 ):
    """Crop the region enclosing all boxes (plus a margin) and resize it.

    Args:
        img: image array indexed as ``img[row, col, channel]``.
        new_size: ``[rows, cols]`` of the resized crop.
        top, left, height, width: per-box values, in pixels of ``img``.
        margin: fraction of the enclosing region's height/width that is added
            on each side before cropping.

    Returns:
        ``(img_resize, new_box)``. ``new_box`` describes a single box that
        spans the whole resized crop: 'top' and 'left' are ``[0]``, 'height'
        is ``[new_size[0] - 1]`` and 'width' is ``[new_size[1] - 1]``.
    """
    tops = np.asarray(top)
    lefts = np.asarray(left)

    # Smallest rectangle that contains every box.
    new_top = np.min(tops)
    new_bottom = np.max(tops + np.asarray(height))
    new_left = np.min(lefts)
    new_right = np.max(lefts + np.asarray(width))

    # Pad each side by a fraction of the rectangle's size, then truncate to
    # integer pixel indices.
    v_margin = margin * (new_bottom - new_top)
    new_top = int(new_top - v_margin)
    new_bottom = int(new_bottom + v_margin)

    h_margin = margin * (new_right - new_left)
    new_left = int(new_left - h_margin)
    new_right = int(new_right + h_margin)

    # Clamp to the image so the padded rectangle never indexes outside it.
    cropped = img[max(new_top, 0):min(new_bottom, img.shape[0]),
                  max(new_left, 0):min(new_right, img.shape[1]), :]

    img_resize = skimage.transform.resize(cropped, new_size)

    # NOTE: applying (img_resize - 128.0) / 128.0 here was tried and training
    # accuracy stopped improving. Presumably resize() already returns floats
    # on a much smaller scale than 0..255, which would make that expression
    # nearly constant -- TODO confirm.

    # Width comes from the column count (new_size[1]); using the row count for
    # both dimensions is only right for square outputs.
    new_box = {'top': [0], 'left': [0],
               'height': [(new_size[0] - 1)], 'width': [(new_size[1] - 1)]}

    return img_resize, new_box

In [8]:
# Crop the first training image around its annotated boxes and resize the
# crop to 64x64.
img = imread('train/1.png')
first_box = dict((field, train_metadata[field][0])
                 for field in ('top', 'left', 'height', 'width'))
img_resize, _ = crop_and_resize(img, new_size=[64,64], **first_box)

In [9]:
# Display the cropped and resized image.
plt.imshow(img_resize)
plt.show()


Generate resized training and testing data set


In [3]:
#Note: file_list must have one to one correspondence relation with metadata
def get_data_and_resize(file_list, metadata, image_size = [64,64]):
    """Load every image, resize the whole frame and rescale its boxes.

    Args:
        file_list: image paths; entry ``i`` must correspond to entry ``i`` of
            each list in ``metadata``.
        metadata: dict with the keys 'top', 'left', 'height', 'width' and
            'label', each a list with one entry per image.
        image_size: ``[rows, cols]`` of every output image.

    Returns:
        ``(dataset, outcome)``. ``dataset`` is a float32 array of shape
        ``(len(file_list), rows, cols, 3)``. ``outcome`` is a dict with the
        keys 'left', 'top', 'height', 'width' (boxes returned by
        ``resize_and_boxes``) and 'label' (copied from ``metadata``), each a
        list with one entry per image.
    """
    count = len(file_list)
    dataset = np.ndarray(shape=(count, image_size[0], image_size[1], 3), dtype=np.float32)
    outcome = dict((key, []) for key in ('left', 'top', 'height', 'width', 'label'))

    for idx, path in enumerate(file_list):
        scaled, boxes = resize_and_boxes(
            imread(path),
            new_size=image_size,
            top=metadata['top'][idx],
            left=metadata['left'][idx],
            height=metadata['height'][idx],
            width=metadata['width'][idx])

        dataset[idx] = scaled
        outcome['label'].append(metadata['label'][idx])

        # File each rescaled box field under the matching output key.
        for key, values in boxes.items():
            outcome[key].append(values)

    return dataset, outcome


def crop_data_and_resize(file_list, metadata, image_size = [64,64]):
    """Read every image in file_list and pass it through crop_and_resize.

    file_list  -- image paths; position i is paired with index i of metadata
    metadata   -- mapping with 'top', 'left', 'height', 'width' and 'label'
                  entries, each indexable by image position
    image_size -- two-element size forwarded as new_size; it also fixes the
                  second and third dimensions of the returned array

    Returns (dataset, outcome). dataset is a float32 array of shape
    (len(file_list), image_size[0], image_size[1], 3). outcome maps 'left',
    'top', 'height', 'width' and 'label' to lists holding one entry per image:
    the label comes straight from metadata, the box fields from the dict that
    crop_and_resize returns.
    """
    n_images = len(file_list)
    dataset = np.ndarray(shape=(n_images, image_size[0], image_size[1], 3), dtype=np.float32)
    outcome = {'left': [], 'top': [], 'height': [], 'width': [], 'label': []}

    for idx, path in enumerate(file_list):
        img = imread(path)
        img_resize, new_box = crop_and_resize(img, new_size=image_size,
                                              top=metadata['top'][idx],
                                              left=metadata['left'][idx],
                                              height=metadata['height'][idx],
                                              width=metadata['width'][idx])
        # Store the cropped and resized image in its slot of the pre-allocated array
        dataset[idx] = img_resize

        outcome['label'].append(metadata['label'][idx])

        # Collect every field of the returned box under the same key
        for name, value in new_box.items():
            outcome[name].append(value)

    return dataset, outcome

In [11]:
#train
# Build two 64x64 versions of the training set from the same file list:
#   get_data_and_resize  -> train_dataset,  train_outcome
#   crop_data_and_resize -> train_dataset2, train_outcome2
#train_size = len(train_list)-2
train_size = len(train_metadata['label'])
image_size = [64,64]

# Image files are numbered from 1 ('train/1.png', ...), so list position x
# lines up with metadata index x, as both builder functions require.
file_list = ['train/' + str(x+1) + '.png' for x in range(train_size)]

train_dataset, train_outcome = get_data_and_resize(file_list, train_metadata, image_size)

train_dataset2, train_outcome2 = crop_data_and_resize(file_list, train_metadata, image_size)

In [12]:
# One figure holding a single set of axes
fig,ax = plt.subplots(1)

# Show the training image at index 1 from the get_data_and_resize set
ax.imshow(train_dataset[1])

# Red, unfilled outline for the first box recorded for that image
rect = patches.Rectangle(xy=(train_outcome['left'][1][0], train_outcome['top'][1][0]),
                         width=train_outcome['width'][1][0],
                         height=train_outcome['height'][1][0],
                         linewidth=1, edgecolor='r', facecolor='none')

# Overlay the outline on the image
ax.add_patch(rect)

plt.show()



In [13]:
# One figure holding a single set of axes
fig,ax = plt.subplots(1)

# Show the training image at index 1 from the crop_data_and_resize set
ax.imshow(train_dataset2[1])

# Red, unfilled outline for the first box recorded for that image
rect = patches.Rectangle(xy=(train_outcome2['left'][1][0], train_outcome2['top'][1][0]),
                         width=train_outcome2['width'][1][0],
                         height=train_outcome2['height'][1][0],
                         linewidth=1, edgecolor='r', facecolor='none')

# Overlay the outline on the image
ax.add_patch(rect)

plt.show()



In [23]:
# Sanity check: expect (number of images, image_size[0], image_size[1], 3)
train_dataset.shape


Out[23]:
(33402, 64, 64, 3)

In [24]:
# Spot-check the box widths recorded for the training image at index 33
train_outcome['width'][33]


Out[24]:
[10.0, 14.0]

In [14]:
#test
# Build two 64x64 versions of the test set from the same file list:
#   get_data_and_resize  -> test_dataset,  test_outcome
#   crop_data_and_resize -> test_dataset2, test_outcome2
test_size = len(test_metadata['label'])
image_size = [64,64]

# Image files are numbered from 1 ('test/1.png', ...), so list position x
# lines up with metadata index x, as both builder functions require.
file_list = ['test/' + str(x+1) + '.png' for x in range(test_size)]

test_dataset, test_outcome = get_data_and_resize(file_list, test_metadata, image_size)
test_dataset2, test_outcome2 = crop_data_and_resize(file_list, test_metadata, image_size)

In [15]:
# One figure holding a single set of axes
fig,ax = plt.subplots(1)

# Show the first test image from the get_data_and_resize set
ax.imshow(test_dataset[0])

# Red, unfilled outline for the first box recorded for that image
rect = patches.Rectangle(xy=(test_outcome['left'][0][0], test_outcome['top'][0][0]),
                         width=test_outcome['width'][0][0],
                         height=test_outcome['height'][0][0],
                         linewidth=1, edgecolor='r', facecolor='none')

# Overlay the outline on the image
ax.add_patch(rect)

plt.show()



In [16]:
# One figure holding a single set of axes
fig,ax = plt.subplots(1)

# Show the first test image from the crop_data_and_resize set
ax.imshow(test_dataset2[0])

# Red, unfilled outline for the first box recorded for that image
rect = patches.Rectangle(xy=(test_outcome2['left'][0][0], test_outcome2['top'][0][0]),
                         width=test_outcome2['width'][0][0],
                         height=test_outcome2['height'][0][0],
                         linewidth=1, edgecolor='r', facecolor='none')

# Overlay the outline on the image
ax.add_patch(rect)

plt.show()



In [18]:
#extra
# Build both 64x64 versions for only the first 500 'extra' images.
# NOTE(review): presumably a small sample for the preview plots that follow;
# the commented-out line below would use every image instead - confirm.
#extra_size = len(extra_metadata['label'])
extra_size = 500
image_size = [64,64]

# Image files are numbered from 1 ('extra/1.png', ...), so list position x
# lines up with metadata index x, as both builder functions require.
file_list = ['extra/' + str(x+1) + '.png' for x in range(extra_size)]

extra_dataset, extra_outcome = get_data_and_resize(file_list, extra_metadata, image_size)
extra_dataset2, extra_outcome2 = crop_data_and_resize(file_list, extra_metadata, image_size)

In [19]:
# One figure holding a single set of axes
fig,ax = plt.subplots(1)

# Show the first extra image from the get_data_and_resize set
ax.imshow(extra_dataset[0])

# Red, unfilled outline for the first box recorded for that image
rect = patches.Rectangle(xy=(extra_outcome['left'][0][0], extra_outcome['top'][0][0]),
                         width=extra_outcome['width'][0][0],
                         height=extra_outcome['height'][0][0],
                         linewidth=1, edgecolor='r', facecolor='none')

# Overlay the outline on the image
ax.add_patch(rect)

plt.show()



In [20]:
# One figure holding a single set of axes
fig,ax = plt.subplots(1)

# Show the first extra image from the crop_data_and_resize set
ax.imshow(extra_dataset2[0])

# Red, unfilled outline for the first box recorded for that image
rect = patches.Rectangle(xy=(extra_outcome2['left'][0][0], extra_outcome2['top'][0][0]),
                         width=extra_outcome2['width'][0][0],
                         height=extra_outcome2['height'][0][0],
                         linewidth=1, edgecolor='r', facecolor='none')

# Overlay the outline on the image
ax.add_patch(rect)

plt.show()


Pickle the data sets for later use


In [17]:
pickle_file = 'train.pickle'
train = {'data':train_dataset, 'outcome':train_outcome}
try:
    pickleData = open(pickle_file, 'wb')
    pickle.dump(train, pickleData, pickle.HIGHEST_PROTOCOL)
    pickleData.close()
    print 'Train dataset pickled'
except Exception as e:
    print 'Unable to save data to', pickle_file, ':', e
    raise


Train dataset pickled

In [18]:
pickle_file = 'train2.pickle'
train = {'data':train_dataset2, 'outcome':train_outcome2}
try:
    pickleData = open(pickle_file, 'wb')
    pickle.dump(train, pickleData, pickle.HIGHEST_PROTOCOL)
    pickleData.close()
    print 'Train dataset pickled'
except Exception as e:
    print 'Unable to save data to', pickle_file, ':', e
    raise


Train dataset pickled

In [19]:
pickle_file = 'test.pickle'
test = {'data':test_dataset, 'outcome':test_outcome}
try:
    pickleData = open(pickle_file, 'wb')
    pickle.dump(test, pickleData, pickle.HIGHEST_PROTOCOL)
    pickleData.close()
    print 'Test dataset pickled'
except Exception as e:
    print 'Unable to save data to', pickle_file, ':', e
    raise


Test dataset pickled

In [20]:
pickle_file = 'test2.pickle'
test = {'data':test_dataset2, 'outcome':test_outcome2}
try:
    pickleData = open(pickle_file, 'wb')
    pickle.dump(test, pickleData, pickle.HIGHEST_PROTOCOL)
    pickleData.close()
    print 'Test dataset pickled'
except Exception as e:
    print 'Unable to save data to', pickle_file, ':', e
    raise


Test dataset pickled

In [21]:
pickle_file = 'train.pickle'

# Read the training pickle back to confirm the round trip.
# pickle.load can execute arbitrary code, so only use it on files you trust.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

train_X, train_outcome = save['data'], save['outcome']
del save  # hint to help gc free up memory

In [22]:
# The array read back from train.pickle; compare with train_dataset.shape before pickling
train_X.shape


Out[22]:
(33402, 64, 64, 3)

In [23]:
# Peek at the label entries of the first ten training images
print train_outcome['label'][:10]


[[1.0, 9.0], [2.0, 3.0], [2.0, 5.0], [9.0, 3.0], [3.0, 1.0], [3.0, 3.0], [2.0, 8.0], [7.0, 4.0, 4.0], [1.0, 2.0, 8.0], [1.0, 6.0]]

The extra data set is too large to pickle in one piece, so split it into 8 chunks (each chunk is saved as a "full" and a "crop" pickle file)


In [35]:
#Note: file_list must have one to one correspondence relation with metadata
def get_data_and_resize_range(root, start, end, metadata, image_size = [64,64]):
    """Run resize_and_boxes over the images with metadata indices start..end-1.

    root       -- directory holding the images; index i is read from
                  root + '/' + str(i+1) + '.png' (file names are 1-based)
    start, end -- half-open range of metadata indices to process
    metadata   -- mapping with 'top', 'left', 'height', 'width' and 'label'
                  entries, each indexable by image position
    image_size -- two-element size forwarded as new_size; it also fixes the
                  second and third dimensions of the returned array

    Returns (dataset, outcome). dataset is a float32 array with one slot per
    index in the range, of shape (count, image_size[0], image_size[1], 3);
    slot 0 holds the image for index start. outcome maps 'left', 'top',
    'height', 'width' and 'label' to lists with one entry per processed image.
    """
    indices = range(start, end)

    dataset = np.ndarray(shape=(len(indices), image_size[0], image_size[1], 3), dtype=np.float32)
    outcome = {'left': [], 'top': [], 'height': [], 'width': [], 'label': []}

    for slot, i in enumerate(indices):
        img = imread(root + '/' + str(i+1) + '.png')
        img_resize, new_box = resize_and_boxes(img, new_size=image_size,
                                               top=metadata['top'][i],
                                               left=metadata['left'][i],
                                               height=metadata['height'][i],
                                               width=metadata['width'][i])
        # slot is the position inside this chunk, i the position in metadata
        dataset[slot] = img_resize

        outcome['label'].append(metadata['label'][i])

        # Collect every field of the returned box under the same key
        for name, value in new_box.items():
            outcome[name].append(value)

    return dataset, outcome


def crop_data_and_resize_range(root, start, end, metadata, image_size = [64,64]):
    """Run crop_and_resize over the images with metadata indices start..end-1.

    root       -- directory holding the images; index i is read from
                  root + '/' + str(i+1) + '.png' (file names are 1-based)
    start, end -- half-open range of metadata indices to process
    metadata   -- mapping with 'top', 'left', 'height', 'width' and 'label'
                  entries, each indexable by image position
    image_size -- two-element size forwarded as new_size; it also fixes the
                  second and third dimensions of the returned array

    Returns (dataset, outcome). dataset is a float32 array with one slot per
    index in the range, of shape (count, image_size[0], image_size[1], 3);
    slot 0 holds the image for index start. outcome maps 'left', 'top',
    'height', 'width' and 'label' to lists with one entry per processed image.
    """
    indices = range(start, end)

    dataset = np.ndarray(shape=(len(indices), image_size[0], image_size[1], 3), dtype=np.float32)
    outcome = {'left': [], 'top': [], 'height': [], 'width': [], 'label': []}

    for slot, i in enumerate(indices):
        img = imread(root + '/' + str(i+1) + '.png')
        img_resize, new_box = crop_and_resize(img, new_size=image_size,
                                              top=metadata['top'][i],
                                              left=metadata['left'][i],
                                              height=metadata['height'][i],
                                              width=metadata['width'][i])
        # slot is the position inside this chunk, i the position in metadata
        dataset[slot] = img_resize

        outcome['label'].append(metadata['label'][i])

        # Collect every field of the returned box under the same key
        for name, value in new_box.items():
            outcome[name].append(value)

    return dataset, outcome

In [36]:
split_idx = ( np.arange(8)*int(len(extra_metadata['label'])/8) ).tolist()
split_idx.append(len(extra_metadata['label']))
extra_root = 'extra'
image_size = [64,64]

for i in range(len(split_idx)-1):
    extra_dataset, extra_outcome = get_data_and_resize_range(extra_root, 
                                                             split_idx[i], split_idx[i+1],
                                                             extra_metadata, image_size)
    
    pickle_file = extra_root + '_full_' + str(i) + '.pickle'
    extra = {'data':extra_dataset, 'outcome':extra_outcome}
    try:
        pickleData = open(pickle_file, 'wb')
        pickle.dump(extra, pickleData, pickle.HIGHEST_PROTOCOL)
        pickleData.close()
        print 'Extra dataset pickled: full {}'.format(i)
    except Exception as e:
        print 'Unable to save data to', pickle_file, ':', e
        raise
    
    extra_dataset2, extra_outcome2 = crop_data_and_resize_range(extra_root, 
                                                                split_idx[i], split_idx[i+1], 
                                                                extra_metadata, image_size)
    
    pickle_file = extra_root + '_crop_' + str(i) + '.pickle'
    extra = {'data':extra_dataset2, 'outcome':extra_outcome2}
    try:
        pickleData = open(pickle_file, 'wb')
        pickle.dump(extra, pickleData, pickle.HIGHEST_PROTOCOL)
        pickleData.close()
        print 'Extra dataset pickled: crop {}'.format(i)
    except Exception as e:
        print 'Unable to save data to', pickle_file, ':', e
        raise


Extra dataset pickled: full 0
Extra dataset pickled: crop 0
Extra dataset pickled: full 1
Extra dataset pickled: crop 1
Extra dataset pickled: full 2
Extra dataset pickled: crop 2
Extra dataset pickled: full 3
Extra dataset pickled: crop 3
Extra dataset pickled: full 4
Extra dataset pickled: crop 4
Extra dataset pickled: full 5
Extra dataset pickled: crop 5
Extra dataset pickled: full 6
Extra dataset pickled: crop 6
Extra dataset pickled: full 7
Extra dataset pickled: crop 7

In [23]:
# Images per chunk; '//' gives the integer chunk size on Python 2 and Python 3 alike
len(extra_metadata['label'])//8


Out[23]:
25294

In [28]:
# Show the chunk boundaries used when pickling the extra data
split_idx


Out[28]:
[0, 25294, 50588, 75882, 101176, 126470, 151764, 177058, 202353]

In [ ]: