notebook.community

Edit and run



In [1]:

    
from __future__ import print_function

import sys
import os
import time
import string

import numpy as np
import theano
import theano.tensor as T

import os; import sys; sys.path.append('..')
import gp

import gp.nets as nets
import gp.nets.BatchNormLayer as BatchNormLayer

import lasagne

sys.setrecursionlimit(10000)









    



Using gpu device 0: GeForce GTX TITAN (CNMeM is disabled, CuDNN 4007)
/home/d/nolearn/local/lib/python2.7/site-packages/theano/tensor/signal/downsample.py:6: UserWarning: downsample module has been moved to the theano.tensor.signal.pool module.
  "downsample module has been moved to the theano.tensor.signal.pool module.")



In [2]:

    
%load_ext autoreload
%autoreload 2



In [2]:

    
# helper function for projection_b
def ceildiv(a, b):
    return -(-a // b)

def build_cnn(input_var=None, n=1, num_filters=8, cudnn='no'):
    import lasagne # For some odd reason it can't read the global import, please PR/Issue if you know why
    projection_type = 'B'
    # Setting up layers
    if cudnn == 'yes':
        import lasagne.layers.dnn
        conv = lasagne.layers.dnn.Conv2DDNNLayer # cuDNN
    else:
        conv = lasagne.layers.Conv2DLayer
    dropout = lasagne.layers.DropoutLayer
    nonlin = lasagne.nonlinearities.rectify
    nonlin_layer = lasagne.layers.NonlinearityLayer
    sumlayer = lasagne.layers.ElemwiseSumLayer
    #batchnorm = BatchNormLayer.BatchNormLayer
    batchnorm = lasagne.layers.BatchNormLayer

    # Setting the projection type for when reducing height/width
    # and increasing dimensions.
    # Default is 'B' as B performs slightly better
    # and A requires newer version of lasagne with ExpressionLayer
    projection_type = 'B'
    if projection_type == 'A':
        expression = lasagne.layers.ExpressionLayer
        pad = lasagne.layers.PadLayer

    if projection_type == 'A':
        # option A for projection as described in paper
        # (should perform slightly worse than B)
        def projection(l_inp):
            n_filters = l_inp.output_shape[1]*2
            l = expression(l_inp, lambda X: X[:, :, ::2, ::2], lambda s: (s[0], s[1], ceildiv(s[2], 2), ceildiv(s[3], 2)))
            l = pad(l, [n_filters//4,0,0], batch_ndim=1)
            return l

    if projection_type == 'B':
        # option B for projection as described in paper
        def projection(l_inp):
            # twice normal channels when projecting!
            n_filters = l_inp.output_shape[1]*2 
            l = conv(l_inp, num_filters=n_filters, filter_size=(1, 1),
                     stride=(2, 2), nonlinearity=None, pad='same', b=None)
            l = batchnorm(l)
            return l

    # helper function to handle filters/strides when increasing dims
    def filters_increase_dims(l, increase_dims):
        in_num_filters = l.output_shape[1]
        if increase_dims:
            first_stride = (2, 2)
            out_num_filters = in_num_filters*2
        else:
            first_stride = (1, 1)
            out_num_filters = in_num_filters
 
        return out_num_filters, first_stride

    # block as described and used in cifar in the original paper:
    # http://arxiv.org/abs/1512.03385
    def res_block_v1(l_inp, nonlinearity=nonlin, increase_dim=False):
        # first figure filters/strides
        n_filters, first_stride = filters_increase_dims(l_inp, increase_dim)
        # conv -> BN -> nonlin -> conv -> BN -> sum -> nonlin
        l = conv(l_inp, num_filters=n_filters, filter_size=(3, 3),
                 stride=first_stride, nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        l = batchnorm(l)

        l = nonlin_layer(l, nonlinearity=nonlin)
#         l = dropout(l, p=.2)
#         print('adding dropout')        
        
        l = conv(l, num_filters=n_filters, filter_size=(3, 3),
                 stride=(1, 1), nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        l = batchnorm(l)
        if increase_dim:
            # Use projection (A, B) as described in paper
            p = projection(l_inp)
        else:
            # Identity shortcut
            p = l_inp
        l = sumlayer([l, p])
        l = nonlin_layer(l, nonlinearity=nonlin)
        return l

    # block as described in second paper on the subject (by same authors):
    # http://arxiv.org/abs/1603.05027
    def res_block_v2(l_inp, nonlinearity=nonlin, increase_dim=False):
        # first figure filters/strides
        n_filters, first_stride = filters_increase_dims(l_inp, increase_dim)
        # BN -> nonlin -> conv -> BN -> nonlin -> conv -> sum
        l = batchnorm(l_inp)
        l = nonlin_layer(l, nonlinearity=nonlin)
        l = conv(l, num_filters=n_filters, filter_size=(3, 3),
                 stride=first_stride, nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        l = batchnorm(l)
        l = nonlin_layer(l, nonlinearity=nonlin)
        l = conv(l, num_filters=n_filters, filter_size=(3, 3),
                 stride=(1, 1), nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        if increase_dim:
            # Use projection (A, B) as described in paper
            p = projection(l_inp)
        else:
            # Identity shortcut
            p = l_inp
        l = sumlayer([l, p])
        return l

    def bottleneck_block(l_inp, nonlinearity=nonlin, increase_dim=False):
        # first figure filters/strides
        n_filters, first_stride = filters_increase_dims(l_inp, increase_dim)
        # conv -> BN -> nonlin -> conv -> BN -> nonlin -> conv -> BN -> sum
        # -> nonlin
        # first make the bottleneck, scale the filters ..!
        scale = 4 # as per bottleneck architecture used in paper
        scaled_filters = n_filters/scale
        l = conv(l_inp, num_filters=scaled_filters, filter_size=(1, 1),
                 stride=first_stride, nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        l = batchnorm(l)
        l = nonlin_layer(l, nonlinearity=nonlin)
        l = conv(l, num_filters=scaled_filters, filter_size=(3, 3),
                 stride=(1, 1), nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        l = batchnorm(l)
        l = nonlin_layer(l, nonlinearity=nonlin)
        l = conv(l, num_filters=n_filters, filter_size=(1, 1),
                 stride=(1, 1), nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        if increase_dim:
            # Use projection (A, B) as described in paper
            p = projection(l_inp)
        else:
            # Identity shortcut
            p = l_inp
        l = sumlayer([l, p])
        l = nonlin_layer(l, nonlinearity=nonlin)
        return l

    # Bottleneck architecture with more efficiency (the post with Kaiming He's response)
    # https://www.reddit.com/r/MachineLearning/comments/3ywi6x/deep_residual_learning_the_bottleneck/ 
    def bottleneck_block_fast(l_inp, nonlinearity=nonlin, increase_dim=False):
        # first figure filters/strides
        n_filters, last_stride = filters_increase_dims(l_inp, increase_dim)
        # conv -> BN -> nonlin -> conv -> BN -> nonlin -> conv -> BN -> sum
        # -> nonlin
        # first make the bottleneck, scale the filters ..!
        scale = 4 # as per bottleneck architecture used in paper
        scaled_filters = n_filters/scale
        l = conv(l_inp, num_filters=scaled_filters, filter_size=(1, 1),
                 stride=(1, 1), nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        l = batchnorm(l)
        l = nonlin_layer(l, nonlinearity=nonlin)
        l = conv(l, num_filters=scaled_filters, filter_size=(3, 3),
                 stride=(1, 1), nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        l = batchnorm(l)
        l = nonlin_layer(l, nonlinearity=nonlin)
        l = conv(l, num_filters=n_filters, filter_size=(1, 1),
                 stride=last_stride, nonlinearity=None, pad='same',
                 W=lasagne.init.HeNormal(gain='relu'))
        if increase_dim:
            # Use projection (A, B) as described in paper
            p = projection(l_inp)
        else:
            # Identity shortcut
            p = l_inp
        l = sumlayer([l, p])
        l = nonlin_layer(l, nonlinearity=nonlin)
        return l
       
    res_block = res_block_v1

    # Stacks the residual blocks, makes it easy to model size of architecture with int n   
    def blockstack(l, n, nonlinearity=nonlin):
        print('NNN',n)
        for _ in range(n):
            print ('new')
            l = res_block(l, nonlinearity=nonlin)
        return l

    # Building the network
    l_in = lasagne.layers.InputLayer(shape=(None, 4, 75, 75),
                                        input_var=input_var)
    # First layer! just a plain convLayer
    l1 = conv(l_in, num_filters=num_filters, stride=(1, 1),
              filter_size=(3, 3), nonlinearity=None, pad='same')
    l1 = batchnorm(l1)
    l1 = nonlin_layer(l1, nonlinearity=nonlin)

    # Stacking bottlenecks and increasing dims! (while reducing shape size)
#     l1_bs = blockstack(l1, n=n)
#     l1_id = res_block(l1_bs, increase_dim=True)

#     l2_bs = blockstack(l1_id, n=n)
#     l2_id = res_block(l2_bs, increase_dim=True)

#     l3_bs = blockstack(l2_id, n=n)

    l3_bs = blockstack(l1, n=n)

    l3_do = dropout(l3_bs, p=.5)
    
    # And, finally, the 10-unit output layer:
    network = lasagne.layers.DenseLayer(
            l3_do,
#             l1,
            num_units=2,
            nonlinearity=lasagne.nonlinearities.softmax)

    return network


# ############################# Batch iterator ###############################
# This is just a simple helper function iterating over training data in
# mini-batches of a particular size, optionally in random order. It assumes
# data is available as numpy arrays. For big datasets, you could load numpy
# arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your
# own custom data iteration function. For small datasets, you can also copy
# them to GPU at once for slightly improved performance. This would involve
# several changes in the main program, though, and is not demonstrated here.

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
            
            Xb = inputs[excerpt]
            yb = targets[excerpt]
            
            Xb = Xb - .5
            
            k_s = np.array([0,1,2,3],dtype=np.uint8)
            for i in range(len(Xb)):
                k = np.random.choice(k_s)
                for j in range(Xb.shape[1]):
                    Xb[j][0] = np.rot90(Xb[j][0], k)
                    
            yield Xb, yb
            
#         yield inputs[excerpt], targets[excerpt]



In [3]:

    
PATCH_PATH = ('ipmlb')
X_train, y_train, X_test, y_test = gp.Patch.load_rgba(PATCH_PATH)









    



Loaded /home/d/patches//ipmlb/ in 0.147146940231 seconds.



In [4]:

    
X_val = X_train[-X_train.shape[0]/4:]
y_val = y_train[-X_train.shape[0]/4:]



In [5]:

    
X_train2 = X_train[:-X_train.shape[0]/4]
y_train2 = y_train[:-X_train.shape[0]/4]



In [6]:

    
n=2
num_filters=32
num_epochs=200
cudnn='yes'
print(n)



In [7]:

    
# Prepare Theano variables for inputs and targets
input_var = T.tensor4('inputs')
target_var = T.ivector('targets')

# Create neural network model (depending on first command line parameter)
print("Building model and compiling functions...")
network = build_cnn(input_var, n, num_filters, cudnn)
all_layers = lasagne.layers.get_all_layers(network)
num_params = lasagne.layers.count_params(network)
num_conv = 0
num_nonlin = 0
num_input = 0
num_batchnorm = 0
num_elemsum = 0
num_dense = 0
num_unknown = 0
print("  layer output shapes:")
for layer in all_layers:
    name = string.ljust(layer.__class__.__name__, 32)
    print("    %s %s" %(name, lasagne.layers.get_output_shape(layer)))
    if "Conv2D" in name:
        num_conv += 1
    elif "NonlinearityLayer" in name:
        num_nonlin += 1
    elif "InputLayer" in name:
        num_input += 1
    elif "BatchNormLayer" in name:
        num_batchnorm += 1
    elif "ElemwiseSumLayer" in name:
        num_elemsum += 1
    elif "DenseLayer" in name:
        num_dense += 1
    else:
        num_unknown += 1
print("  no. of InputLayers: %d" % num_input)
print("  no. of Conv2DLayers: %d" % num_conv)
print("  no. of BatchNormLayers: %d" % num_batchnorm)
print("  no. of NonlinearityLayers: %d" % num_nonlin)
print("  no. of DenseLayers: %d" % num_dense)
print("  no. of ElemwiseSumLayers: %d" % num_elemsum)
print("  no. of Unknown Layers: %d" % num_unknown)
print("  total no. of layers: %d" % len(all_layers))
print("  no. of parameters: %d" % num_params)
# Create a loss expression for training, i.e., a scalar objective we want
# to minimize (for our multi-class problem, it is the cross-entropy loss):
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()
# We could add some weight decay as well here, see lasagne.regularization.

# Create update expressions for training, i.e., how to modify the
# parameters at each training step. Here, we'll use Stochastic Gradient
# Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
params = lasagne.layers.get_all_params(network, trainable=True)

# several learning rates for low initial learning rates and
# learning rate anealing (id is epoch)
# learning_rate_schedule = {
# 0: 0.0001, # low initial learning rate as described in paper
# 2: 0.01,
# 100: 0.001,
# 150: 0.0001
# }


learning_rate = theano.shared(np.float32(0.03))
momentum = theano.shared(np.float32(0.9))

updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=learning_rate, momentum=momentum)

# Create a loss expression for validation/testing. The crucial difference
# here is that we do a deterministic forward pass through the network,
# disabling dropout layers.
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                        target_var)
test_loss = test_loss.mean()
# As a bonus, also create an expression for the classification accuracy:
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)

# Compile a function performing a training step on a mini-batch (by giving
# the updates dictionary) and returning the corresponding training loss:
train_fn = theano.function([input_var, target_var], loss, updates=updates)

# Compile a second function computing the validation loss and accuracy:
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])









    



Building model and compiling functions...
NNN 2
new
new
  layer output shapes:
    InputLayer                       (None, 4, 75, 75)
    Conv2DDNNLayer                   (None, 32, 75, 75)
    BatchNormLayer                   (None, 32, 75, 75)
    NonlinearityLayer                (None, 32, 75, 75)
    Conv2DDNNLayer                   (None, 32, 75, 75)
    BatchNormLayer                   (None, 32, 75, 75)
    NonlinearityLayer                (None, 32, 75, 75)
    Conv2DDNNLayer                   (None, 32, 75, 75)
    BatchNormLayer                   (None, 32, 75, 75)
    ElemwiseSumLayer                 (None, 32, 75, 75)
    NonlinearityLayer                (None, 32, 75, 75)
    Conv2DDNNLayer                   (None, 32, 75, 75)
    BatchNormLayer                   (None, 32, 75, 75)
    NonlinearityLayer                (None, 32, 75, 75)
    Conv2DDNNLayer                   (None, 32, 75, 75)
    BatchNormLayer                   (None, 32, 75, 75)
    ElemwiseSumLayer                 (None, 32, 75, 75)
    NonlinearityLayer                (None, 32, 75, 75)
    DropoutLayer                     (None, 32, 75, 75)
    DenseLayer                       (None, 2)
  no. of InputLayers: 1
  no. of Conv2DLayers: 5
  no. of BatchNormLayers: 5
  no. of NonlinearityLayers: 5
  no. of DenseLayers: 1
  no. of ElemwiseSumLayers: 2
  no. of Unknown Layers: 1
  total no. of layers: 20
  no. of parameters: 398818



In [ ]:



In [8]:

    
with np.load('/home/d/resnet3_71.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(network, param_values)



In [9]:

    
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                        target_var)
test_loss = test_loss.mean()
# As a bonus, also create an expression for the classification accuracy:
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)

pred_fn = theano.function([input_var, target_var], [test_prediction, test_loss, test_acc])
pred2_fn = theano.function([input_var], [test_prediction])



In [10]:

    
all_preds = []
for i,p in enumerate(X_test):
    pred = pred2_fn(p.reshape(1,4,75,75))
    all_preds.append(pred[0][:,1][0].astype(np.uint8))
    if i % 1000 == 0:
        print(i)



In [11]:

    
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, precision_recall_fscore_support, f1_score, precision_recall_curve, average_precision_score, zero_one_loss



In [12]:

    
print(classification_report(y_test, all_preds))









    



             precision    recall  f1-score   support

          0       0.61      0.94      0.74      8780
          1       0.87      0.38      0.53      8780

avg / total       0.74      0.66      0.64     17560



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [17]:

    
# After training, we compute and print the test error:
test_err = 0
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 128, shuffle=False):
    inputs, targets = batch
    err, acc = val_fn(inputs, targets)
    test_err += err
    test_acc += acc
    test_batches += 1
print("Final results:")
print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
print("  test accuracy:\t\t{:.2f} %".format(
    test_acc / test_batches * 100))









    



Final results:
  test loss:			0.417827
  test accuracy:		91.68 %



In [ ]:



In [ ]: