and why they matter
Install the bleeding edge version from here: http://lasagne.readthedocs.org/en/latest/user/installation.html
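For reference (a sketch of my own; check the linked page for the current commands), the bleeding-edge install of Theano and Lasagne usually boils down to two pip commands, which can be run from a notebook cell:
!pip install --upgrade https://github.com/Theano/Theano/archive/master.zip
!pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip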
In [1]:
import numpy as np
def sum_squares(N):
    # closed-form sum of squares of 0..N-1, i.e. sum(arange(N)**2)
    return N * (N - 1) * (2 * N - 1) // 6
In [2]:
%%time
sum_squares(10**8)
Out[2]:
In [3]:
import theano
import theano.tensor as T
In [4]:
# N will be a function parameter
N = T.scalar("a dimension", dtype='int32')
# result is a recipe for computing the sum of squares of arange(N), given N
result = (T.arange(N)**2).sum()
# compiling the recipe for computing "result" given N
sum_function = theano.function(inputs = [N],outputs=result)
In [5]:
%%time
sum_function(10**8)
Out[5]:
Still confused? We're gonna fix that.
In [6]:
#Inputs
example_input_scalar = T.scalar("scalar input", dtype='float32')
example_input_tensor = T.tensor4("four dimensional tensor input") # dtype = theano.config.floatX by default
# don't worry, we won't actually need the tensor
input_vector = T.vector("my vector", dtype='int32')  # vector of integers
In [7]:
#Transformations
# transformation: elementwise multiplication
double_the_vector = input_vector*2
#elementwise cosine
elementwise_cosine = T.cos(input_vector)
#difference between squared vector and vector itself
vector_squares = input_vector**2 - input_vector
In [8]:
#Practice time:
# create two vectors of dtype float32
my_vector = T.vector(name='first vector', dtype='float32')
my_vector2 = T.vector(name='second vector', dtype='float32')
In [9]:
#Write a transformation(recipe):
#(vec1)*(vec2) / (sin(vec1) +1)
my_transformation = my_vector * my_vector2 / (T.sin(my_vector) + 1)
In [10]:
print( my_transformation)
# it's okay that it isn't a number yet: it's a symbolic expression (a recipe)
In [11]:
#What's inside the transformation
theano.printing.debugprint(my_transformation)
In [12]:
inputs = [my_vector, my_vector2]
outputs = [my_transformation]
# The next lines compile a function that takes two vectors and computes your transformation
my_function = theano.function(
inputs,outputs,
allow_input_downcast=True #automatic type casting for input parameters (e.g. float64 -> float32)
)
In [13]:
# using the function with python lists:
print ("using python lists:")
print (my_function([1,2,3],[4,5,6]))
print()
#Or using numpy arrays:
# btw, the 'float' dtype is cast to the second parameter's dtype, which is float32
print ("using numpy arrays:")
print (my_function(np.arange(10),
np.linspace(5,6,10,dtype='float')))
In [14]:
#a dictionary of inputs
my_function_inputs = {
my_vector:[1,2,3],
my_vector2:[4,5,6]
}
# evaluate my_transformation
# has to match with compiled function output
print (my_transformation.eval(my_function_inputs))
# can compute transformations on the fly
print ("add 2 vectors", (my_vector + my_vector2).eval(my_function_inputs))
# WARNING! if your transformation depends only on some of the inputs, do not provide the rest of them
print ("vector's shape:", my_vector.shape.eval({
my_vector:[1,2,3]
}))
In [15]:
# Quest #1 - implement a function that computes a mean squared error of two input vectors
# Your function has to take 2 vectors and return a single number
prediction = T.vector(dtype='float32')
target = T.vector(dtype='float32')
compute_mse = theano.function([prediction, target], outputs=T.mean((prediction - target) ** 2),
allow_input_downcast=True)
In [16]:
# Tests
from sklearn.metrics import mean_squared_error
for n in [1,5,10,10**3]:
    elems = [np.arange(n), np.arange(n,0,-1), np.zeros(n),
             np.ones(n), np.random.random(n), np.random.randint(100,size=n)]
    for el in elems:
        for el_2 in elems:
            true_mse = np.array(mean_squared_error(el,el_2))
            my_mse = compute_mse(el,el_2)
            if not np.allclose(true_mse,my_mse):
                print('Wrong result:')
                print('mse(%s,%s)' % (el,el_2))
                print("should be: %f, but your function returned %f" % (true_mse,my_mse))
                raise ValueError("Something is wrong")
print("All tests passed")
The inputs and transformations only exist when the function is called.
Shared variables always stay in memory, like global variables.
In [17]:
#creating shared variable
shared_vector_1 = theano.shared(np.ones(10,dtype='float64'))
In [18]:
# evaluating a shared variable (outside the symbolic graph)
print ("initial value",shared_vector_1.get_value())
# within the symbolic graph you use it just like any other input or transformation, no "get_value" needed
In [19]:
#setting new value
shared_vector_1.set_value( np.arange(5) )
#getting that new value
print ("new value", shared_vector_1.get_value())
#Note that the vector changed shape
#This is entirely allowed... unless your graph is hard-wired to work with some fixed shape
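A minimal sketch of my own (not part of the original) of what "hard-wired to a fixed shape" means:
# this graph hard-codes the length 5 via reshape
fixed_length_sum = theano.function([], shared_vector_1.reshape((5,)).sum())
# fixed_length_sum() works while shared_vector_1 holds 5 elements,
# but raises an error once its value is replaced by a vector of another length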
In [20]:
# Write a recipe (transformation) that computes an elementwise transformation of shared_vector and input_scalar
#Compile as a function of input_scalar
input_scalar = T.scalar('coefficient',dtype='float32')
scalar_times_shared = input_scalar * shared_vector_1
shared_times_n = theano.function([input_scalar], scalar_times_shared, allow_input_downcast=True)
In [21]:
print ("shared:", shared_vector_1.get_value())
print ("shared_times_n(5)",shared_times_n(5))
print ("shared_times_n(-0.5)",shared_times_n(-0.5))
In [22]:
#Changing value of vector 1 (output should change)
shared_vector_1.set_value([-1,0,1])
print ("shared:", shared_vector_1.get_value())
print ("shared_times_n(5)",shared_times_n(5))
print ("shared_times_n(-0.5)",shared_times_n(-0.5))
Limitations:
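The main one to keep in mind: T.grad can only differentiate a scalar expression, so vector- or matrix-valued expressions must first be reduced (summed, averaged, indexed) to a scalar. A tiny sketch of my own:
v = T.vector('v')
# T.grad(v ** 2, v)                  # fails: the cost passed to T.grad must be a scalar
grad_ok = T.grad(T.sum(v ** 2), v)   # works: sum() reduces the expression to a scalar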
In [23]:
my_scalar = T.scalar(name='input',dtype='float64')
scalar_squared = T.sum(my_scalar**2)
# the derivative of scalar_squared w.r.t. my_scalar
derivative = T.grad(scalar_squared,my_scalar)
fun = theano.function([my_scalar],scalar_squared)
grad = theano.function([my_scalar],derivative)
In [24]:
import matplotlib.pyplot as plt
%matplotlib inline
x = np.linspace(-3,3)
x_squared = list(map(fun,x))
x_squared_der = list(map(grad,x))
plt.plot(x, x_squared,label="x^2")
plt.plot(x, x_squared_der, label="derivative")
plt.legend()
Out[24]:
In [25]:
my_vector = T.vector('my_vector', dtype='float64')
# Compute the gradient of the following weird function over my_scalar and my_vector
# warning! Trying to understand the meaning of that function may result in permanent brain damage
weird_psychotic_function = ((my_vector+my_scalar)**(1+T.var(my_vector)) +1./T.arcsinh(my_scalar)).mean()/(my_scalar**2 +1) + 0.01*T.sin(2*my_scalar**1.5)*(T.sum(my_vector)* my_scalar**2)*T.exp((my_scalar-4)**2)/(1+T.exp((my_scalar-4)**2))*(1.-(T.exp(-(my_scalar-4)**2))/(1+T.exp(-(my_scalar-4)**2)))**2
der_by_scalar = T.grad(weird_psychotic_function, [my_scalar])
der_by_vector = T.grad(weird_psychotic_function, [my_vector])
compute_weird_function = theano.function([my_scalar,my_vector],weird_psychotic_function)
compute_der_by_scalar = theano.function([my_scalar,my_vector],der_by_scalar)
In [26]:
#Plotting your derivative
vector_0 = [1,2,3]
scalar_space = np.linspace(0,7)
y = [compute_weird_function(x,vector_0) for x in scalar_space]
plt.plot(scalar_space,y,label='function')
y_der_by_scalar = [compute_der_by_scalar(x,vector_0) for x in scalar_space]
plt.plot(scalar_space,y_der_by_scalar,label='derivative')
plt.grid();plt.legend()
Out[26]:
In [27]:
# Multiply shared vector by a number and save the product back into shared vector
inputs = [input_scalar]
outputs = [scalar_times_shared]  # return the vector times the scalar
my_updates = {
    shared_vector_1: scalar_times_shared  # ...and write this same result back into shared_vector_1
}
compute_and_save = theano.function(inputs, outputs, updates=my_updates)
In [28]:
shared_vector_1.set_value(np.arange(5))
#initial shared_vector_1
print ("initial shared value:" ,shared_vector_1.get_value())
# evaluating the function (shared_vector_1 will be changed)
print ("compute_and_save(2) returns",compute_and_save(2))
#evaluate new shared_vector_1
print ("new shared value:" ,shared_vector_1.get_value())
Implement the regular logistic regression training algorithm.
Tips: keep the weights and bias in shared variables, make X and y the inputs, and compile one function that updates the weights and returns the loss, plus another that predicts.
We shall train on a two-class MNIST dataset.
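For reference, the model and loss implemented in the cells below are the standard ones:
$$p(y{=}1\mid x) = \sigma(w\cdot x + b) = \frac{1}{1+e^{-(w\cdot x+b)}}$$
$$L = -\frac{1}{n}\sum_i \big[y_i\log p_i + (1-y_i)\log(1-p_i)\big] + \lambda\,\lVert w\rVert^2,\qquad \lambda = 10^{-2},$$
trained with plain gradient descent of step 0.1: $w \leftarrow w - 0.1\,\nabla_w L$, $b \leftarrow b - 0.1\,\nabla_b L$.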
In [29]:
from sklearn.datasets import load_digits
mnist = load_digits(2)
X,y = mnist.data, mnist.target
print ("y [shape - %s]:"%(str(y.shape)),y[:10])
print ("X [shape - %s]:"%(str(X.shape)))
print (X[:3])
print (y[:10])
In [30]:
# inputs and shareds
shared_weights = theano.shared(np.random.randn(X.shape[1]), allow_downcast=True, name="w")
shared_b = theano.shared(0., name="b", allow_downcast=True)
input_X = T.matrix(dtype='float32', name="x")
input_y = T.vector(dtype='float32', name="y")
In [31]:
predicted_y = 1. / (T.exp(-T.dot(input_X, shared_weights) - shared_b) + 1)
loss = -T.mean(input_y * T.log(predicted_y) +\
(1 - input_y) * T.log(1 - predicted_y)) +\
1e-2 * (shared_weights ** 2).sum()
grad_w, grad_b = T.grad(loss, [shared_weights, shared_b])
updates = [
(shared_weights, shared_weights - 0.1 * grad_w),
(shared_b, shared_b - 0.1 * grad_b)
]
In [32]:
train_function = theano.function([input_X, input_y], loss, updates=updates, allow_input_downcast=True)
predict_function = theano.function([input_X], predicted_y > 0.5, allow_input_downcast=True)
In [33]:
from sklearn.cross_validation import train_test_split  # in newer scikit-learn: sklearn.model_selection
X_train,X_test,y_train,y_test = train_test_split(X,y)
In [34]:
from sklearn.metrics import roc_auc_score
for i in range(5):
    loss_i = train_function(X_train, y_train)
    print("loss at iter %i: %.4f" % (i, loss_i))
    print("train auc:", roc_auc_score(y_train, predict_function(X_train)))
    print("test auc:", roc_auc_score(y_test, predict_function(X_test)))
print ("resulting weights:")
plt.imshow(shared_weights.get_value().reshape(8,-1))
plt.colorbar()
Out[34]:
In [3]:
from mnist import load_dataset
X_train,y_train,X_val,y_val,X_test,y_test = load_dataset()
print (X_train.shape,y_train.shape)
In [4]:
import lasagne
input_X = T.tensor4("X")
#input dimensions (None means "arbitrary" and is only allowed for the first axis [samples])
input_shape = [None,1,28,28]
target_y = T.vector("target Y integer",dtype='int32')
Defining network architecture
In [5]:
#Input layer (auxiliary)
input_layer = lasagne.layers.InputLayer(shape = input_shape,input_var=input_X)
#a fully connected layer with 50 neurons that takes the input layer as its input
# the nonlinearity here is sigmoid, as in logistic regression
# you can give a name to each layer (optional)
dense_1 = lasagne.layers.DenseLayer(input_layer,num_units=50,
nonlinearity = lasagne.nonlinearities.sigmoid,
name = "hidden_dense_layer")
#fully connected output layer that takes dense_1 as input and has 10 neurons (1 for each digit)
#We use softmax nonlinearity to make probabilities add up to 1
dense_output = lasagne.layers.DenseLayer(dense_1,num_units = 10,
nonlinearity = lasagne.nonlinearities.softmax,
name='output')
In [6]:
#network prediction (theano-transformation)
y_predicted = lasagne.layers.get_output(dense_output)
In [39]:
#all network weights (shared variables)
all_weights = lasagne.layers.get_all_params(dense_output)
print (all_weights)
In [40]:
#Mean categorical crossentropy as a loss function - similar to logistic loss but for multiclass targets
loss = lasagne.objectives.categorical_crossentropy(y_predicted,target_y).mean()
#prediction accuracy
accuracy = lasagne.objectives.categorical_accuracy(y_predicted,target_y).mean()
#This function computes gradient AND composes weight updates just like you did earlier
updates_sgd = lasagne.updates.sgd(loss, all_weights,learning_rate=0.01)
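Roughly speaking, lasagne.updates.sgd just automates what was done by hand in the logistic regression part; a sketch of the manual equivalent (assuming the same loss and weight list):
grads = T.grad(loss, all_weights)
updates_sgd_manual = [(w, w - 0.01 * g) for w, g in zip(all_weights, grads)]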
In [41]:
#function that computes loss and updates weights
train_fun = theano.function([input_X,target_y],[loss,accuracy],updates= updates_sgd)
#function that just computes accuracy
accuracy_fun = theano.function([input_X,target_y],accuracy)
In [14]:
# An auxiliary function that returns mini-batches for neural network training
# Parameters:
#  X - a tensor of images with shape (many, 1, 28, 28), e.g. X_train
#  y - a vector of answers for the corresponding images, e.g. y_train
#  batch_size - a single number: the intended size of each batch
# What you need to implement:
#  1) Shuffle the data
#     - shuffle X and y the same way, so as not to break the correspondence between X_i and y_i
#  2) Split the data into minibatches of batch_size
#     - if the data size is not a multiple of batch_size, make the last batch smaller
#  3) Return a list (or an iterator) of pairs
#     - (a batch of images, the corresponding answers from y)
def iterate_minibatches(X, y, batchsize, shuffle=True):
    if shuffle:
        indices = np.arange(len(X))
        np.random.shuffle(indices)
    # note: this drops the last incomplete batch instead of yielding a smaller one
    for start_idx in range(0, len(X) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield X[excerpt], y[excerpt]
In [43]:
import time
num_epochs = 100  # number of passes through the data
batch_size = 50  # number of samples processed at each function call
for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, y_train, batch_size):
        inputs, targets = batch
        train_err_batch, train_acc_batch = train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in iterate_minibatches(X_val, y_val, batch_size):
        inputs, targets = batch
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{:.2f} %".format(
        train_acc / train_batches * 100))
    print("  validation accuracy:\t\t{:.2f} %".format(
        val_acc / val_batches * 100))
In [44]:
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 500):
    inputs, targets = batch
    acc = accuracy_fun(inputs, targets)
    test_acc += acc
    test_batches += 1

print("Final results:")
print("  test accuracy:\t\t{:.2f} %".format(
    test_acc / test_batches * 100))
if test_acc / test_batches * 100 > 99:
    print("Achievement unlocked: 80lvl Warlock!")
else:
    print("We need more magic!")
There is a mini-report at the end that you will have to fill in. We recommend reading it first and filling it in as you iterate.
Dropout layers
lasagne.layers.DropoutLayer(prev_layer, p=probability_to_zero_out)
Convolution layers
network = lasagne.layers.Conv2DLayer(prev_layer,
                                     num_filters=n_neurons,
                                     filter_size=(filter_width, filter_height),
                                     nonlinearity=some_nonlinearity)
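Pooling layers, often placed after convolutions (placeholder names as above; used in the template below):
network = lasagne.layers.MaxPool2DLayer(prev_layer, pool_size=(pool_width, pool_height))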
Plenty of other layers and architectures
There is a template for your solution below; you can use it, or throw it away and write your own.
In [1]:
import numpy as np
from IPython import display
import theano
from theano import tensor as T
import lasagne
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
from mnist import load_dataset
X_train,y_train,X_val,y_val,X_test,y_test = load_dataset()
print (X_train.shape,y_train.shape)
In [3]:
input_X = T.tensor4("X")
#input dimensions (None means "arbitrary" and is only allowed for the first axis [samples])
input_shape = [None,1,28,28]
target_y = T.vector("target Y integer",dtype='int32')
In [30]:
#Input layer (auxiliary)
nn = lasagne.layers.InputLayer(shape = input_shape,input_var=input_X)
nn = lasagne.layers.Conv2DLayer(nn, 128, (5,5), nonlinearity=lasagne.nonlinearities.rectify)
nn = lasagne.layers.MaxPool2DLayer(nn, (2,2))
nn = lasagne.layers.Conv2DLayer(nn, 64, (3,3), nonlinearity=lasagne.nonlinearities.rectify)
nn = lasagne.layers.MaxPool2DLayer(nn, (2,2))
nn = lasagne.layers.DropoutLayer(nn)
nn = lasagne.layers.DenseLayer(nn, num_units=128, nonlinearity = lasagne.nonlinearities.rectify)
dense_output = lasagne.layers.DenseLayer(nn,num_units = 10, nonlinearity = lasagne.nonlinearities.softmax)
In [31]:
# Network predictions (theano-transformation)
y_predicted_train = lasagne.layers.get_output(dense_output)
y_predicted_test = lasagne.layers.get_output(dense_output, deterministic=True)
In [32]:
all_weights = lasagne.layers.get_all_params(dense_output)
In [33]:
#Mean categorical crossentropy as a loss function - similar to logistic loss but for multiclass targets
loss = lasagne.objectives.categorical_crossentropy(y_predicted_train,target_y).mean()
# + 0.001 * lasagne.regularization.regularize_network_params(dense_output, lasagne.regularization.l2)
#prediction accuracy
accuracy = lasagne.objectives.categorical_accuracy(y_predicted_test,target_y).mean()
#adam computes the gradients AND composes the weight updates just like you did earlier (Adam here instead of plain SGD)
updates_sgd = lasagne.updates.adam(loss, all_weights)
In [34]:
#A function that accepts X and y, returns the loss and accuracy, and performs weight updates
train_fun = theano.function([input_X,target_y],[loss,accuracy],updates= updates_sgd)
#A function that just computes accuracy given X and y
accuracy_fun = theano.function([input_X,target_y],accuracy)
In [35]:
def iterate_minibatches(X, y, batchsize, shuffle=True):
    if shuffle:
        indices = np.arange(len(X))
        np.random.shuffle(indices)
    for start_idx in range(0, len(X) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield X[excerpt], y[excerpt]
In [36]:
num_epochs = 50
batch_size = 500
train_loss = []
train_accuracy = []
val_accuracy = []
In [37]:
for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_acc = 0
    train_batches = 0
    for batch in iterate_minibatches(X_train, y_train, batch_size):
        inputs, targets = batch
        train_err_batch, train_acc_batch = train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in iterate_minibatches(X_val, y_val, batch_size):
        inputs, targets = batch
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    train_loss.append(train_err / train_batches)
    train_accuracy.append(train_acc / train_batches * 100)
    val_accuracy.append(val_acc / val_batches * 100)

    # live-updating plot of train (blue) and validation (red) accuracy
    display.clear_output(wait=True)
    display.display(plt.gcf())
    plt.figure(figsize=(15,8))
    plt.plot(train_accuracy, 'b')
    plt.plot(val_accuracy, 'r')
In [38]:
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 500):
    inputs, targets = batch
    acc = accuracy_fun(inputs, targets)
    test_acc += acc
    test_batches += 1

print("Final results:")
print("test accuracy:\t\t{:.2f} %".format(
    test_acc / test_batches * 100))
if test_acc / test_batches * 100 > 99:
    print("Achievement unlocked: 80lvl Warlock!")
else:
    print("We need more magic!")
gg wp
easy :)
Report
All creative approaches are highly welcome, but at the very least it would be great to mention
There is no need to write strict mathematical proofs (unless you want to).
Since the network has to classify images of digits, in my opinion a CNN is the best choice, because it can pick up local features of those digits. I first tried various combinations of convolutions and max-pooling. Then I used dropout as regularization, followed by a fully connected layer and the output layer.
The best quality is achieved by putting a separate max-pooling after each convolution rather than a single one after several convolutions. The filter size is 5x5 for the first convolution and 3x3 for the second.
As the resulting quality shows, one fully connected layer is quite enough; more layers would only make the network overfit. The final architecture can be seen in the cell above.
For training the network I used stochastic gradient descent.