In [ ]:
# Import some libraries that will be necessary for working with data and displaying plots
# To visualize plots in the notebook
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.io # To read matlab files
import pylab
from test_helper import Test
This notebook reviews some of the Python modules that make it possible to work with data structures in an easy and efficient manner. We will start by reviewing Numpy arrays and matrices, and some of the common operations which are needed when working with these data structures in Machine Learning. The second part of the notebook will present some of the data types inherent to MLlib, and explain the basics of distributing data sets for parallel optimization of models.
The following code fragment defines variable x
as a list of 4 integers, you can check that by printing the type of any element of x
. Use python command map()
to create a new list with the same elements as x, but where each element of the list is a float.
In [ ]:
# Start from a plain Python list holding four integers
x = [5, 4, 3, 4]
# Printing the type of one element shows what kind of values the list holds
print type(x[0])
# Create a list of floats containing the same elements as in x
# (exercise placeholder: the accompanying text asks for a map() call)
x_f = <FILL IN>
In [ ]:
# The values of both lists must coincide ...
Test.assertTrue(np.all(x == x_f), 'Elements of both lists are not the same')
# ... while the element types must differ: int in x, float in x_f
Test.assertTrue(((type(x[-2])==int) & (type(x_f[-2])==float)),'Type conversion incorrect')
Numpy arrays can be defined directly using methods such as np.arange()
, np.ones()
, np.zeros()
, as well as random number generators. Alternatively, you can easily generate them from python lists (or lists of lists) containing elements of numeric type.
You can easily check the shape of any numpy vector with the property .shape
, and reshape it with the method reshape()
. Note the difference between 1-D and N-D numpy arrays (ndarrays
). You should also be aware of the existence of another numpy data type: Numpy matrices (http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.matrix.html) are inherently 2-D structures where operators *
and **
have the meaning of matrix multiplication and matrix power.
In the code below, you can check the types and shapes of different numpy arrays. Also complete the exercises, including the one where you are asked to reshape a unidimensional array into a 2-D array of size $4\times2$.
In [ ]:
# Numpy arrays can be created from numeric lists or using different numpy methods
y = np.arange(8)+1  # 1-D array with the integers 1..8
x = np.array(x_f)  # 1-D array built from the list of floats x_f
# Check the different data types involved
print 'El tipo de la variable x_f es ', type(x_f)
print 'El tipo de la variable x es ', type(x)
print 'El tipo de la variable y es ', type(y)
# Print the shapes of the numpy arrays
print 'La variable y tiene dimensiones ', y.shape
print 'La variable x tiene dimensiones ', x.shape
# Complete the following exercises
# Convert x into a variable x_matrix, of type `numpy.matrixlib.defmatrix.matrix` using command
# np.matrix(). The resulting matrix should be of dimensions 4x1
x_matrix = <FILL IN>
# Convert x into a variable x_array, of type `ndarray`, and dimensions 4x1
# (x only has four elements; later cells use x_array as a column vector)
x_array = <FILL IN>
# Reshape array y into a 4x2 array using command np.reshape()
# (the result must remain an ndarray, not a numpy matrix)
y = <FILL IN>
In [ ]:
# For each variable: first check its type, then check its content through the
# Test helper (presumably by comparing a hash of the value with the stored digest)
Test.assertEquals(type(x_matrix),np.matrixlib.defmatrix.matrix,'x_matrix is not defined as a matrix')
Test.assertEqualsHashed(x_matrix,'f4239d385605dc62b73c9a6f8945fdc65e12e43b','Incorrect variable x_matrix')
# x_array is checked against the same digest as x_matrix
Test.assertEquals(type(x_array),np.ndarray,'x_array is not defined as a numpy ndarray')
Test.assertEqualsHashed(x_array,'f4239d385605dc62b73c9a6f8945fdc65e12e43b','Incorrect variable x_array')
Test.assertEquals(type(y),np.ndarray,'y is not defined as a numpy ndarray')
Test.assertEqualsHashed(y,'66d90401cb8ed9e1b888b76b0f59c23c8776ea42','Incorrect variable y')
Some other useful Numpy methods are:
.flatten()
: method that converts a numpy array or matrix into a vector by concatenating the elements in the different dimensions. Note that the result of the method keeps the type of the original variable, so the result is a 1-D ndarray
when invoked on a numpy array, and a numpy matrix (and necessarily 2-D) when invoked on a matrix.
.tolist()
: method that converts a numpy array or matrix into a python list.
These uses are illustrated in the code fragment below.
In [ ]:
print 'Uso de flatten sobre la matriz x_matrix (de tipo matrix)'
print 'x_matrix.flatten(): ', x_matrix.flatten()
print 'Su tipo es: ', type(x_matrix.flatten())
print 'Sus dimensiones son: ', x_matrix.flatten().shape
print '\nUso de flatten sobre la matriz y (de tipo ndarray)'
print 'x_matrix.flatten(): ', y.flatten()
print 'Su tipo es: ', type(y.flatten())
print 'Sus dimensiones son: ', y.flatten().shape
print '\nUso de tolist sobre la matriz x_matrix (de tipo matrix) y el vector (2D) y (de tipo ndarray)'
print 'x_matrix.tolist(): ', x_matrix.tolist()
print 'y.tolist(): ', y.tolist()
*
and **
when used with Numpy arrays implement elementwise product and exponentiation
*
and **
when used with Numpy matrices implement matrix product and exponentiation
So you have to be careful about the types you are using for each variable.
In [ ]:
# The command below squares the ndarray x_array with the ** operator.
# Then try to run the same command on variable x_matrix, and see what happens
print x_array**2
In [ ]:
# Reminder of the shapes of the operands used in the exercises below
print 'Remember that the shape of x_array is ', x_array.shape
print 'Remember that the shape of y is ', y.shape
# Complete the following exercises. You can print the partial results to visualize them
# Multiply the 2-D array `y` by 2
y_by2 = <FILL IN>
# Multiply each of the columns in `y` by the column vector x_array
z_4_2 = <FILL IN>
# Obtain the matrix product of the transpose of x_array and y
x_by_y = <FILL IN>
# Repeat the previous calculation, this time using x_matrix (of type numpy matrix) instead of x_array
# Note that in this case you do not need to use method dot()
x_by_y2 = <FILL IN>
# Multiply vector x_array by its transpose to obtain a 4 x 4 matrix
x_4_4 = <FILL IN>
# Multiply the transpose of vector x_array by vector x_array. The result is the squared-norm of the vector
x_norm2 = <FILL IN>
In [ ]:
# Check every result of the previous cell through the Test helper
# (presumably a comparison between a hash of the value and the stored digest)
Test.assertEqualsHashed(y_by2,'120a3a46cdf65dc239cc9b128eb1336886c7c137','Incorrect result for variable y_by2')
Test.assertEqualsHashed(z_4_2,'607730d96899ee27af576ecc7a4f1105d5b2cfed','Incorrect result for variable z_4_2')
# x_by_y and x_by_y2 are checked against the same digest
Test.assertEqualsHashed(x_by_y,'a3b24f229d1e02fa71e940adc0a4135779864358','Incorrect result for variable x_by_y')
Test.assertEqualsHashed(x_by_y2,'a3b24f229d1e02fa71e940adc0a4135779864358','Incorrect result for variable x_by_y2')
Test.assertEqualsHashed(x_4_4,'fff55c032faa93592e5d27bf13da9bb49c468687','Incorrect result for variable x_4_4')
Test.assertEqualsHashed(x_norm2,'6eacac8f346bae7b5c72bcc3381c7140eaa98b48','Incorrect result for variable x_norm2')
In [ ]:
# Shape of the array whose means are computed below
print z_4_2.shape
# No axis given: the mean is taken over all the elements of the array
print np.mean(z_4_2)
# axis=0: the mean is taken along the rows, giving one value per column
print np.mean(z_4_2,axis=0)
# axis=1: the mean is taken along the columns, giving one value per row
print np.mean(z_4_2,axis=1)
Other numpy methods where you can specify the axis along which a certain operation should be carried out are:
np.median()
np.std()
np.var()
np.percentile()
np.sort()
np.argsort()
If the axis argument is not provided, the array is flattened before carrying out the corresponding operation.
In [ ]:
# Preliminary check that you are working with the right matrices
Test.assertEqualsHashed(z_4_2,'607730d96899ee27af576ecc7a4f1105d5b2cfed','Wrong value for variable z_4_2')
Test.assertEqualsHashed(x_array,'f4239d385605dc62b73c9a6f8945fdc65e12e43b','Wrong value for variable x_array')
# Vertically stack matrix z_4_2 with itself
ex1_res = <FILL IN>
# Horizontally stack matrix z_4_2 and vector x_array
ex2_res = <FILL IN>
# Horizontally stack a column vector of ones with the result of the first exercise (variable ex1_res)
# The result X is reused by the exercises in later cells
X = <FILL IN>
In [ ]:
# Check the three stacking results through the Test helper
Test.assertEqualsHashed(ex1_res,'31e60c0fa3e3accedc7db24339452085975a6659','Wrong value for variable ex1_res')
Test.assertEqualsHashed(ex2_res,'189b90c5b2113d2415767915becb58c6525519b7','Wrong value for variable ex2_res')
Test.assertEqualsHashed(X,'426c2708350ac469bc2fc4b521e781b36194ba23','Wrong value for variable X')
In [ ]:
# Slicing exercises on matrix X (exercise placeholders)
# Keep last row of matrix X
X_sub1 = <FILL IN>
# Keep first column of the first three rows of X
X_sub2 = <FILL IN>
# Keep first two columns of the first three rows of X
X_sub3 = <FILL IN>
# Invert the order of the rows of X
X_sub4 = <FILL IN>
In [ ]:
# Check the four slicing results through the Test helper
Test.assertEqualsHashed(X_sub1,'0bcf8043a3dd569b31245c2e991b26686305b93f','Wrong value for variable X_sub1')
Test.assertEqualsHashed(X_sub2,'7c43c1137480f3bfea7454458fcfa2bc042630ce','Wrong value for variable X_sub2')
Test.assertEqualsHashed(X_sub3,'3cddc950ea2abc256192461728ef19d9e1d59d4c','Wrong value for variable X_sub3')
Test.assertEqualsHashed(X_sub4,'33190dec8f3cbe3ebc9d775349665877d7b892dd','Wrong value for variable X_sub4')
In [ ]:
# Shape of X, to interpret the sizes of the products below
print X.shape
# Product of X by its transpose: one row and one column per row of X
print X.dot(X.T)
# Product of the transpose of X by X: one row and one column per column of X
print X.T.dot(X)
# Inverse of the product X^T X
print np.linalg.inv(X.T.dot(X))
# Left disabled. NOTE(review): X X^T is presumably singular when X has more rows
# than columns, so this inversion would not be meaningful -- confirm before enabling
#print np.linalg.inv(X.dot(X.T))
In this section, you will complete three exercises where you will carry out some common operations when working with data structures. For this exercise you will work with the 2-D numpy array X
, assuming that it contains the values of two different variables for 8 data patterns. A first column of ones has already been introduced in a previous exercise:
First of all, let us check that you are working with the right matrix
In [ ]:
# Make sure X holds the expected values before starting the exercises
Test.assertEqualsHashed(X,'426c2708350ac469bc2fc4b521e781b36194ba23','Wrong value for variable X')
Create a new matrix Z, where additional features are created by carrying out the following non-linear transformations:
$$Z = \left[ \begin{array}{ccccc} 1 & x_1^{(1)} & x_2^{(1)} & \log\left(x_1^{(1)}\right) & \log\left(x_2^{(1)}\right)\\ 1 & x_1^{(2)} & x_2^{(2)} & \log\left(x_1^{(2)}\right) & \log\left(x_2^{(2)}\right) \\ \vdots & \vdots & \vdots & \vdots & \vdots \\ 1 & x_1^{(8)} & x_2^{(8)} & \log\left(x_1^{(8)}\right) & \log\left(x_2^{(8)}\right)\end{array}\right] = \left[ \begin{array}{ccccc} 1 & z_1^{(1)} & z_2^{(1)} & z_3^{(1)} & z_4^{(1)}\\ 1 & z_1^{(2)} & z_2^{(2)} & z_3^{(2)} & z_4^{(2)} \\ \vdots & \vdots & \vdots & \vdots & \vdots \\ 1 & z_1^{(8)} & z_2^{(8)} & z_3^{(8)} & z_4^{(8)} \end{array}\right]$$
In other words, we are calculating the logarithmic values of the two original variables. From now on, any function involving linear transformations of the variables in Z
, will be in fact a non-linear function of the original variables.
In [ ]:
# Obtain matrix Z: the columns of X followed by the logarithms of its two
# original variables (exercise placeholder)
Z = <FILL IN>
In [ ]:
# Check matrix Z through the Test helper
Test.assertEqualsHashed(Z,'d68d0394b57b4583ba95fc669c1c12aeec782410','Incorrect matrix Z')
If you did not do that, repeat the previous exercise, this time using the map()
method together with function log_transform()
:
In [ ]:
def log_transform(x):
return <FILL IN>
Z_map = np.array(map(log_transform,X))
In [ ]:
# Z_map must match the same digest used for matrix Z
Test.assertEqualsHashed(Z_map,'d68d0394b57b4583ba95fc669c1c12aeec782410','Incorrect matrix Z')
Repeat the previous exercise once again using a lambda function:
In [ ]:
# Same construction as with map() and a named function, but with the
# transformation written inline as a lambda (exercise placeholder)
Z_lambda = np.array(map(lambda x: <FILL IN>,X))
In [ ]:
# Z_lambda must match the same digest used for matrix Z
Test.assertEqualsHashed(Z_lambda,'d68d0394b57b4583ba95fc669c1c12aeec782410','Incorrect matrix Z')
Similarly to the previous exercise, now we are interested in obtaining another matrix that will be used to evaluate a polynomial model. In order to do so, compute matrix Z_poly
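as follows:
$$Z_\text{poly} = \left[ \begin{array}{cccc} 1 & x_1^{(1)} & \left(x_1^{(1)}\right)^2 & \left(x_1^{(1)}\right)^3 \\ 1 & x_1^{(2)} & \left(x_1^{(2)}\right)^2 & \left(x_1^{(2)}\right)^3 \\ \vdots & \vdots & \vdots & \vdots \\ 1 & x_1^{(8)} & \left(x_1^{(8)}\right)^2 & \left(x_1^{(8)}\right)^3 \end{array}\right]$$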
Note that, in this case, only the first variable of each pattern is used.
In [ ]:
# Calculate variable Z_poly, using any method that you want
# (exercise placeholder; only the first variable of each pattern is used)
Z_poly = <FILL IN>
In [ ]:
# Check matrix Z_poly through the Test helper
Test.assertEqualsHashed(Z_poly,'ba0f38316dffe901b6c7870d13ccceccebd75201','Wrong variable Z_poly')
Finally, we can use previous data matrices Z
and Z_poly
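to efficiently compute the output of the corresponding non-linear models over all the patterns in the data set. In this exercise, we consider the two following linear-in-the-parameters models to be evaluated:
$$f_\text{log}({\bf x}) = w_0 + w_1 x_1 + w_2 x_2 + w_3 \log(x_1) + w_4 \log(x_2)$$
$$f_\text{poly}({\bf x}) = w_0 + w_1 x_1 + w_2 x_1^2 + w_3 x_1^3$$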
Compute the output of the two models for the particular weights that are defined in the code below. Your output variables f_log
and f_poly
should contain the outputs of the model for all eight patterns in the data set.
In [ ]:
# Weights of the logarithmic model (five values) and of the polynomial model (four values)
w_log = np.array([3.3, 0.5, -2.4, 3.7, -2.9])
w_poly = np.array([3.2, 4.5, -3.2, 0.7])
# Exercise placeholders: outputs of each model for all the patterns in the data set
f_log = <FILL IN>
f_poly = <FILL IN>
In [ ]:
# Check the outputs of both models through the Test helper
Test.assertEqualsHashed(f_log,'cf81496c5371a0b31931625040f460ed3481fb3d','Incorrect evaluation of the logarithmic model')
Test.assertEqualsHashed(f_poly,'05307e30124daa103c970044828f24ee8b1a0bb9','Incorrect evaluation of the polynomial model')
MLlib is Apache Spark's scalable machine learning library. It implements several machine learning methods that can work over data distributed by means of RDDs. The regression methods that are part of MLlib are:
We will just use the first three methods, and we will also work on an implementation of KNN regression over Spark, using the data types provided by MLlib.
In [ ]:
# Import additional libraries for this part
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
DenseVectors
can be created from lists or from numpy arrays
SparseVector
constructor requires three arguments: the length of the vector, an array with the indices of the non-zero coefficients, and the values of such positions (in the same order)
In [ ]:
# We create a sparse vector of length 900, with only 25 non-zero values
# NOTE: this assignment reuses the name Z, replacing any earlier variable called Z
Z = np.eye(30, k=5).flatten()
print 'The dimension of array Z is ', Z.shape
# Create a DenseVector containing the elements of array Z
dense_V = <FILL IN>
# Create a SparseVector containing the elements of array Z
# Nonzero elements are indexed by the following variable idx_nonzero
idx_nonzero = np.nonzero(Z)[0]
sparse_V = <FILL IN>
# Standard matrix operations can be computed on DenseVectors and SparseVectors
# Calculate the square norm of vector sparse_V, by multiplying sparse_V by the transpose of dense_V
print 'The norm of vector Z is', sparse_V.dot(dense_V)
# Optional debugging output, left disabled
#print sparse_V
#print dense_V
In [ ]:
# Check the contents of both vectors through the Test helper
Test.assertEqualsHashed(dense_V,'b331f43b23fda1ac19f5c29ee2c843fab6e34dfa', 'Incorrect vector dense_V')
Test.assertEqualsHashed(sparse_V,'954fe70f3f9acd720219fc55a30c7c303d02f05d', 'Incorrect vector sparse_V')
# Check the types using the class names imported from pyspark.mllib.linalg.
# The bare name `pyspark` is not imported in the visible part of this notebook,
# so spelling out pyspark.mllib.linalg.<Class> here could raise a NameError.
Test.assertEquals(type(dense_V),DenseVector,'Incorrect type for dense_V')
Test.assertEquals(type(sparse_V),SparseVector,'Incorrect type for sparse_V')
The LabeledPoint constructor takes two arguments: the label, and a numpy array / DenseVector / SparseVector containing the features.
In [ ]:
# Create a labeled point with a positive label and a dense feature vector
# (the features are given here as a plain Python list).
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
# Create a labeled point with a negative label and a sparse feature vector
# (sparse_V is the SparseVector built in a previous cell).
neg = LabeledPoint(0.0, sparse_V)
# You can now easily access the label and features of each labeled point:
print 'The label of the first labeled point is', pos.label
print 'The features of the second labeled point are', neg.features