In [4]:
# Using TruncatedSVD (truncated Singular Value Decomposition).
# SVD factors a matrix M into three matrices: M = U * S * Vt,
# where S is diagonal and contains the singular values.
# It is similar to PCA, except that SVD is computed on the data
# matrix directly, while PCA is performed on the covariance matrix.
# Typically, SVD is used to find the principal components of a
# matrix (a quick check of the PCA relationship follows below).

from sklearn.datasets import load_iris
from sklearn.decomposition import TruncatedSVD
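
The PCA/SVD relationship noted above can be verified directly. This is a minimal sketch (it relies on sklearn's PCA, which centers the data internally): the right singular vectors of the centered data matrix match PCA's components up to sign.

In [ ]:
import numpy as np
from scipy.linalg import svd as scipy_svd
from sklearn.decomposition import PCA

X = load_iris().data
X_centered = X - X.mean(axis=0)

# right singular vectors of the centered data matrix...
_, _, Vt = scipy_svd(X_centered, full_matrices=False)

# ...match PCA's principal components, up to sign:
pca = PCA(n_components=2).fit(X)
print(np.allclose(np.abs(pca.components_), np.abs(Vt[:2])))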

In [5]:
iris = load_iris()
iris_data = iris.data
iris_target = iris.target

In [6]:
svd = TruncatedSVD(n_components=2)
iris_transformed = svd.fit_transform(iris_data)
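
As a quick follow-up (using the fitted svd object from the cell above), TruncatedSVD exposes explained_variance_ratio_ after fitting, which shows how much of the data's variance the two components retain:

In [ ]:
# variance retained by each of the two components, and in total
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum())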

In [7]:
iris_data[:5]


Out[7]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])

In [8]:
iris_transformed[:5]


Out[8]:
array([[ 5.91220352, -2.30344211],
       [ 5.57207573, -1.97383104],
       [ 5.4464847 , -2.09653267],
       [ 5.43601924, -1.87168085],
       [ 5.87506555, -2.32934799]])

In [9]:
# How does this work?
# We can do the same thing using scipy alone...

In [12]:
from scipy.linalg import svd  # note: this shadows the TruncatedSVD object above
import numpy as np

In [13]:
D = np.array([[1,2], [1,3], [1,4]])

In [14]:
D


Out[14]:
array([[1, 2],
       [1, 3],
       [1, 4]])

In [15]:
U, S, Vt = svd(D, full_matrices=False)  # note: scipy's svd returns V already transposed

In [20]:
print(U.shape, S.shape, Vt.shape)

print(U, S, Vt)


(3, 2) (2,) (2, 2)
[[-0.39133557  0.8247362 ]
 [-0.5605708   0.13817999]
 [-0.72980603 -0.54837623]] [ 5.64015854  0.43429448] [[-0.29816758 -0.95451354]
 [ 0.95451354 -0.29816758]]

In [17]:
# reconstruct D to show that U, S, and Vt are just a factorization of D:
np.dot(U.dot(np.diag(S)), Vt)


Out[17]:
array([[ 1.,  2.],
       [ 1.,  3.],
       [ 1.,  4.]])
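
One more property worth checking (a small sanity check using the U and Vt computed above): the columns of U and the rows of Vt are orthonormal, which is what makes the truncation below behave like a projection onto the leading components.

In [ ]:
# U'U and Vt*Vt' should both be the 2x2 identity matrix
print(np.allclose(U.T.dot(U), np.eye(2)))
print(np.allclose(Vt.dot(Vt.T), np.eye(2)))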

In [19]:
# The matrix returned by TruncatedSVD is just the dot product
# of the (truncated) U and S matrices.
# To simulate truncation, drop the smallest singular values and the
# corresponding columns of U:
new_S = S[0]
new_U = U[:, 0]
new_U.dot(new_S)


Out[19]:
array([-2.20719466, -3.16170819, -4.11622173])
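
Applying the same manual truncation to the iris data reproduces the TruncatedSVD output from earlier. A sketch, with two caveats: singular vectors are only determined up to sign, and TruncatedSVD's default randomized solver can introduce tiny numerical differences, so the comparison is on absolute values with a loose tolerance.

In [ ]:
# full SVD of the iris data, truncated to the top 2 components
U_i, S_i, Vt_i = svd(iris_data, full_matrices=False)
manual = U_i[:, :2].dot(np.diag(S_i[:2]))

# compare against TruncatedSVD's output, ignoring sign flips
print(np.allclose(np.abs(manual), np.abs(iris_transformed), atol=1e-6))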

In [ ]: