Singular Value Decomposition - I



In [ ]:

    
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn.cluster import KMeans



#import matplotlib as mpl
import seaborn as sns
%matplotlib inline

Generating low-rank data



In [ ]:

    
# Synthetic 100 x 50 matrix whose spectrum is dominated by roughly
# `effective_rank` singular values. A fixed random_state makes the matrix, and
# therefore every plot derived from it below, reproducible across
# Restart-and-Run-All.
data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=10, tail_strength=0.1, random_state=0)
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)

NumPy SVD: https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html



In [ ]:

    
# Full SVD of the synthetic matrix. np.linalg.svd returns the right factor
# already transposed, so `V` holds the right singular vectors as rows and the
# reconstruction is U * diag(s) * V (no extra transpose).
U, s, V = np.linalg.svd(data)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("%s %s %s" % (U.shape, s.shape, V.shape))

# Spectrum of the matrix: these are singular values, not eigenvalues.
plt.plot(s)
plt.ylabel('singular value')
plt.xlabel('singular value index')



In [ ]:

    
# Show the singular values; print(...) with one argument works on Python 2 and 3.
print(s)



In [ ]:

    
# Reconstruction error when the i smallest singular values are ignored,
# for i = 0 .. len(s) - 1. np.linalg.norm on a 2-D array is the Frobenius norm.
n_values = len(s)
errors = np.zeros(n_values)
for i in range(n_values):
    kept = n_values - i
    # Build the rank-`kept` approximation from the leading singular triplets.
    # Slicing (instead of zeroing entries of `s` in place) leaves `s` intact,
    # so this cell gives the same result every time it is re-run.
    approx_d = np.dot(U[:, :kept] * s[:kept], V[:kept, :])
    # Compare against `data`, the matrix that was actually decomposed.
    errors[i] = np.linalg.norm(data - approx_d)
print(errors)



In [ ]:

    
# Approximation error as a function of how many trailing singular values
# were dropped from the reconstruction.
fig, ax = plt.subplots()
ax.plot(errors)
ax.set_ylabel('Error')
ax.set_xlabel('# of ignored singular values')

Using real data (the 20 Newsgroups dataset)



In [ ]:

    
from sklearn.datasets import fetch_20newsgroups

# Alternative category selection, kept for reference:
# categories = [
#     'alt.atheism',
#     'talk.religion.misc',
#     'comp.graphics',
#     'sci.space',
# ]

# Restrict the corpus to three newsgroups and load only the training split.
categories = ['alt.atheism', 'sci.space', 'rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)



In [ ]:

    
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw posts into a TF-IDF document-term matrix. English stop words
# are removed, and min_df=0.2 is a document-frequency cutoff expressed as a
# fraction of the documents.
vectorizer = TfidfVectorizer(
    min_df=0.2,
    stop_words='english',
)
vectors = vectorizer.fit_transform(news_data.data)



In [ ]:

    
# Report the container type and shape of the TF-IDF matrix. A single formatted
# argument keeps the output identical on Python 2 and Python 3.
print("%s %s" % (type(vectors), vectors.shape))



In [ ]:

    
# Import the submodule explicitly: `import scipy as sp` alone does not
# guarantee that `sp.sparse.linalg` is loaded, so the attribute lookup would
# only work if another library happened to import it first.
import scipy.sparse.linalg

# Truncated SVD of the sparse TF-IDF matrix, keeping k=10 singular triplets.
U, s, V = scipy.sparse.linalg.svds(vectors, k=10)

Sparse SVD: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.svds.html



In [ ]:

    
import scipy.sparse.linalg as linalg

# Truncated SVD of the TF-IDF matrix: k=10 singular triplets, with which='LM'
# requesting the largest-magnitude singular values.
U, s, V = linalg.svds(vectors, k=10, which='LM')
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print("%s %s %s" % (U.shape, V.shape, s.shape))
print(s)



In [ ]:

    
# Plot the truncated spectrum against a 1-based index. `s` is reversed first;
# presumably svds returns the values in ascending order, so this puts the
# largest one on the left (confirm against the SciPy docs).
plt.plot(range(1, len(s) + 1), s[::-1])
# These are singular values of the TF-IDF matrix, not eigenvalues.
plt.ylabel('singular value')
plt.xlabel('singular value index')



In [1]:

    
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    """Load the notebook's custom stylesheet and wrap it for display.

    Reads ``../theme/custom.css`` (resolved against the current working
    directory) and returns its contents wrapped in an IPython ``HTML`` object.

    Raises:
        IOError/OSError: if the stylesheet cannot be opened.
    """
    # The context manager closes the file handle as soon as the contents are
    # read, instead of leaving it open until garbage collection.
    with open("../theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()









    Out[1]: