In [ ]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
#import matplotlib as mpl
import seaborn as sns
%matplotlib inline
In [ ]:
# Synthetic 100 x 50 matrix with an approximately low-rank structure.
# random_state=None leaves the generator unseeded, so each run draws a new matrix.
low_rank_params = dict(
    n_samples=100,
    n_features=50,
    effective_rank=10,
    tail_strength=0.1,
    random_state=None,
)
data = sk_data.make_low_rank_matrix(**low_rank_params)
# Show the raw matrix entries as a heatmap, without tick labels or cell borders.
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
In [ ]:
# Full SVD of the synthetic matrix.  np.linalg.svd returns the right singular
# vectors already transposed, so `V` holds V^T and data == U . diag(s) . V.
U, s, V = np.linalg.svd(data)
# %-formatting prints the same text under Python 2 and Python 3.
print("%s %s %s" % (U.shape, s.shape, V.shape))
# `s` contains singular values (not eigenvalues), sorted in descending order.
plt.plot(s)
plt.ylabel('singular value')
plt.xlabel('singular value index')
In [ ]:
# Parenthesised form prints the same text under Python 2 and Python 3.
print(s)
In [ ]:
# Reconstruction error as a function of how many of the smallest singular
# values are discarded: errors[i] is the Frobenius norm of
# (data - approximation built from the len(s) - i largest singular values).
n_values = len(s)
errors = np.zeros(n_values)
for i in range(n_values):
    kept = n_values - i
    # Slicing (instead of zeroing entries of `s` in place) leaves `s` intact,
    # so the cell can be re-run and later cells still see the full spectrum.
    # `V` is V^T as returned by np.linalg.svd, hence the row slice.
    approx_d = np.dot(U[:, :kept] * s[:kept], V[:kept, :])
    # Compare against `data`, the matrix that was actually decomposed.
    errors[i] = np.linalg.norm(data - approx_d)
print(errors)
In [ ]:
# Reconstruction error against the number of discarded singular values.
error_axes = plt.gca()
error_axes.plot(errors)
error_axes.set_ylabel('Error')
error_axes.set_xlabel('# of ignored singular values')
In [ ]:
from sklearn.datasets import fetch_20newsgroups

# Alternative four-topic selection, kept for reference:
#   'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
categories = [
    'alt.atheism',
    'sci.space',
    'rec.sport.baseball',
]
# Load only the training split, restricted to the topics listed above.
news_data = fetch_20newsgroups(subset='train', categories=categories)
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the posts.  English stop words are removed; a float
# min_df is a document-frequency proportion, so terms present in fewer than
# 20% of the documents are dropped.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=0.2,
)
# Learn the vocabulary and build the document-term matrix in one pass.
vectors = vectorizer.fit_transform(news_data.data)
In [ ]:
# Container type and (documents, terms) shape of the TF-IDF matrix.
# %-formatting prints the same text under Python 2 and Python 3.
print("%s %s" % (type(vectors), vectors.shape))
In [ ]:
# `import scipy as sp` alone does not guarantee that the sparse.linalg
# submodule is loaded; import it explicitly so the attribute lookup succeeds.
import scipy.sparse.linalg

# Truncated SVD of the sparse TF-IDF matrix, keeping 10 singular triplets.
U, s, V = sp.sparse.linalg.svds(vectors, k=10)
In [ ]:
import scipy.sparse.linalg as linalg

# Truncated SVD keeping the 10 largest-magnitude ('LM') singular values.
U, s, V = linalg.svds(vectors, 10, which='LM')
# %-formatting / parenthesised print give the same text under Python 2 and 3.
print("%s %s %s" % (U.shape, V.shape, s.shape))
print(s)
In [ ]:
# Plot the singular values from svds against a 1-based index.
# s[::-1] reverses the array; this presumably turns an ascending result into
# descending order -- the ordering returned by svds should be confirmed.
plt.plot(range(1, len(s) + 1), s[::-1])
plt.ylabel('singular value')
plt.xlabel('singular value index')
In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML


def css_styling():
    """Return the notebook's custom stylesheet wrapped in an HTML display object.

    Reads ``../theme/custom.css``, resolved relative to the current working
    directory, and passes its contents to ``HTML``.
    """
    # The context manager closes the file handle even if reading fails.
    with open("../theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()
Out[1]: