In [ ]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
import scipy.sparse.linalg as linalg
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
#import matplotlib as mpl
import seaborn as sns
%matplotlib inline
The data is generated using the sample data generators summarized at:
http://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets
More specifically, we use the datasets.make_low_rank_matrix() function, which generates a mostly low-rank matrix with bell-shaped singular values.
In [ ]:
data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=2, tail_strength=0.0, random_state=None)
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
We start our analysis with the (non-sparse) SVD provided by numpy: http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
In [ ]:
U, s, V = np.linalg.svd(data)
print(U.shape, s.shape, V.shape)
plt.plot(s[1:10])
plt.ylabel('singular value')
plt.xlabel('index of singular value')
In [ ]:
#printing the singular values
print(s)
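As a quick sanity check (a small sketch, relying only on the shapes returned by np.linalg.svd with the default full_matrices=True): the data matrix can be reconstructed exactly from its SVD factors.
In [ ]:
# U is 100x100 and V is 50x50, so only the first 50 columns of U contribute;
# U[:, :50] * s scales the columns of U by the singular values
print(np.allclose(data, np.dot(U[:, :50] * s, V)))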
Computing the error of representing the data by the top-k singular vectors and singular values, when the remaining n-k singular values are set to 0 (which means that the corresponding singular vectors are also ignored).
In [ ]:
errors = np.zeros(50)
for i in range(50):
    # zero out the i smallest singular values
    s[-1:-(i+1):-1] = np.zeros(i)
    # rebuild the (100 x 50) singular-value matrix
    S = np.vstack([np.diag(s[0:50]), np.zeros((50, 50))])
    approx_d = np.dot(U, np.dot(S, V))
    # Frobenius norm of the difference between the data and its approximation
    errors[i] = np.linalg.norm(data - approx_d)
print(errors)
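As a sanity check (a minimal sketch, not part of the original analysis): by the Eckart-Young theorem, the Frobenius-norm error of the best rank-k approximation equals the square root of the sum of the squared discarded singular values, so the errors computed above can be predicted directly from the singular values.
In [ ]:
# recompute the SVD, since s was zeroed out in place in the loop above
_, s_fresh, _ = np.linalg.svd(data)
# errors[i] corresponds to dropping the i smallest singular values
predicted = np.array([np.sqrt(np.sum(s_fresh[50-i:]**2)) for i in range(50)])
print(np.allclose(errors, predicted))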
Plotting the error as a function of the number of ignored singular values/vectors
In [ ]:
plt.plot(errors)
plt.ylabel('Error')
plt.xlabel('# of ignored singular values')
Plotting the data projected onto the top-2 right singular vectors (i.e., making our data 2-dimensional)
In [ ]:
U, s, V = np.linalg.svd(data)
k=2
Uk = U[:,0:k]
Vk = V[0:k,:]
Sk = np.diag(s[0:k])
datak = np.dot(Uk,Sk)
plt.plot(datak[:,0],datak[:,1],'bo')
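A quick check (a small sketch, using the conventions of np.linalg.svd, where the rows of V are the right singular vectors): the 2-dimensional coordinates Uk*Sk coincide with the projection of the data rows onto the top-k right singular vectors, i.e. data.dot(Vk.T).
In [ ]:
# the two ways of computing the k-dimensional projection should coincide
print(np.allclose(datak, np.dot(data, Vk.T)))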
We again use the sample data generators summarized at:
http://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets
This time we use the function datasets.make_blobs(), which generates data from isotropic Gaussian blobs:
In [ ]:
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=30,
                          center_box=(-3.0, 3.0), random_state=0,
                          cluster_std=[3.0]*3)  # one standard deviation per center
Plotting the dataset using our favorite heatmaps
In [ ]:
sns.heatmap(X,xticklabels=False, yticklabels=False, linewidths=0)
In [ ]:
def svd_projection(data, k):
    """Project the rows of data onto its top-k right singular vectors."""
    U, s, V = np.linalg.svd(data)
    Uk = U[:, 0:k]
    Vk = V[0:k, :]
    Sk = np.diag(s[0:k])
    datak = np.dot(Uk, Sk)
    return datak
Taking the projection of the data onto the top-k (k=5) right singular vectors
In [ ]:
svdX = svd_projection(X,5)
Visualizing the projected data in all pairs of dimensions
In [ ]:
svdX_df = pd.DataFrame(svdX)
g = sns.PairGrid(svdX_df)
g.map(plt.scatter)
Showing the low-dimensional dataset in a heatmap
In [ ]:
k=3
U, s, V = np.linalg.svd(data)
Uk = U[:,0:k]
Vk = V[0:k,:]
Sk = np.diag(s[0:k])
newX = np.dot(np.dot(Uk,Sk),Vk)
print newX.shape
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(15,10))
sns.heatmap(newX, xticklabels=False, yticklabels=False, linewidths=0, ax=ax1, cbar=False)
sns.heatmap(np.dot(Uk,Sk), xticklabels=False, yticklabels=False, linewidths=0, ax=ax2, cbar=False)
In [ ]:
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=1)
kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
# reorder the rows by cluster label so the clusters show up as blocks in the heatmap
idx = np.argsort(labels)
rX = X[idx, :]
sns.heatmap(rX, xticklabels=False, yticklabels=False, linewidths=0, cbar=False)
ri = metrics.adjusted_rand_score(labels, y)
print(ri)
In [ ]:
svdX = svd_projection(X,3)
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=1)
kmeans.fit_predict(svdX)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
idx = np.argsort(labels)
rX = svdX[idx,:]
sns.heatmap(rX, xticklabels=False, yticklabels=False, linewidths=0, cbar=False)
ri = metrics.adjusted_rand_score(labels, y)
print(ri)
In [ ]:
from sklearn.datasets import fetch_20newsgroups
"""
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space', 'rec.sport.baseball'
]"""
categories = ['alt.atheism', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)
stemmed_data = news_data.data
In [ ]:
print(news_data.target_names)
print(news_data.target)
From a notebook cell you need to run nltk.download() and, in the interface that appears, select the appropriate packages to download: stopwords from Corpora, and punkt and snowball_data from Models.
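Alternatively (a small sketch, assuming the standard nltk package identifiers), the same resources can be downloaded non-interactively:
In [ ]:
import nltk
# download the resources needed for tokenization and stemming
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('snowball_data')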
In [ ]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# stem every word of every message, keeping one space-separated string per message
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmed_data = [" ".join(stemmer.stem(word)
                         for sent in sent_tokenize(message)
                         for word in word_tokenize(sent))
                for message in news_data.data]
In [ ]:
print(stemmed_data[0])
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, max_df=0.9, ngram_range=(1, 2))
vectors = vectorizer.fit_transform(stemmed_data)
In [ ]:
print(type(vectors), vectors.shape)
#terms = vectorizer.get_feature_names()
#print(terms)
In [ ]:
k=3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(vectors)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
In [ ]:
ri = metrics.adjusted_rand_score(labels,news_data.target)
print(ri)
Our data is now sparse, so we need to use svds from scipy.sparse.linalg, which works directly with sparse matrices
In [ ]:
#computing the singular value decomposition
k = 5
U, s, V = linalg.svds(vectors, k, which='LM')
print(U.shape, V.shape, s.shape)
# svds returns the singular values in ascending order, so we reverse them
print(s[::-1])
In [ ]:
#plotting the first k singular values
plt.plot(range(1, len(s)+1), s[::-1])
plt.ylabel('singular value')
plt.xlabel('index of singular value')
Again, we plot all pairs of the projected dimensions to visualize the behavior of our data in each of the 2-dimensional subspaces we consider.
In [ ]:
Xk = U*sp.sparse.diags(s,0)
X_df = pd.DataFrame(Xk)
g = sns.PairGrid(X_df)
g.map(plt.scatter)
In [ ]:
with sns.axes_style("white"):
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    cmap = sns.hls_palette(n_colors=6, h=0.35, l=0.4, s=0.9)
    # plot each newsgroup with its own color, using the 3rd and 4th singular dimensions
    for i, label in enumerate(set(news_data.target)):
        point_indices = np.where(news_data.target == label)[0]
        point_indices = point_indices.tolist()
        plt.scatter(Xk[point_indices, 2], Xk[point_indices, 3], s=20, alpha=0.5,
                    c=cmap[i], marker='D', label=news_data.target_names[i])
    plt.legend(prop={'size': 14}, loc=2)
    sns.despine()
In [ ]:
print(V.shape)
#print(V)
sns.heatmap(V, xticklabels=False, yticklabels=False, linewidths=0)
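To see what the right singular vectors capture, we can inspect the terms with the largest weights in each of them (a small sketch; depending on your scikit-learn version, get_feature_names() may instead be get_feature_names_out()):
In [ ]:
terms = np.array(vectorizer.get_feature_names())
# svds returns the components in ascending order of singular value, so reverse them
for i, component in enumerate(V[::-1]):
    top = np.argsort(component)[::-1][:10]
    print(i, " ".join(terms[top]))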
In [ ]:
vectorsk = U[:, 2:4] * sp.sparse.diags(s[2:4], 0)
print(vectorsk.shape, type(vectorsk))
k=3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10, random_state=0)
kmeans.fit_predict(vectorsk)
centroidsk = kmeans.cluster_centers_
labelsk = kmeans.labels_
errork = kmeans.inertia_
In [ ]:
ri = metrics.adjusted_rand_score(labelsk,news_data.target)
print(ri)
In [ ]:
k = 5
tsvd = TruncatedSVD(n_components=k)
Xk = tsvd.fit_transform(vectors)
X_df = pd.DataFrame(Xk)
g = sns.PairGrid(X_df)
g.map(plt.scatter)
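As a quick consistency check (a minimal sketch; the singular_values_ attribute is only available in recent scikit-learn versions), the singular values found by TruncatedSVD should approximately match the ones returned by scipy's svds:
In [ ]:
# TruncatedSVD reports its singular values in descending order,
# while svds returned them in ascending order (hence the reversal)
print(tsvd.singular_values_)
print(s[::-1])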
In [ ]:
k = 10
tsvd = TruncatedSVD(n_components=k)
Xk = tsvd.fit_transform(vectors)
k = 4
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=0)
kmeans.fit_predict(Xk)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
ri = metrics.adjusted_rand_score(labels, news_data.target)
print(ri)
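To get a feel for how the number of retained components affects the clustering, here is a small exploratory sketch (not part of the original notebook) that repeats the TruncatedSVD + k-means pipeline for a few values of n_components and reports the adjusted Rand index for each:
In [ ]:
for n_comp in [2, 3, 5, 10, 20]:
    Xc = TruncatedSVD(n_components=n_comp, random_state=0).fit_transform(vectors)
    km = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=0)
    km.fit_predict(Xc)
    print(n_comp, metrics.adjusted_rand_score(km.labels_, news_data.target))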
In [ ]:
print(type(X))
In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    styles = open("custom.css", "r").read()
    return HTML(styles)
css_styling()