05 Non-Negative Matrix Factorization Movies


In [1]:
from __future__ import print_function
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import numpy as np
np.set_printoptions(suppress=True)

In [2]:
# Toy ratings data. Each column lines up with one title in `movies`
# (the list is later used as the feature names of the NMF components).
# NOTE(review): despite the name `moviesxuser`, the rows look like raters and
# the columns like movies — confirm the intended orientation.
movies = ["Matrix", "Alien", "Serenity", "Casablanca", "Amelie"]
moviesxuser = np.array([
    [2, 4, 4, 0, 0],
    [0, 5, 5, 0, 0],
    [1, 2, 2, 0, 0],
    [1, 0, 0, 1, 1],
    [3, 0, 0, 3, 3],
    [4, 0, 0, 4, 4],
    [5, 0, 0, 5, 5],
])
n_components = 3  # number of components requested from the NMF model below

print(movies)
print(moviesxuser)


['Matrix', 'Alien', 'Serenity', 'Casablanca', 'Amelie']
[[2 4 4 0 0]
 [0 5 5 0 0]
 [1 2 2 0 0]
 [1 0 0 1 1]
 [3 0 0 3 3]
 [4 0 0 4 4]
 [5 0 0 5 5]]

In [3]:
# Fit the NMF model and time the whole cell (fit + printing W).
# W is the transformed input returned by fit_transform; H is the fitted
# components matrix, so that W.dot(H) approximates `moviesxuser`.
# NOTE(review): recent scikit-learn releases replaced NMF's `alpha` argument
# with `alpha_W` / `alpha_H`; this call assumes an older release — confirm the
# pinned version before re-running.
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,  # fixed seed so the factorisation is repeatable
    alpha=.1,
    l1_ratio=.5,
)
W = nmf.fit_transform(moviesxuser)
H = nmf.components_
print(W)
print("done in %0.3fs." % (time() - t0))


[[ 0.0120135   1.78641894  1.2475875 ]
 [ 0.          2.26289743  0.        ]
 [ 0.00693544  0.89088871  0.611759  ]
 [ 0.4851928   0.          0.        ]
 [ 1.46358762  0.          0.        ]
 [ 1.95278503  0.          0.        ]
 [ 2.44198244  0.          0.        ]]
done in 0.022s.

In [4]:
# Display the fitted model's reconstruction error as the cell's output.
# NOTE(review): per the scikit-learn docs this measures the gap between the
# training matrix and W.dot(H) — confirm the exact norm for the pinned version.
nmf.reconstruction_err_


Out[4]:
0.15656451511696387

In [24]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names for every component of a model.

    One line is printed per row of ``model.components_``, followed by a
    single blank line after the last component.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D arrays
        that support ``argsort()`` and integer indexing.
    feature_names : sequence of str
        Name of each column; indexed by the positions found in a component.
    n_top_words : int
        Maximum number of names listed per component.  Entries whose
        weight is not strictly positive are left out, so fewer names
        may be printed.
    """
    for component_no, weights in enumerate(model.components_):
        # Positions of the n_top_words largest weights, largest first.
        strongest = weights.argsort()[:-n_top_words - 1:-1]
        names = [feature_names[pos] for pos in strongest if weights[pos] > 0]
        header = "Topic #%d: " % component_no
        print(header + " ".join(names))
    print()

In [25]:
# Show the raw component matrix H, then the movie titles that carry
# positive weight in each component (at most n_top_words per component).
n_top_words = 5

print("\nTopics in NMF model (Frobenius norm):")
print(H)

print_top_words(nmf, movies, n_top_words)


Topics in NMF model (Frobenius norm):
[[ 2.03605042  0.          0.          2.03591561  2.03591561]
 [ 0.02494155  2.19299697  2.19299697  0.          0.        ]
 [ 1.48864702  0.04094544  0.04094544  0.          0.        ]]
Topic #0: Matrix Amelie Casablanca
Topic #1: Serenity Alien Matrix
Topic #2: Matrix Serenity Alien


In [7]:
# Print the fitted components matrix; this is the same object that was bound
# to H right after fitting, with one column per entry of `movies`.
print(nmf.components_)


[[ 2.03605042  0.          0.          2.03591561  2.03591561]
 [ 0.02494155  2.19299697  2.19299697  0.          0.        ]
 [ 1.48864702  0.04094544  0.04094544  0.          0.        ]]

In [8]:
# Project one unseen row onto the fitted components: a 3 in the first
# column ("Matrix") and zeros everywhere else.
new_ratings = np.array([[3, 0, 0, 0, 0]])
W_new = nmf.transform(new_ratings)
print(W_new)


[[ 0.01902652  0.          1.92043194]]

In [9]:
# Project one unseen row onto the fitted components: a 5 in the last
# column ("Amelie") and zeros everywhere else.
new_ratings = np.array([[0, 0, 0, 0, 5]])
W_new = nmf.transform(new_ratings)
print(W_new)


[[ 0.81131346  0.          0.        ]]

In [10]:
# Project one unseen row onto the fitted components: a 1 in the "Matrix"
# and "Casablanca" columns, zeros elsewhere.
new_ratings = np.array([[1, 0, 0, 1, 0]])
W_new = nmf.transform(new_ratings)
print(W_new)


[[ 0.24896827  0.          0.3014132 ]]

In [11]:
# Project several unseen rows in a single call; the result has one output
# row per input row.
new_ratings = np.array([
    [3, 0, 0, 0, 0],
    [0, 0, 0, 0, 5],
    [1, 0, 0, 3, 0],
])
W_new = nmf.transform(new_ratings)
print(W_new)


[[ 0.01906499  0.          1.92038056]
 [ 0.81131346  0.          0.        ]
 [ 0.64826063  0.          0.        ]]

In [12]:
# Sanity check: multiply the two factors back together, round to whole
# numbers, and print the result next to the original matrix for comparison.
reconstructed = np.around(W.dot(H))
print(reconstructed)
print(moviesxuser)


[[ 2.  4.  4.  0.  0.]
 [ 0.  5.  5.  0.  0.]
 [ 1.  2.  2.  0.  0.]
 [ 1.  0.  0.  1.  1.]
 [ 3.  0.  0.  3.  3.]
 [ 4.  0.  0.  4.  4.]
 [ 5.  0.  0.  5.  5.]]
[[2 4 4 0 0]
 [0 5 5 0 0]
 [1 2 2 0 0]
 [1 0 0 1 1]
 [3 0 0 3 3]
 [4 0 0 4 4]
 [5 0 0 5 5]]

In [ ]: