In [22]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
def PlotHeatmap(X):
    """Draw the 2-D array ``X`` as a grayscale heatmap without axis ticks.

    The figure is sized proportionally to the matrix (a third of an inch
    per cell) and the colour scale is pinned to the interval [0, 1].
    """
    cells_per_inch = 3.
    n_rows, n_cols = X.shape[0], X.shape[1]
    plt.figure(figsize=(n_cols / cells_per_inch, n_rows / cells_per_inch))
    plt.imshow(X, vmin=0, vmax=1, interpolation='nearest')
    # Reversed gray map, applied to the image that was just drawn.
    plt.set_cmap('gray_r')
    axes = plt.gca()
    axes.set_xticks([])
    axes.set_yticks([])
    plt.show()
# Dimensions of the toy factorisation Y (N x M) = A (N x R) . B (R x M)
N = 8
M = 10
R = 1

# Squared uniform draws; every column of A is rescaled to sum to one.
A = np.square(np.random.rand(N, R))
A = A / A.sum(axis=0)

# Same construction for B, except that every row is rescaled to sum to one.
B = np.square(np.random.rand(R, M))
B = B / B.sum(axis=1, keepdims=True)

Y = np.dot(A, B)

# Show both factors followed by their product.
for matrix in (A, B, Y):
    PlotHeatmap(matrix)
In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
import scipy.linalg as la
import scipy.misc
# Fetch SciPy's 'ascent' sample image and show it in grayscale.
# NOTE(review): recent SciPy releases provide this image through
# scipy.datasets instead of scipy.misc -- confirm against the installed version.
X = sc.misc.ascent()
fig = plt.figure(figsize=(7, 7))
plt.set_cmap('gray')
plt.imshow(X)
plt.show()
In [9]:
# Reconstruct the image from its SVD with an increasing number of
# singular values (ranks 1, 3, ..., 19) and plot each approximation.
U, S, Vt = la.svd(X)
M, N = X.shape
for rnk in range(1, 20, 2):
    # Keep the first rnk singular values and zero out the remainder.
    Sr = np.zeros_like(S)
    Sr[:rnk] = S[:rnk]
    # Embed the truncated spectrum in an M x N "diagonal" matrix.
    Sig = la.diagsvd(Sr, M, N)
    A2 = U.dot(Sig).dot(Vt)
    fig = plt.figure(figsize=(5, 5))
    plt.set_cmap('gray')
    plt.imshow(A2)
    plt.show()
In [10]:
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pylab as plt
import numpy as np
# Load the three tables of the ml-100k data set (presumably MovieLens 100k).
# None of the files carries a header row, so column names are supplied here.

# Ratings: tab-separated
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', header=None, names=rnames)

# Items: pipe-separated; descriptive fields followed by one column per genre
inames = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv('data/ml-100k/u.item', sep='|', header=None, names=inames)

# Users: pipe-separated
unames = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', header=None, names=unames)
users
Out[10]:
In [2]:
# Dense user x movie rating matrix; NaN marks "no rating recorded".
# IDs in the ratings table start at 1, hence the -1 when they are used as
# array positions and the use of the largest ID as the matrix size.
NRows = ratings['user_id'].max()
NCols = ratings['movie_id'].max()
X = np.full((NRows, NCols), np.nan)

# One vectorised scatter replaces the former row-by-row loop, which relied
# on the `.ix` indexer that no longer exists in current pandas.
rating_rows = ratings['user_id'].values - 1
rating_cols = ratings['movie_id'].values - 1
X[rating_rows, rating_cols] = ratings['rating'].values
In [4]:
# Look at a small window of the rating matrix: rows 6..15, columns 3..19.
Data = X[6:16, 3:20]
plt.imshow(Data, vmin=0, vmax=5, interpolation='nearest')
plt.colorbar()
plt.set_cmap('jet')
axes = plt.gca()
axes.set_xlabel('Movies')
axes.set_ylabel('Users')
plt.show()
In [17]:
# Label-based slice (both end points included); `.loc` stands in for the
# `.ix` indexer, which has been removed from pandas.
users.loc[0:100]
Out[17]:
In [5]:
def nmf_kl_multiplicative(D, M, W, H, EPOCH=1):
    """Masked non-negative matrix factorisation D ~= W.dot(H) by multiplicative updates.

    Parameters
    ----------
    D : ndarray, shape (Nr, Nc)
        Data matrix. Entries where ``M == 0`` are ignored and may be NaN.
    M : ndarray, shape (Nr, Nc)
        Observation mask: 0 for a missing entry, 1 for an observed one.
    W : ndarray, shape (Nr, R)
        Initial left factor. Not modified in place.
    H : ndarray, shape (R, Nc)
        Initial right factor. Not modified in place.
    EPOCH : int, optional
        Number of alternating (W, then H) update sweeps.

    Returns
    -------
    (W, H) : tuple of ndarray
        The updated factors.

    Notes
    -----
    A row (or column) without any observed entry gives a 0/0 update ratio,
    which used to turn the corresponding factor entries into NaN. Such
    entries are now left unchanged, and the reconstruction is floored at
    machine epsilon before dividing by it.
    """
    # Work on a copy so the caller's data keeps its missing-value markers.
    MD = D.copy()
    MD[M == 0] = 0
    floor = np.finfo(float).eps

    for _ in range(EPOCH):
        # --- update W with H held fixed ---
        Xhat = np.maximum(W.dot(H), floor)
        w_num = (MD / Xhat).dot(H.T)
        w_den = np.dot(M, H.T)
        # Factor 1.0 (no change) wherever nothing was observed.
        W = W * np.where(w_den > 0, w_num / np.maximum(w_den, floor), 1.0)

        # --- update H with the new W held fixed ---
        Xhat = np.maximum(W.dot(H), floor)
        h_num = W.T.dot(MD / Xhat)
        h_den = np.dot(W.T, M)
        H = H * np.where(h_den > 0, h_num / np.maximum(h_den, floor), 1.0)
    return W, H
In [8]:
# Rank of the factorisation
R = 3

# Shape of the data block
Nr, Nc = Data.shape

# Random positive starting point for both factors
W = 100 * np.random.rand(Nr, R)
H = 100 * np.random.rand(R, Nc)

# Mask: 1 where a value is present, 0 where Data is NaN
Mask = (~np.isnan(Data)).astype(Data.dtype)

W, H = nmf_kl_multiplicative(Data, Mask, W, H, EPOCH=1000)
Xhat = np.dot(W, H)
def ShowMatrix(X, title=''):
    """Plot ``X`` in a new figure as a 'jet' heatmap with a colourbar.

    The colour scale is fixed to the range 0..5, the axes are labelled
    'Movies' (x) and 'Users' (y), and ``title`` is used as the plot title.
    """
    plt.figure()
    plt.imshow(X, vmin=0, vmax=5, interpolation='nearest')
    plt.colorbar()
    plt.set_cmap('jet')
    axes = plt.gca()
    axes.set_xlabel('Movies')
    axes.set_ylabel('Users')
    axes.set_title(title)
    plt.show()
# Put the data block next to its reconstruction from the fitted factors.
ShowMatrix(Data, title='original')
ShowMatrix(Xhat, title='estimate')
(Quoted from the README of the HetRec 2011 Last.fm 2K dataset)
This dataset contains social networking, tagging, and music artist listening information
from a set of 2K users from Last.fm online music system.
http://www.last.fm
The dataset is released in the framework of the 2nd International Workshop on
Information Heterogeneity and Fusion in Recommender Systems (HetRec 2011)
http://ir.ii.uam.es/hetrec2011
at the 5th ACM Conference on Recommender Systems (RecSys 2011)
http://recsys.acm.org/2011
* 1892 users
* 17632 artists
* 12717 bi-directional user friend relations, i.e. 25434 (user_i, user_j) pairs
* avg. 13.443 friend relations per user
* 92834 user-listened artist relations, i.e. tuples [user, artist, listeningCount]
* avg. 49.067 artists most listened by each user
* avg. 5.265 users who listened each artist
* 11946 tags
* 186479 tag assignments (tas), i.e. tuples [user, tag, artist]
* avg. 98.562 tas per user
* avg. 14.891 tas per artist
* avg. 18.930 distinct tags used by each user
* avg. 8.764 distinct tags used for each artist
In [2]:
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
In [3]:
# Create ListeningCount(user, artist) data
df = pd.read_csv('./data/hetrec2011-lastfm-2k/user_artists.dat', sep='\t')

# Translate raw IDs to dense 0-based positions (and back), in ascending ID order.
sorted_user_ids = sorted(df['userID'].unique())
sorted_artist_ids = sorted(df['artistID'].unique())

userID2idx = {uid: pos for pos, uid in enumerate(sorted_user_ids)}
idx2userID = dict(enumerate(sorted_user_ids))
artistID2idx = {aid: pos for pos, aid in enumerate(sorted_artist_ids)}
idx2artistID = dict(enumerate(sorted_artist_ids))

NumOfUsers = len(userID2idx)
NumOfArtists = len(artistID2idx)

# Sparse users x artists matrix, filled one record at a time.
X_lil = sparse.lil_matrix((NumOfUsers, NumOfArtists))
for rec in df.itertuples():
    # rec[0] is the frame index; rec[1], rec[2], rec[3] are the first three columns.
    row = userID2idx[int(rec[1])]
    col = artistID2idx[int(rec[2])]
    X_lil[row, col] = float(rec[3])
In [4]:
# Sparsity pattern of the users x artists matrix; narrow the slice to zoom in.
plt.figure(figsize=(12, 4))
plt.spy(X_lil[:, :], markersize=1)
plt.show()
In [5]:
# Artist ID <-> artist name lookup tables.
df = pd.read_csv('./data/hetrec2011-lastfm-2k/artists.dat', sep='\t')
# Pair the two columns directly; this avoids the `.ix` indexer, which has
# been removed from pandas. If an ID or a name occurs more than once, the
# last occurrence wins, exactly as before.
artistID2artistName = dict(zip(df['id'], df['name']))
artistName2artistID = dict(zip(df['name'], df['id']))
artistName2artistID
Out[5]:
In [100]:
# Get artists listened by a user and the listeningCount
i = 1860
uID = idx2userID[i]
# A single pre-formatted argument prints identically on Python 2 and 3.
print('User: %s' % (uID,))
# Column positions of the entries stored in row i
a_idx = X_lil[i, :].nonzero()[1]
for j in a_idx:
    print('%s %s' % (artistID2artistName[idx2artistID[j]], X_lil[i, j]))
In [7]:
# Users listening to an artist
artistName = 'Morrissey'
j = artistID2idx[artistName2artistID[artistName]]
# Row positions of the entries stored in column j
idx = X_lil[:, j].nonzero()[0]
for i in idx:
    # print() with one argument behaves the same on Python 2 and 3.
    print(idx2userID[i])
In [74]:
# Column totals of the listening-count matrix, i.e. one total per artist.
sm = np.array(np.sum(X_lil, axis=0))[0]
idx_sorted = np.array(sm.argsort())
# argsort is ascending, so walk the last 300 positions backwards to list
# the artists with the largest totals first.
for i in reversed(idx_sorted[-300:]):
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print('%s %s' % (artistID2artistName[idx2artistID[i]], sm[i]))
In [8]:
# Tag lookup tables: name <-> ID, and ID <-> dense 0-based position.
df = pd.read_csv('./data/hetrec2011-lastfm-2k/tags.dat', sep='\t')

# One pass over the rows; rec[1] holds the tag ID and rec[2] the tag name.
tagName2tagID = {}
tagID2tagName = {}
for rec in df.itertuples():
    tagName2tagID[rec[2]] = rec[1]
    tagID2tagName[rec[1]] = rec[2]

# Positions follow ascending tag ID.
sorted_tag_ids = sorted(tagID2tagName.keys())
tagID2idx = {tid: num for num, tid in enumerate(sorted_tag_ids)}
idx2tagID = dict(enumerate(sorted_tag_ids))

NumOfTags = len(tagID2tagName)
NumOfTags
Out[8]:
In [9]:
# Load the tab-separated, timestamped tag-assignment records into `df`
# (this rebinds the name used for the previously loaded tables) and display the frame.
df = pd.read_csv('./data/hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat', sep='\t')
df
Out[9]:
In [10]:
# Artists referenced by the tag assignments (column r[2]) that have no
# matrix index yet.
new_artists = set()
for r in df.itertuples():
    if r[2] not in artistID2idx:
        new_artists.add(r[2])

# Append the new artists after the existing indices. Walking them in
# sorted order keeps the index assignment reproducible from run to run.
nextArtistID = NumOfArtists
for u in sorted(new_artists):
    if u not in artistID2artistName:
        # No name on record for this ID: register a placeholder.
        name = "Unknown_"+str(u)
        # A single pre-formatted argument prints identically on Python 2 and 3.
        print('%s %s' % (name, u))
        artistID2artistName[u] = name
        artistName2artistID[name] = u
    # Every new artist receives an index, named or not.
    artistID2idx[u] = nextArtistID
    idx2artistID[nextArtistID] = u
    nextArtistID += 1
In [83]:
# Rows of the matrix are addressed through artistID2idx, so the row count
# has to cover every assigned index as well as every named artist.
NumOfArtistsExtended = max(len(artistID2artistName), len(artistID2idx))

# Distinct (user, artist, tag) index triples; the constant value 1 means
# the dict is used as a set of keys.
TG = {(userID2idx[r[1]], artistID2idx[r[2]], tagID2idx[r[3]]): 1 for r in df.itertuples()}

# Count, per (artist, tag) pair, the number of distinct triples containing it.
ArtistTimesTagged = sparse.lil_matrix((NumOfArtistsExtended, NumOfTags))
for k in TG:  # iterating a dict yields its keys on Python 2 and 3 alike
    i = k[1]
    j = k[2]
    ArtistTimesTagged[i, j] += 1
In [87]:
# Sparsity pattern of the artists x tags count matrix.
plt.figure(figsize=(12, 20))
plt.spy(ArtistTimesTagged[:, :], markersize=1)
axes = plt.gca()
axes.set_ylabel('Artists')
axes.set_xlabel('Tags')
plt.show()
In [122]:
# List the tags attached to one artist, with their counts.
# Other artist names to try here:
#   'Brad Mehldau', 'Baba Zula', 'Britney Spears', 'Kurban'
i = artistID2idx[artistName2artistID['Nancy Sinatra']]
aID = idx2artistID[i]
# A single pre-formatted argument prints identically on Python 2 and 3.
print('Artist: %s' % (artistID2artistName[aID],))
# Column positions of the entries stored in row i
tag_idx = ArtistTimesTagged[i, :].nonzero()[1]
for j in tag_idx:
    print('%s %s' % (tagID2tagName[idx2tagID[j]], ArtistTimesTagged[i, j]))
In [76]:
# Compare the number of artists indexed from the listening data (printed)
# with the number of artist names currently on record (cell output).
print(NumOfArtists)
len(artistID2artistName)
Out[76]:
In [12]:
# Friendship pairs as index tuples built from the first two columns of
# user_friends.dat. Every key maps to the constant 1, so the dict serves
# as a set of pairs.
df = pd.read_csv('./data/hetrec2011-lastfm-2k/user_friends.dat', sep='\t')
SN = dict.fromkeys(((userID2idx[r[1]], userID2idx[r[2]]) for r in df.itertuples()), 1)
In [150]:
import time
# Render a millisecond Unix timestamp as a readable GMT date string.
timestamp = 1241128800000
# Whole seconds are enough here; drop the millisecond part explicitly.
time.strftime("%a %d %b %Y %H:%M:%S GMT", time.gmtime(timestamp // 1000))
Out[150]: