In [3]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt   # needed for the histogram cell further down
In [1]:
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/'
from TopicSPF import ContentSocialPoissonMF
cspmf = ContentSocialPoissonMF()
In [4]:
# user_friends.dat: (userID, friendID); user_artists.dat: (userID, artistID, listen count);
# user_taggedartists.dat: (userID, artistID, tagID, day, month, year)
userfriends = np.loadtxt(datafolder+"user_friends.dat", delimiter='\t', skiprows=1, dtype=int)
userartists = np.loadtxt(datafolder+"user_artists.dat", delimiter='\t', skiprows=1, dtype=int)
ua_tag = np.loadtxt(datafolder+"user_taggedartists.dat", delimiter='\t', skiprows=1, dtype=int)
artists_id = np.unique(userartists[:,1])
users_id = np.unique(userartists[:,0])
In [8]:
# map the original system IDs to dense 0-based array indices
artists_inv_id = dict(zip(artists_id, xrange(len(artists_id))))
users_inv_id = dict(zip(users_id, xrange(len(users_id))))
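The friend-list construction in the next cell assumes every friend ID in user_friends.dat also appears in user_artists.dat (otherwise the users_inv_id lookup raises a KeyError); a quick sanity check of that assumption, as a sketch:
In [ ]:
# sanity check (sketch): friend IDs that never appear in user_artists.dat would
# break the users_inv_id lookup in the next cell
missing_friends = set(userfriends[:,1]) - set(users_id)
print "friend IDs with no users_inv_id entry:", len(missing_friends)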
In [9]:
### build each user's friend list using array indices rather than the IDs from the original system
list_friends_id = [[users_inv_id[friend_id] for friend_id in userfriends[userfriends[:,0]==u_id][:,1]] for u_id in users_id]
### collect the tag IDs attached to each artist
artists_tags_id = [ua_tag[ua_tag[:,1]==a_id][:,2] for a_id in artists_id]
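The boolean-mask scans above are O(#users × #rows); a sketch of an equivalent construction with a single pandas groupby per table (groupby preserves the per-group row order, so the lists should match):
In [ ]:
# sketch: build the same lists with one groupby pass per table
uf = pd.DataFrame(userfriends, columns=['userID', 'friendID'])
uf['friend_idx'] = uf['friendID'].map(users_inv_id)   # NaN here would flag an unmapped friend ID
friends_by_user = uf.groupby('userID')['friend_idx'].apply(list).to_dict()
list_friends_id2 = [friends_by_user.get(u, []) for u in users_id]

ta = pd.DataFrame(ua_tag[:, [1, 2]], columns=['artistID', 'tagID'])
tags_by_artist = ta.groupby('artistID')['tagID'].apply(lambda s: s.values).to_dict()
artists_tags_id2 = [tags_by_artist.get(a, np.array([], dtype=int)) for a in artists_id]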
In [10]:
print list_friends_id[0]
print artists_tags_id[0]
In [11]:
tags=pd.read_csv(datafolder+"tags.dat", sep='\t')
tags_id = tags['tagID'].values
map_tag_id_name = dict(zip(tags['tagID'].values, map(str, tags['tagValue'].values)))
map_tag_name_id = dict(zip(map(str, tags['tagValue'].values), tags['tagID'].values))
In [12]:
# split each tag value into individual words and strip hyphens (e.g. "hip-hop" -> "hiphop")
generate_artists_tags = lambda lst_tags_id: [val.replace("-","") for x in lst_tags_id for val in map_tag_id_name[x].split()]
In [13]:
generate_artists_tags(artists_tags_id[12])
Out[13]:
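Downstream, these word tokens presumably become a bag-of-words input for ContentSocialPoissonMF; the exact format the class expects isn't shown in this notebook, so the following only sketches counting token occurrences per artist against a vocabulary built from the data:
In [ ]:
# hedged sketch: (artist_idx, word_idx, count) triples from the generated tokens;
# the real input format of ContentSocialPoissonMF may differ
from collections import Counter
artist_tokens = [generate_artists_tags(t) for t in artists_tags_id]
vocab = {w: i for i, w in enumerate(sorted(set(w for toks in artist_tokens for w in toks)))}
wd_entries = [(a_idx, vocab[w], c)
              for a_idx, toks in enumerate(artist_tokens)
              for w, c in Counter(toks).items()]
print len(vocab), "distinct words,", len(wd_entries), "nonzero (artist, word) entries"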
In [60]:
tags_inv_id = dict(zip(tags_id, xrange(len(tags_id))))
In [26]:
n_wd_tags_entries = sum(len(x) for x in artists_tags_id)
n_ratings = userartists.shape[0]   # number of (user, artist) listening records, not array.size
n_neighbors = max(len(x) for x in list_friends_id)
n_words = tags_id.size
n_user = users_id.size
n_item = artists_id.size
In [25]:
print "n_words=",n_words
print "n_user=",n_user
print "n_item=",n_item
print "n_neighbors=",n_neighbors
print "n_wd_tags_entries=",n_wd_tags_entries
print "n_ratings=",n_ratings
In [36]:
k = 100  # number of latent factors (assumed)
memoryusage = k*(3*n_words+6*n_item+3*n_user+4+2*n_ratings+n_wd_tags_entries)+n_neighbors*(3*n_user+n_ratings+1)
print "memory usage (32-bit floats) = ",(memoryusage*32.0)/(8*2**30)," GB"
In [37]:
memoryusage*32.0
Out[37]:
In [35]:
122742800*32.0
Out[35]:
In [ ]:
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/'
from experiment_util import LoadLastFM
loader=LoadLastFM(datafolder)
loader.load()
In [5]:
plt.hist([len(userartists[userartists[:,0]==idx]) for idx in np.unique(userartists[:,0])])
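The same per-user counts can be obtained in one pass with np.unique (numpy >= 1.9), which avoids the per-user boolean scan and lets the axes be labelled; a sketch:
In [ ]:
# sketch: histogram of listening records per user, computed in a single pass
_, counts_per_user = np.unique(userartists[:,0], return_counts=True)
plt.hist(counts_per_user)
plt.xlabel('artists listened to per user')
plt.ylabel('number of users')
plt.show()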