In [3]:
import pandas as pd
import numpy as np
import scipy as sp

In [1]:
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/'
from TopicSPF import ContentSocialPoissonMF
cspmf = ContentSocialPoissonMF()

In [4]:
userfriends = np.loadtxt(datafolder+"user_friends.dat", delimiter='\t',skiprows=1,dtype=int)
userartists = np.loadtxt(datafolder+"user_artists.dat", delimiter='\t',skiprows=1,dtype=int)
ua_tag = np.loadtxt(datafolder+"user_taggedartists.dat", delimiter='\t',skiprows=1,dtype=int)

artists_id = np.unique(userartists[:,1])
users_id = np.unique(userartists[:,0])

In [8]:
artists_inv_id=dict(zip(artists_id,xrange(len(artists_id))))
users_inv_id=dict(zip(users_id,xrange(len(users_id))))

In [9]:
### gera a lista de amigos usando o indice do array e nao o id que veio do sistema
list_friends_id= [ [users_inv_id[friend_id] for friend_id in userfriends[userfriends[:,0]==u_id][:,1]] for u_id in users_id]

### pegar o indice de todas as tags de um artista
artists_tags_id= [ ua_tag[ua_tag[:,1]==a_id][:,2] for a_id in artists_id ]

In [10]:
print list_friends_id[0]
print artists_tags_id[0]


[257, 400, 482, 709, 772, 838, 1102, 1103, 1121, 1203, 1431, 1466, 1689]
[ 552 1219  139  141 2850  139  141  179  541  139  141  179  541  139  139]

In [11]:
tags=pd.read_csv(datafolder+"tags.dat", sep='\t')
tags_id = tags['tagID'].get_values()
map_tag_id_name=dict(zip(tags['tagID'].get_values(),map(str,tags['tagValue'].get_values())))
map_tag_name_id=dict(zip(map(str,tags['tagValue'].get_values()),tags['tagID'].get_values()))

In [12]:
generate_artists_tags = lambda lst_tags_id: [val.replace("-","") for x in lst_tags_id for val in map_tag_id_name[x].split()]

In [13]:
generate_artists_tags(artists_tags_id[12])


Out[13]:
['electroindustrial',
 'ebm',
 'dark',
 'electro',
 'industrial',
 'dark',
 'electro',
 'industrial',
 'ebm',
 'dark',
 'electro',
 'electroindustrial',
 'industrial',
 'ebm',
 'ebm',
 'dark',
 'electro',
 'electronic',
 'industrial',
 'dark',
 'electro',
 'harsh',
 'ebm',
 'ebm',
 'friends',
 'industrial',
 'ebm',
 'aggrotech']

In [60]:
tags_inv_id = dict(zip(tags_id, xrange(len(tags_id))))

In [26]:
n_wd_tags_entries=sum(len(x) for x in artists_tags_id)
n_ratings = userartists.size
n_neighbors = max(len(x) for x in list_friends_id)
n_words=tags_id.size
n_user=users_id.size
n_item=artists_id.size

In [25]:
print "n_words=",n_words
print "n_user=",n_user
print "n_item=",n_item
print "n_neighbors=",n_neighbors
print "n_wd_tags_entries=",n_wd_tags_entries
print "n_ratings=",n_ratings


n_words= 11946
n_user= 1892
n_item= 17632
n_neighbors= 119
n_wd_tags_entries= 184941
n_ratings= 278502

In [36]:
k=100
memoryusage=k*(3*n_words+6*n_item+3*n_user+4+2*n_ratings+n_wd_tags_entries)+n_neighbors*(3*n_user+n_ratings+1)
print "memory usage (floats) = ",(memoryusage*32.0)/(2**30)," GB"


memory usage (floats) =  3.65802052617  GB

In [37]:
memoryusage*32.0


Out[37]:
3927769632.0

In [35]:
122742800*32.0


Out[35]:
3927769600.0

In [ ]:
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/'
from experiment_util import LoadLastFM
loader=LoadLastFM(datafolder)
loader.load()


train percentage =  80.0

In [5]:
plt.hist([len(userartists[userartists[:,0]==idx]) for idx in np.unique(userartists[:,0])])



NameErrorTraceback (most recent call last)
<ipython-input-5-40341a4ce73c> in <module>()
----> 1 plt.hist([len(userartists[userartists[:,0]==idx]) for idx in np.unique(userartists[:,0])])

NameError: name 'plt' is not defined

In [40]:


In [ ]:


In [ ]: