In [10]:
import pandas as pd
import numpy as np
import experiment_util as exper
In [122]:
# Folder holding the raw .dat files. File names are appended by plain string
# concatenation, so the trailing slash is required.
datafolder = '/home/eliezer/Dropbox/repo-phd/poissoncpp/datasets/hetrec2011/lastfm/'

# Every table is tab-separated with a single header row (skipped) and is read as integers.
userfriends = np.loadtxt(datafolder+"user_friends.dat", delimiter='\t',skiprows=1,dtype=int)
userartists = np.loadtxt(datafolder+"user_artists.dat", delimiter='\t',skiprows=1,dtype=int)
ua_tag = np.loadtxt(datafolder+"user_taggedartists.dat", delimiter='\t',skiprows=1,dtype=int)

# Sorted distinct values of columns 1 and 0 of userartists.
artists_id = np.unique(userartists[:,1])
users_id = np.unique(userartists[:,0])

# Largest column-2 value per user, in the same (sorted) order as users_id.
# A single grouped pass replaces the previous per-user boolean mask over the
# whole table, which rescanned every row once for each user.
max_rating_users = pd.Series(userartists[:,2]).groupby(userartists[:,0]).max().values
In [123]:
# Reverse lookups: raw id -> position of that id inside artists_id / users_id.
artists_inv_id = {raw_id: pos for pos, raw_id in enumerate(artists_id)}
users_inv_id = {raw_id: pos for pos, raw_id in enumerate(users_id)}
In [124]:
### Build each user's friend list using array positions (users_inv_id) instead of
### the ids that came from the original system.
list_friends_id = []
for u_id in users_id:
    # Column 1 of the userfriends rows whose column 0 equals this user.
    raw_friend_ids = userfriends[userfriends[:,0] == u_id, 1]
    list_friends_id.append([users_inv_id[raw_friend] for raw_friend in raw_friend_ids])

### Collect, for every artist, column 2 of the ua_tag rows that mention it
### (the tag ids, per the original note).
artists_tags_id = []
for a_id in artists_id:
    artists_tags_id.append(ua_tag[ua_tag[:,1] == a_id, 2])
In [125]:
def _row_to_index_log(row):
    """Map one userartists row to [user position, artist position, ceil(ln(column 2))]."""
    user_pos = users_inv_id[row[0]]
    artist_pos = artists_inv_id[row[1]]
    # NOTE(review): a column-2 value of 1 maps to 0 here (ln(1) == 0) -- confirm that is intended.
    log_count = int(np.ceil(np.log(row[2])))
    return [user_pos, artist_pos, log_count]

# Apply the mapping to every row of userartists.
temp = np.apply_along_axis(_row_to_index_log, 1, userartists)
In [14]:
# Output folder for the transformed files. It is assigned outright: appending
# this absolute path to the previous value of datafolder produced a path made
# of two absolute paths glued together, which does not exist.
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/transformed/'

# Re-index every userartists row to [user position, artist position, ceil(ln(column 2))].
new_userartists_log = np.apply_along_axis(
    lambda x: [users_inv_id[x[0]], artists_inv_id[x[1]], int(np.ceil(np.log(x[2])))],
    axis=1, arr=userartists)

# fmt='%i' keeps the integer table integer on disk, matching the other savetxt
# calls in this notebook (the default format writes floats in scientific notation).
np.savetxt(datafolder+"v1_log_user_artists.dat",new_userartists_log,delimiter='\t',fmt='%i')
In [138]:
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/transformed/'

# Flatten the per-user friend lists into one (user position, friend position) row per pair.
friend_pairs = []
for user_pos, friends in enumerate(list_friends_id):
    for friend_pos in friends:
        friend_pairs.append((user_pos, friend_pos))
new_list_friends_id = np.array(friend_pairs)

# Written as tab-separated integers.
np.savetxt(datafolder+"v1_user_friends.dat", new_list_friends_id, delimiter='\t', fmt='%i')
In [121]:
# Load the dataset through the project-local LoadModify helper.
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/'
from load_convert import LoadModify
loader=LoadModify(datafolder)
loader.load()
# Short aliases for loader attributes; R is the transpose of mat_users_artists_train.
# NOTE(review): shapes and contents of these attributes are defined in load_convert -- confirm there.
R = loader.mat_users_artists_train.T
W = loader.mat_artists_tags
S = loader.list_friends_id
# Re-point datafolder at the folder used for the transformed output files.
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/transformed/'
In [130]:
# Integer copy of the loader's artist/tag matrix: print its shape, then display its first row.
W = np.array(loader.mat_artists_tags, dtype=int)
print(W.shape)
W[0]
Out[130]:
In [131]:
# Display the first entry of the loader's test list.
loader.list_test[0]
Out[131]:
In [132]:
# Display the first entry of the loader's train list.
loader.list_train[0]
Out[132]:
In [141]:
# Compare the first friend list held by the loader with the first one built by hand in this notebook.
print(loader.list_friends_id[0])
print(list_friends_id[0])
In [79]:
#import pickle as pk
#pk.dump(loader, open( loader.rootfolder+"loader.pk", "wb" ) )
In [11]:
# Import the project-local module and reload it so that edits to it take effect in the running kernel.
import load_convert
reload(load_convert)
Out[11]:
In [137]:
datafolder = '/home/eliezer/datasets/hetrec2011/lastfm/transformed/'

# Train / test lists from the loader, written as tab-separated integers.
np.savetxt(datafolder+"user_artist_rating.train", loader.list_train, fmt='%i', delimiter='\t')
np.savetxt(datafolder+"user_artist_rating.test", loader.list_test, fmt='%i', delimiter='\t')

# Friend lists flattened to one (list position, friend entry) row per pair.
friend_pairs = []
for user_pos, friends in enumerate(loader.list_friends_id):
    for friend_pos in friends:
        friend_pairs.append((user_pos, friend_pos))
new_list_friends_id = np.array(friend_pairs)
np.savetxt(datafolder+"v1_user_friends.dat", new_list_friends_id, delimiter='\t', fmt='%i')

# Artist/tag matrix cast to int before writing.
W = np.array(loader.mat_artists_tags, dtype=int)
np.savetxt(datafolder+"tag_artist_count.dat", W, fmt='%i', delimiter='\t')

# keep a list with tags and index
# First column: tags_inv_id lookup of each entry of tags_id; second column: whatever
# generate_artists_tags returns for tags_id (presumably the tag names -- confirm in load_convert).
tag_positions = [loader.tags_inv_id[tag] for tag in loader.tags_id]
tag_labels = loader.generate_artists_tags(loader.tags_id)
tags_id_name = np.array(list(zip(tag_positions, tag_labels)))
tag_table = pd.DataFrame(tags_id_name[:,1], index=tags_id_name[:,0], columns=['tag'])
tag_table.to_csv(datafolder+"tag_id_name.dat", sep='\t', header=False)
In [3]:
# Show the current working directory via the shell.
!pwd
In [ ]:
In [110]:
#np.savetxt(datafolder+"tag_id_name.dat",tags_id_name,delimiter='\t')
In [12]:
# Build a loader on the raw data folder and call load_save with 0.8
# (presumably the train fraction of a train/test split -- confirm in load_convert).
datafolder = '/home/eliezer/Dropbox/repo-phd/poissoncpp/datasets/hetrec2011/lastfm/'
from load_convert import LoadModify
loader=LoadModify(datafolder)
loader.load_save(0.8)
# Column layout of the output files (author's notes; presumably written by load_save):
#   tag_id, tag_name                   -> tag_id_name.dat
#   art_id, tag_id, tag_count          -> tag_artist_count.dat
#   user_id, art_id, user_art_count    -> user_artist_rating.test, user_artist_rating.train
#   user_id, friend_id                 -> v1_user_friends.dat
In [160]:
# Call load_save again on the existing loader with the same 0.80 argument.
loader.load_save(0.80)
# Column layout of the output files (author's notes; presumably written by load_save):
#   tag_id, tag_name                   -> tag_id_name.dat
#   art_id, tag_id, tag_count          -> tag_artist_count.dat
#   user_id, art_id, user_art_count    -> user_artist_rating.test, user_artist_rating.train
#   user_id, friend_id                 -> v1_user_friends.dat
In [3]:
# Read a generated user_artist_rating.test file (tab-separated integers, no header row skipped).
# NOTE(review): the result is stored in `userfriends`, although the file holds ratings, not friendships.
userfriends = np.loadtxt("/home/eliezer/datasets/hetrec2011/lastfm/p85_train_test_9208/user_artist_rating.test", delimiter='\t',dtype=int)
In [7]:
# Display row 11685 of the array currently bound to userfriends.
userfriends[11685]
Out[7]:
In [9]:
# Display the first column of the array currently bound to userfriends.
userfriends[:,0]
Out[9]:
In [13]:
# Build a fresh loader on the raw data folder and call load_save with 0.85
# (presumably the train fraction of a train/test split -- confirm in load_convert).
datafolder = '/home/eliezer/Dropbox/repo-phd/poissoncpp/datasets/hetrec2011/lastfm/'
from load_convert import LoadModify
loader=LoadModify(datafolder)
loader.load_save(0.85)
In [29]:
# Call load_save 30 times with 0.85 and then 30 times with 0.90, building a new
# LoadModify for every call. The two copy-pasted loops differed only in that
# argument, so they are folded into one loop over the two values, and the
# constant data folder is assigned once instead of on every iteration.
# NOTE(review): assumes all three statements of each original loop belonged to
# the loop body (the exported source had lost its indentation) -- confirm.
datafolder = '/home/eliezer/Dropbox/repo-phd/poissoncpp/datasets/hetrec2011/lastfm/'
for split_ratio in (0.85, 0.90):
    for x in range(30):
        loader=LoadModify(datafolder)
        loader.load_save(split_ratio)
In [15]:
import numpy as np
In [28]:
# Print ten strings of the form "p80_train_test_<random integer below 10000>/".
for x in range(10):
    sample_name = "p"+str(int(0.8*100))+"_train_test_"+str(np.random.randint(10000))+'/'
    print(sample_name)
In [32]:
# List the contents of the raw Last.fm data folder via the shell.
!ls /home/eliezer/Dropbox/repo-phd/poissoncpp/datasets/hetrec2011/lastfm
In [35]:
In [ ]:
In [54]:
Out[54]:
In [ ]: