notebook.community

Edit and run



In [141]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from matrix_factorization import matrix_factorization
from graph_init import *
from similarity import *
from create_R import *
from ALS import *
from hard_hfs import *
import copy
import matplotlib.pyplot as plt









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [142]:

    
def RMSE(ground, predict):
    
    error = 0
    n = 0
    
    for i in range(len(ground)):
        for j in range(len(ground[0])):
            if ground[i,j] != 0:
                error += (ground[i,j] - predict[i,j])**2
                n += 1
                
    return np.sqrt(error/n)

def RMSEvec(ground, predict):
    
    error = 0
    n = 0
    
    for i in range(len(ground)):
        if ground[i] != 0:
            error += (ground[i] - predict[i])**2
            n += 1
                
    return np.sqrt(error/n)

def meanError(ground_truth,new_res):
    return np.mean(abs((new_res - ground_truth)[ground_truth!=0]))



In [143]:

    
def dictfromR(R):

    R_dict = {"Users": np.empty([0]), "Movies": np.empty([0]), "Ratings": np.empty([0])}

    for i in range(len(R)):
        for j in range(len(R[0])):
            if R[i,j] != 0:
                R_dict["Users"] = np.append(R_dict["Users"],i)
                R_dict["Movies"] = np.append(R_dict["Movies"],j)
                R_dict["Ratings"] = np.append(R_dict["Ratings"],R[i,j])

    return R_dict



In [144]:

    
# R = [
#      [5,3,0,1],
#      [4,0,0,1],
#      [1,1,0,5],
#      [1,0,0,4],
#      [0,1,5,4],
#     ]

# R = np.array(R)

R,R_dict = create_R()

print(R_dict)









    



/home/marc/Documents/MVA/ProjetGraphes/src/create_R.py:21: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  R[ratingsnp[i,0]-1, ratingsnp[i,-1]] = ratingsnp[i,2]






    



{'Users': array([   0.,    0.,    0., ...,  670.,  670.,  670.]), 'Ratings': array([ 2.5,  3. ,  3. , ...,  4. ,  2.5,  3.5]), 'Movies': array([   30.,   833.,   859., ...,  4597.,  4610.,  4696.])}



In [145]:

    
P_dict = copy.deepcopy(R_dict)
P_dict["Ratings"] = np.ones([len(R_dict["Ratings"])])
P = R > 0
print(P)









    



[[False False False ..., False False False]
 [False False False ..., False False False]
 [False False False ..., False False False]
 ..., 
 [False False False ..., False False False]
 [ True False False ..., False False False]
 [ True False False ..., False False False]]



In [146]:

    
num_cold = 5
cold_movies = np.argsort([np.sum(P[i,:] for i in range(len(P)))][0])[-num_cold:]



In [147]:

    
to_keep = 5
ground_truth = []
sel = []
for i,c in enumerate(cold_movies):
    sel.append(np.where(R[:,c] != 0)[0])
    np.random.shuffle(sel[i])
    sel[i] = sel[i][:len(sel[i])-to_keep]
    ground_truth.append(copy.deepcopy(R[:,c]))
    R[sel[i],c]=0



In [148]:

    
R_dictCopy = copy.deepcopy(R_dict)
R_dict = dictfromR(R)



In [149]:

    
np.where(R_dict['Movies'] == 321)









    Out[149]:





(array([13896, 20198, 34976, 52367, 57123]),)



In [150]:

    
N = len(R)
M = len(R[0])
K = 4

# P = np.random.rand(N,K)
# Q = np.random.rand(M,K)

# nP, nQ = matrix_factorization(R, P, Q, K)

als = ALS(K,N,M,"Users","Movies","Ratings",lbda = 0.1,lbda2 = 0.1)
print("Als created")
ans = als.fit(R_dict)

# nR = np.dot(nP, nQ.T)

# print(nP, "\n\n", nQ)









    



Als created



In [151]:

    
R_rec = np.dot(als.U,np.transpose(als.V))



In [152]:

    
print(RMSE(R,R_rec))
print(RMSE(R,(R_rec-np.min(R_rec))*5/np.max(R_rec-np.min(R_rec))))









    



0.699343983692
0.921908953239



In [153]:

    
for i in range(num_cold):
    print(RMSEvec(ground_truth[i], R_rec[:,321]))









    



0.985627858389
1.02384824386
1.0649895437
1.0872626417
0.9933353707



In [154]:

    
R









    Out[154]:





array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 5.,  0.,  0., ...,  0.,  0.,  0.]])



In [155]:

    
np.max(R_rec)









    Out[155]:





8.7346122364568544



In [156]:

    
lp = LaplacianParams()

# sim = similarity(als.U)
sim = build_graph(als.U, GraphParams())
# Seems to work better with U... 

print(sim)









    



[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]



In [157]:

    
L = build_laplacian(sim,lp)

print(L.shape)



In [158]:

    
supp = 100
test_vec = copy.deepcopy(R[:,321])*2
# test_vec[:supp] = [0 for i in range(supp)]
test_vec.shape









    Out[158]:





(671,)



In [159]:

    
# test_vec



In [160]:

    
hfs0, confidence = simple_hfs(als.U, test_vec, L, sim)
# hfs0/2









    



/home/marc/anaconda3/lib/python3.5/site-packages/numpy/core/numeric.py:190: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  a = empty(shape, dtype, order)



In [161]:

    
maxconfidences = np.array([max(confidence[i,:]) for i in range(len(confidence))])



In [162]:

    
lim = np.percentile(maxconfidences, 1)
for i in range(num_cold):
#     print(RMSEvec(ground_truth[i]*(maxconfidences > lim),hfs0/2))
    print(RMSEvec(ground_truth[i]*(maxconfidences > lim), R_rec[:,cold_movies[i]]))









    



0.967875185436
1.23220560066
0.816880673931
0.839941146207
0.988814010473



In [163]:

    
for i in range(num_cold):
#     print(meanError(ground_truth[i],hfs0/2))
    print(meanError(ground_truth[i],R_rec[:,cold_movies[i]]))









    



0.787283626648
0.943809639659
0.633564022911
0.685569595557
0.780961214644



In [164]:

    
# elmnt = 321
# val = []
# print(RMSEvec(R[:,elmnt],R_rec[:,elmnt]))
# for supp in range(1,671,10):
#     test_vec = copy.deepcopy(R[:,elmnt])*2
#     test_vec[:supp] = [0 for i in range(supp)]

#     hfs0 = simple_hfs(als.U, test_vec, L, sim)
#     val.append(RMSEvec(R[:,elmnt],hfs0/2))
    
# plt.plot(range(1,671,10),val)



In [165]:

    
lhfs = []
lconf = []
for i in range(len(R[0])):
    if i%1000 == 0:
        print(i)
    hfs0, confidence = simple_hfs(als.U, R[:,i]*2, L, sim)
    maxconfidences = np.array([max(confidence[i,:]) for i in range(len(confidence))])
    
    lim = np.percentile(maxconfidences, 95)
    
    lhfs.append(hfs0/2)
    lconf.append(maxconfidences > lim)

R_barre = np.vstack(lhfs).T
confs = np.vstack(lconf).T
    
# R_barre[R_barre < 1] = .5
# R_barre[R_barre > 5] = 5









    



0






    



/home/marc/anaconda3/lib/python3.5/site-packages/numpy/core/numeric.py:190: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  a = empty(shape, dtype, order)






    



1000
2000
3000
4000
5000
6000
7000
8000
9000



In [166]:

    
R_barre_limited = R_barre * confs



In [167]:

    
sum(R_barre_limited)









    Out[167]:





array([   0.,    0.,    0., ...,  170.,  102.,  170.])



In [168]:

    
print(RMSE(R_barre_limited,R_rec))









    



0.500344487029



In [169]:

    
R_barre_final = copy.deepcopy(R_barre_limited)
R_barre_final[R != 0] = 0



In [170]:

    
# R_dict_barre = dictfromR(R_barre_final)



In [171]:

    
N = len(R)
M = len(R[0])
K = 4

als_trans = ALS(K,N,M,"Users","Movies","Ratings",lbda = 0.1,lbda2 = 0.1)
print("Als created")

ans = als_trans.fitTransductive(R_dict,R_barre_final,C1=1,C2=0.1)









    



Als created



In [172]:

    
R_rec_trans = np.dot(als_trans.U,np.transpose(als_trans.V))
print(RMSE(R_rec_trans,R_rec))









    



0.43791864845



In [176]:

    
for i in range(num_cold):
    print("movie "+str(i+1))
    print(RMSEvec(ground_truth[i],R_rec_trans[:,cold_movies[i]]))
    print(RMSEvec(ground_truth[i],R_rec[:,cold_movies[i]]))
    print(RMSEvec(R_barre_limited[:,cold_movies[i]]*(ground_truth[i]>0),ground_truth[i]))









    



movie 1
0.991873083906
0.966816365453
0.947131893418
movie 2
1.18041594714
1.22940789828
1.33278497496
movie 3
0.822343741064
0.81956460168
1.05475115549
movie 4
0.8452482168
0.840963220711
0.82247832083
movie 5
0.822570994987
0.9933353707
1.19023807142

0.930864602802 0.97011494657 1.00036650175 (90 1 0.1)

0.910201784408 0.97011494657 1.00036650175 (90 1 0.5)

0.931048133417 0.97011494657 1.00036650175 (95 1 0.5)



In [174]:

    
R_barre









    Out[174]:





array([[ 4. ,  3. ,  4. , ...,  5. ,  3. ,  5. ],
       [ 4. ,  3. ,  3. , ...,  5. ,  3. ,  5. ],
       [ 4. ,  3. ,  3. , ...,  5. ,  3. ,  5. ],
       ..., 
       [ 5. ,  3.5,  3.5, ...,  5. ,  3. ,  5. ],
       [ 4. ,  4. ,  2. , ...,  5. ,  3. ,  5. ],
       [ 5. ,  3.5,  4. , ...,  5. ,  3. ,  5. ]])



In [ ]: