In [141]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from matrix_factorization import matrix_factorization
from graph_init import *
from similarity import *
from create_R import *
from ALS import *
from hard_hfs import *
import copy
import matplotlib.pyplot as plt


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [142]:
def RMSE(ground, predict):
    
    error = 0
    n = 0
    
    for i in range(len(ground)):
        for j in range(len(ground[0])):
            if ground[i,j] != 0:
                error += (ground[i,j] - predict[i,j])**2
                n += 1
                
    return np.sqrt(error/n)

def RMSEvec(ground, predict):
    
    error = 0
    n = 0
    
    for i in range(len(ground)):
        if ground[i] != 0:
            error += (ground[i] - predict[i])**2
            n += 1
                
    return np.sqrt(error/n)

def meanError(ground_truth,new_res):
    return np.mean(abs((new_res - ground_truth)[ground_truth!=0]))

In [143]:
def dictfromR(R):

    R_dict = {"Users": np.empty([0]), "Movies": np.empty([0]), "Ratings": np.empty([0])}

    for i in range(len(R)):
        for j in range(len(R[0])):
            if R[i,j] != 0:
                R_dict["Users"] = np.append(R_dict["Users"],i)
                R_dict["Movies"] = np.append(R_dict["Movies"],j)
                R_dict["Ratings"] = np.append(R_dict["Ratings"],R[i,j])

    return R_dict

In [144]:
# R = [
#      [5,3,0,1],
#      [4,0,0,1],
#      [1,1,0,5],
#      [1,0,0,4],
#      [0,1,5,4],
#     ]

# R = np.array(R)

R,R_dict = create_R()

print(R_dict)


/home/marc/Documents/MVA/ProjetGraphes/src/create_R.py:21: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  R[ratingsnp[i,0]-1, ratingsnp[i,-1]] = ratingsnp[i,2]
{'Users': array([   0.,    0.,    0., ...,  670.,  670.,  670.]), 'Ratings': array([ 2.5,  3. ,  3. , ...,  4. ,  2.5,  3.5]), 'Movies': array([   30.,   833.,   859., ...,  4597.,  4610.,  4696.])}

In [145]:
P_dict = copy.deepcopy(R_dict)
P_dict["Ratings"] = np.ones([len(R_dict["Ratings"])])
P = R > 0
print(P)


[[False False False ..., False False False]
 [False False False ..., False False False]
 [False False False ..., False False False]
 ..., 
 [False False False ..., False False False]
 [ True False False ..., False False False]
 [ True False False ..., False False False]]

In [146]:
num_cold = 5
cold_movies = np.argsort([np.sum(P[i,:] for i in range(len(P)))][0])[-num_cold:]

In [147]:
to_keep = 5
ground_truth = []
sel = []
for i,c in enumerate(cold_movies):
    sel.append(np.where(R[:,c] != 0)[0])
    np.random.shuffle(sel[i])
    sel[i] = sel[i][:len(sel[i])-to_keep]
    ground_truth.append(copy.deepcopy(R[:,c]))
    R[sel[i],c]=0

In [148]:
R_dictCopy = copy.deepcopy(R_dict)
R_dict = dictfromR(R)

In [149]:
np.where(R_dict['Movies'] == 321)


Out[149]:
(array([13896, 20198, 34976, 52367, 57123]),)

In [150]:
N = len(R)
M = len(R[0])
K = 4

# P = np.random.rand(N,K)
# Q = np.random.rand(M,K)

# nP, nQ = matrix_factorization(R, P, Q, K)

als = ALS(K,N,M,"Users","Movies","Ratings",lbda = 0.1,lbda2 = 0.1)
print("Als created")
ans = als.fit(R_dict)

# nR = np.dot(nP, nQ.T)

# print(nP, "\n\n", nQ)


Als created

In [151]:
R_rec = np.dot(als.U,np.transpose(als.V))

In [152]:
print(RMSE(R,R_rec))
print(RMSE(R,(R_rec-np.min(R_rec))*5/np.max(R_rec-np.min(R_rec))))


0.699343983692
0.921908953239

In [153]:
for i in range(num_cold):
    print(RMSEvec(ground_truth[i], R_rec[:,321]))


0.985627858389
1.02384824386
1.0649895437
1.0872626417
0.9933353707

In [154]:
R


Out[154]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 5.,  0.,  0., ...,  0.,  0.,  0.]])

In [155]:
np.max(R_rec)


Out[155]:
8.7346122364568544

In [156]:
lp = LaplacianParams()

# sim = similarity(als.U)
sim = build_graph(als.U, GraphParams())
# Seems to work better with U... 

print(sim)


[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]

In [157]:
L = build_laplacian(sim,lp)

print(L.shape)


(671, 671)

In [158]:
supp = 100
test_vec = copy.deepcopy(R[:,321])*2
# test_vec[:supp] = [0 for i in range(supp)]
test_vec.shape


Out[158]:
(671,)

In [159]:
# test_vec

In [160]:
hfs0, confidence = simple_hfs(als.U, test_vec, L, sim)
# hfs0/2


/home/marc/anaconda3/lib/python3.5/site-packages/numpy/core/numeric.py:190: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  a = empty(shape, dtype, order)

In [161]:
maxconfidences = np.array([max(confidence[i,:]) for i in range(len(confidence))])

In [162]:
lim = np.percentile(maxconfidences, 1)
for i in range(num_cold):
#     print(RMSEvec(ground_truth[i]*(maxconfidences > lim),hfs0/2))
    print(RMSEvec(ground_truth[i]*(maxconfidences > lim), R_rec[:,cold_movies[i]]))


0.967875185436
1.23220560066
0.816880673931
0.839941146207
0.988814010473

In [163]:
for i in range(num_cold):
#     print(meanError(ground_truth[i],hfs0/2))
    print(meanError(ground_truth[i],R_rec[:,cold_movies[i]]))


0.787283626648
0.943809639659
0.633564022911
0.685569595557
0.780961214644

In [164]:
# elmnt = 321
# val = []
# print(RMSEvec(R[:,elmnt],R_rec[:,elmnt]))
# for supp in range(1,671,10):
#     test_vec = copy.deepcopy(R[:,elmnt])*2
#     test_vec[:supp] = [0 for i in range(supp)]

#     hfs0 = simple_hfs(als.U, test_vec, L, sim)
#     val.append(RMSEvec(R[:,elmnt],hfs0/2))
    
# plt.plot(range(1,671,10),val)

In [165]:
lhfs = []
lconf = []
for i in range(len(R[0])):
    if i%1000 == 0:
        print(i)
    hfs0, confidence = simple_hfs(als.U, R[:,i]*2, L, sim)
    maxconfidences = np.array([max(confidence[i,:]) for i in range(len(confidence))])
    
    lim = np.percentile(maxconfidences, 95)
    
    lhfs.append(hfs0/2)
    lconf.append(maxconfidences > lim)

R_barre = np.vstack(lhfs).T
confs = np.vstack(lconf).T
    
# R_barre[R_barre < 1] = .5
# R_barre[R_barre > 5] = 5


0
/home/marc/anaconda3/lib/python3.5/site-packages/numpy/core/numeric.py:190: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  a = empty(shape, dtype, order)
1000
2000
3000
4000
5000
6000
7000
8000
9000

In [166]:
R_barre_limited = R_barre * confs

In [167]:
sum(R_barre_limited)


Out[167]:
array([   0.,    0.,    0., ...,  170.,  102.,  170.])

In [168]:
print(RMSE(R_barre_limited,R_rec))


0.500344487029

In [169]:
R_barre_final = copy.deepcopy(R_barre_limited)
R_barre_final[R != 0] = 0

In [170]:
# R_dict_barre = dictfromR(R_barre_final)

In [171]:
N = len(R)
M = len(R[0])
K = 4

als_trans = ALS(K,N,M,"Users","Movies","Ratings",lbda = 0.1,lbda2 = 0.1)
print("Als created")

ans = als_trans.fitTransductive(R_dict,R_barre_final,C1=1,C2=0.1)


Als created

In [172]:
R_rec_trans = np.dot(als_trans.U,np.transpose(als_trans.V))
print(RMSE(R_rec_trans,R_rec))


0.43791864845

In [176]:
for i in range(num_cold):
    print("movie "+str(i+1))
    print(RMSEvec(ground_truth[i],R_rec_trans[:,cold_movies[i]]))
    print(RMSEvec(ground_truth[i],R_rec[:,cold_movies[i]]))
    print(RMSEvec(R_barre_limited[:,cold_movies[i]]*(ground_truth[i]>0),ground_truth[i]))


movie 1
0.991873083906
0.966816365453
0.947131893418
movie 2
1.18041594714
1.22940789828
1.33278497496
movie 3
0.822343741064
0.81956460168
1.05475115549
movie 4
0.8452482168
0.840963220711
0.82247832083
movie 5
0.822570994987
0.9933353707
1.19023807142

0.930864602802 0.97011494657 1.00036650175 (90 1 0.1)

0.910201784408 0.97011494657 1.00036650175 (90 1 0.5)

0.931048133417 0.97011494657 1.00036650175 (95 1 0.5)


In [174]:
R_barre


Out[174]:
array([[ 4. ,  3. ,  4. , ...,  5. ,  3. ,  5. ],
       [ 4. ,  3. ,  3. , ...,  5. ,  3. ,  5. ],
       [ 4. ,  3. ,  3. , ...,  5. ,  3. ,  5. ],
       ..., 
       [ 5. ,  3.5,  3.5, ...,  5. ,  3. ,  5. ],
       [ 4. ,  4. ,  2. , ...,  5. ,  3. ,  5. ],
       [ 5. ,  3.5,  4. , ...,  5. ,  3. ,  5. ]])

In [ ]: