In [1]:
import pandas as pd
from scipy import stats as st
from random import sample
import numpy as np
# Read the ratings table from disk (a comma is already read_csv's default
# separator) and print its first rows as a quick sanity check.
df = pd.read_csv('train1.csv')
print(df.head())
In [4]:
# Row count of the loaded frame, measured along the user_id column.
l = df['user_id'].values.shape[0]
def reindex(df):
    """Replace ``df.index`` in place with the consecutive integers 0..len(df)-1.

    The new index length is taken from the frame that is passed in, so the
    function works for a frame of any size (including an empty one).

    Args:
        df: pandas DataFrame whose index is overwritten.

    Returns:
        None. The frame is mutated; its columns and row order are unchanged.
    """
    df.index = np.arange(len(df))
def randomize(df, random_state=None):
    """Return a row-shuffled copy of ``df`` carrying a fresh 0..n-1 index.

    Args:
        df: pandas DataFrame to shuffle. The caller's frame is not modified.
        random_state: optional seed (or random generator) forwarded to
            ``DataFrame.sample``. Pass a fixed value to get the same shuffle
            on every run; the default ``None`` gives a different order each
            time.

    Returns:
        A new DataFrame with every row of ``df`` in random order, indexed
        0..len(df)-1.
    """
    shuffled = df.sample(frac=1.0, random_state=random_state)
    # Discard the shuffled original labels and renumber from this frame's own
    # length.
    return shuffled.reset_index(drop=True)
# Shuffling is currently switched off; re-enable the next line to randomize the
# row order before the preview below.
#df = randomize(df)
print(df.head())
# NOTE(review): the disabled check below reads column 'userId', while every
# other cell uses 'user_id' - fix the column name before re-enabling it.
#print(min(df['userId'].value_counts()))
In [22]:
def common_movies(u, v, ratings=None):
    """Pair up the rows of users ``u`` and ``v`` on the movies both appear with.

    Args:
        u: ``user_id`` whose rows supply the ``*_x`` columns of the result.
        v: ``user_id`` whose rows supply the ``*_y`` columns of the result.
        ratings: optional DataFrame with at least ``user_id`` and ``movie_id``
            columns. When omitted, the notebook-level ``df`` is used, which
            keeps existing two-argument calls working unchanged.

    Returns:
        The inner merge of the two users' rows on ``movie_id``: one row per
        matching pair of rows, with every other column present twice under
        the ``_x`` (user ``u``) and ``_y`` (user ``v``) suffixes. The frame is
        empty when the users share no movie.
    """
    frame = df if ratings is None else ratings
    rows_u = frame.loc[frame['user_id'] == u]
    rows_v = frame.loc[frame['user_id'] == v]
    return pd.merge(rows_u, rows_v, how='inner', on='movie_id')
# C starts out as a list holding a single empty list.
# NOTE(review): it is not used in the visible cells - presumably a placeholder
# for a similarity matrix; confirm before relying on it.
C = [list()]
# user_id value of every row in df (one entry per row, not de-duplicated).
users = df.loc[:, 'user_id'].values
def pcc(u, v):
    """Pearson-correlation similarity of users ``u`` and ``v``, rescaled to [0, 1].

    The correlation is computed over the ``rating_x`` / ``rating_y`` columns of
    ``common_movies(u, v)`` and mapped from [-1, 1] to [0, 1] via
    ``(r + 1) / 2``.

    Args:
        u: ``user_id`` of the first user.
        v: ``user_id`` of the second user.

    Returns:
        A float in [0, 1], or ``nan`` when the correlation is undefined:
        fewer than two shared movies, or one user's shared ratings are all
        identical (zero variance).
    """
    shared = common_movies(u, v)
    X = shared['rating_x'].values
    Y = shared['rating_y'].values
    # pearsonr raises ValueError for fewer than two samples. Report that case
    # as "undefined" (nan) so one sparse user pair cannot abort a larger loop.
    if len(X) < 2:
        return float('nan')
    # A constant sequence has zero variance; pearsonr would emit a warning and
    # return nan, so short-circuit with the same result.
    if (X == X[0]).all() or (Y == Y[0]).all():
        return float('nan')
    r = st.pearsonr(X, Y)[0]
    return (r + 1.0) / 2.0
# Smoke check: print the similarity between the users with ids 1 and 2.
print(pcc(u=1, v=2))
def cos(u, v):
    # NOTE(review): unfinished stub. The body is a bare reference to `dfm`,
    # which is assigned neither in this function nor at module level in the
    # visible cells, so calling cos() would raise NameError.
    # Presumably meant to become a cosine similarity alongside pcc - TODO
    # confirm the intent and implement.
    dfm
In [ ]: