In [1]:
import pandas as pd
from scipy import stats as st
from random import sample
import numpy as np

# Load the ratings table from a comma-separated file; the preview printed
# below shows three columns: user_id, movie_id and rating.
df = pd.read_csv('train1.csv', sep=',')
# Print the first rows as a quick check that the file parsed as expected.
print(df.head())


   user_id  movie_id  rating
0        1         1       5
1        1         2       3
2        1         3       4
3        1         4       3
4        1         5       3

In [4]:
# Module-level row count of the ratings frame (one entry per rating row),
# captured once at the time this cell runs.
l = df['user_id'].size
def reindex(df):
    """Reset ``df``'s index in place to the integers 0..len(df)-1.

    The new index is sized from the frame that is passed in rather than
    from a module-level row count, so the function also works on subsets,
    samples, or any other frame of a different length.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is overwritten (mutated in place).

    Returns
    -------
    None
    """
    df.index = np.arange(len(df))

def randomize(df, random_state=None):
    """Return a row-shuffled copy of ``df`` with a fresh 0..n-1 index.

    The input frame is not modified. The index of the result is rebuilt
    from the shuffled frame itself, so frames of any length are supported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to make the
        shuffle reproducible; the default ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df`` in random order, indexed 0..len(df)-1.
    """
    # frac=1.0 samples every row exactly once, i.e. a permutation of the rows.
    shuffled = df.sample(frac=1.0, random_state=random_state)
    # Drop the old labels so positions and labels agree again.
    return shuffled.reset_index(drop=True)
# Shuffling is left disabled here; uncomment to shuffle the rows of `df`.
#df = randomize(df)
print(df.head())
# Disabled check: smallest number of ratings recorded for any single user.
# (Column name corrected to `user_id`, matching the frame printed above.)
#print(min(df['user_id'].value_counts()))


   user_id  movie_id  rating
0        1         1       5
1        1         2       3
2        1         3       4
3        1         4       3
4        1         5       3

In [22]:
def common_movies(u, v):
    """Return the movies rated by both user ``u`` and user ``v``.

    Rows of the module-level ``df`` belonging to each user are inner-joined
    on ``movie_id``. Columns that exist on both sides receive pandas'
    default ``_x`` (user ``u``) and ``_y`` (user ``v``) suffixes, e.g.
    ``rating_x`` and ``rating_y``.
    """
    rated_by_u = df['user_id'] == u
    rated_by_v = df['user_id'] == v
    return df[rated_by_u].merge(df[rated_by_v], on = 'movie_id', how = 'inner')

# Placeholder list containing one empty list; nothing in the visible cells
# reads or fills it yet.
C = [[]]
# The user_id column as a NumPy array: one entry per rating row, so ids repeat.
# NOTE(review): if distinct user ids are intended, df['user_id'].unique()
# would be the usual choice — confirm against how `users` is consumed.
users = df['user_id'].values

def pcc(u, v):
    """Pearson-correlation similarity between users ``u`` and ``v``.

    The correlation is computed over the ratings of the movies both users
    rated and then mapped linearly from [-1, 1] to [0, 1] via ``(r + 1) / 2``.

    Parameters
    ----------
    u, v : user ids as stored in the ``user_id`` column.

    Returns
    -------
    float
        Similarity in [0, 1], or NaN when the correlation is undefined:
        fewer than two co-rated movies, or one of the users gave the same
        rating to every co-rated movie (zero variance).
    """
    dfm = common_movies(u, v)
    X = dfm['rating_x'].values
    Y = dfm['rating_y'].values
    # Pearson's r needs at least two paired points and non-constant inputs;
    # return NaN explicitly instead of letting scipy raise or warn.
    if len(X) < 2 or np.ptp(X) == 0 or np.ptp(Y) == 0:
        return np.nan
    r = st.pearsonr(X, Y)[0]
    # Rescale from [-1, 1] to [0, 1].
    return (r + 1.0) / 2.0
# Smoke check: rescaled Pearson similarity between users 1 and 2.
print(pcc(1, 2))

def cos(u, v):
    """Cosine similarity between users ``u`` and ``v``.

    The two rating vectors are taken over the movies both users rated
    (the ``rating_x`` / ``rating_y`` columns of ``common_movies(u, v)``).
    The raw cosine is returned without rescaling.

    Parameters
    ----------
    u, v : user ids as stored in the ``user_id`` column.

    Returns
    -------
    float
        ``dot(X, Y) / (|X| * |Y|)``, or NaN when either vector has zero
        norm (including the case of no co-rated movies).
    """
    dfm = common_movies(u, v)
    X = dfm['rating_x'].values.astype(float)
    Y = dfm['rating_y'].values.astype(float)
    denom = np.linalg.norm(X) * np.linalg.norm(Y)
    # Zero norm means the angle is undefined; avoid a division by zero.
    if denom == 0:
        return np.nan
    return np.dot(X, Y) / denom


0.634839972493

In [ ]: