To See the Results:

1. Scroll down to the end of the page and select (by clicking) the cell after the "Test the Results" heading.
2. Uncomment the desired statement (remove the leading # character).
3. Press Ctrl+Enter and wait a few seconds.

In [364]:
'''
Created on Feb 21, 2017

@author: mmoham12
'''
import csv
import collections as c
import math 
import operator
import numpy as np
import pandas as pd
import collections as c
from scipy import spatial

Reading raw data from the CSV files.


In [439]:
# User id kept around for ad-hoc inspection of a single profile.
gUserID = '30838'

# Prefix prepended to every data file name; empty means the current working directory.
basePath = ""
tagsFile = basePath + "movie-tags.csv"
titlesFile = basePath + "movie-titles.csv"
ratingFile = basePath + "ratings.csv"


def _readCsvRows(path):
    """Return all rows of the CSV file at `path` as a list of lists of strings.

    The file is opened in a `with` block so the handle is closed as soon as the
    rows are read, and with newline='' as the csv module requires for correct
    handling of newlines embedded in quoted fields.
    """
    with open(path, encoding="ISO-8859-1", newline='') as handle:
        return list(csv.reader(handle))


# movieID, tag
items = _readCsvRows(tagsFile)

itemspd = pd.read_csv(tagsFile, encoding='iso-8859-1', header=None, index_col='movie', names=['movie', 'tag'])

titles = _readCsvRows(titlesFile)

# userID, movieID, rating
ratings = _readCsvRows(ratingFile)

print(len(ratings))
print(len(items))


264505
94875

Calculating the TF for Movie Tags


In [440]:
# Term frequencies per movie, kept as nested dictionaries:
#   { movieID: { tag: number of times the tag was applied to that movie, ... }, ... }
itemCounts = {}

# Document counts per tag:
#   { tag: number of distinct movies that carry the tag, ... }
docCount = {}

for movie_id, tag_name in items:
    movie_tags = itemCounts.setdefault(movie_id, {})

    # A tag only adds to the document count the first time it shows up on a movie.
    first_time_on_movie = tag_name not in movie_tags

    movie_tags[tag_name] = movie_tags.get(tag_name, 0.0) + 1
    docCount[tag_name] = docCount.get(tag_name, 0.0) + (1 if first_time_on_movie else 0)

# log of the number of tagged movies, reused by the IDF computation
logN = math.log(len(itemCounts))

print(len(itemCounts))
print(logN)


2495
7.822044008185619

Calculating the IDF


In [441]:
# IDF of every tag: log(N) - log(number of movies carrying the tag)
docFreq = {name: logN - math.log(n_movies) for name, n_movies in docCount.items()}

# Spot check against a few known tags
print('item 4==========================')
for probe in ('CLV', 'characters', 'chick flick', 'revenge'):
    print(docFreq[probe])


item 4==========================
1.3987970446520999
5.424148735387249
3.814710822953148
3.6025363030095123

Calculating the TF-IDF


In [515]:
# Unit-length TF-IDF vector of every movie: { movieID: { tag: normalized tf-idf, ... }, ... }
itemVectors = {}

for movie, tagVector in itemCounts.items():
    # TF-IDF = TF * IDF, where count = TF and docFreq[tag] = IDF.
    # Computed once and reused for both the norm and the normalized weights.
    rawWeights = {tag: count * docFreq[tag] for tag, count in tagVector.items()}

    # Euclidean length of the raw TF-IDF vector
    vectLen = 0.0
    for tfidf in rawWeights.values():
        vectLen += math.pow(tfidf, 2)
    vectLen = math.sqrt(vectLen)

    if vectLen > 0.0:
        itemVectors[movie] = {tag: tfidf / vectLen for tag, tfidf in rawWeights.items()}
    else:
        # Every tag of this movie has IDF 0 (it appears on all movies), so the
        # vector has no direction; keep it as all zeros instead of dividing by zero.
        itemVectors[movie] = {tag: 0.0 for tag in rawWeights}

print(len(docFreq))

def getItemVector(docID):
    """Print the normalized TF-IDF vector of one movie, largest weight first.

    Parameters:
        docID: movie id, used as a key of the module-level `itemVectors` dict.

    Prints a one-column DataFrame (column 'value', indexed by tag) and returns None.

    Raises:
        KeyError: if `docID` has no entry in `itemVectors`.
    """
    # Look the requested movie up directly; there is no need to materialise the
    # full movies x tags table just to read a single row.
    weights = pd.Series(itemVectors[docID], dtype=float)

    d = pd.DataFrame({'value': weights})
    d = d.sort_values('value', ascending=False)
    print(d)


#df


13103

Calculating the User Profiles


In [503]:
#Building User Profile as { user1 :{ tag1: count1 , tag2:count2, ...} , userID:...}
def buildUserProfiles(userID, minRating=3.5):
    """Build unweighted tag profiles from the movies each user rated highly.

    A user's profile is the sum of the item vectors (from the module-level
    `itemVectors`) of every movie that user rated at or above `minRating`.

    Parameters:
        userID: id of the single user to profile, or '' to profile every user
            found in the module-level `ratings` rows (userID, movieID, rating).
        minRating: lowest rating that counts as a liked movie (default 3.5).

    Returns:
        dict { userID: { tag: summed weight, ... }, ... }. A user whose ratings
        contribute no tags maps to an empty dict; a user with no ratings at all
        is absent from the result.
    """
    if userID != '':
        # Filter first: there is no point sorting rows that are thrown away.
        rows = [row for row in ratings if row[0] == userID]
    else:
        # Stable sort keeps the users in id order in the returned dict.
        rows = sorted(ratings, key=lambda row: row[0])

    userProfiles = {}

    # Iterating over all user ratings and then the item vector for each movie the user has rated
    for user, movie, rate in rows:
        userTagVector = userProfiles.setdefault(user, {})

        # Only liked movies that actually have tags contribute to the profile
        if float(rate) >= minRating and movie in itemVectors:
            # User value of a tag = sum of its tf-idf over the liked movies
            for tname, qt in itemVectors[movie].items():
                userTagVector[tname] = userTagVector.get(tname, 0.0) + float(qt)

    return userProfiles

A function to calculate Item Scores


In [404]:
def calculateScores( u_profiles, m_items, docID):
    """Score items against user profiles with cosine similarity.

    Parameters:
        u_profiles: { userID: { tag: user value, ... }, ... }
        m_items:    { movieID: { tag: item weight, ... }, ... }
        docID:      '' to score every item, otherwise only the item whose id
                    equals docID is scored.

    Returns:
        { userID: { movieID: score, ... }, ... } where the score is 0.0 whenever
        either vector has zero squared length.
    """

    def cosine(profile, item_vector):
        # Squared length of the item vector
        item_sq = 0.0
        for weight in item_vector.values():
            item_sq += math.pow(weight, 2)

        # Dot product and squared length of the profile, walking the user's tags;
        # a tag the item does not have contributes a zero item weight.
        dot = 0
        profile_sq = 0
        for tag, user_weight in profile.items():
            item_weight = item_vector[tag] if tag in item_vector else 0
            dot += (user_weight * item_weight)
            profile_sq += math.pow(user_weight, 2)

        if profile_sq <= 0.0 or item_sq <= 0.0:
            return 0.0
        return dot / (math.sqrt(profile_sq) * math.sqrt(item_sq))

    restrict_to_one = docID != ''
    all_scores = {}

    for userID, userTagVector in u_profiles.items():
        per_movie = all_scores.setdefault(userID, {})

        for movieID, itemTagVector in m_items.items():
            if restrict_to_one and movieID != docID:
                continue
            per_movie[movieID] = cosine(userTagVector, itemTagVector)

    return all_scores

In [520]:
def getUserPredictions(userID):
    """Print the unweighted item scores of one user, best match first.

    Builds the user's profile with buildUserProfiles, scores it against every
    entry of the module-level `itemVectors` with calculateScores, and prints a
    DataFrame of (movie, score) records sorted by descending score.
    Returns None.
    """
    up = buildUserProfiles(userID)

    # calculateScores is the scorer defined in this notebook; the previously
    # referenced calculateScores2 has no definition here.
    unweightedScores = calculateScores(up, itemVectors, '')

    # Scores of the first (only) profiled user; an empty result prints an
    # empty frame instead of failing on a missing first element.
    scores = next(iter(unweightedScores.values()), {})

    sorted_x = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    d = pd.DataFrame.from_records(sorted_x)
    print(d)

In [430]:
def buildWightedProfile(userID):
    """Build rating-weighted tag profiles.

    Every movie a user rated contributes its item vector (from the module-level
    `itemVectors`) scaled by (rating - that user's mean rating), so movies rated
    below the user's average push their tags negative.

    Parameters:
        userID: id of the single user to profile, or '' to profile every user
            found in the module-level `ratings` rows (userID, movieID, rating).

    Returns:
        dict { userID: { tag: weighted value, ... }, ... }; empty when no rating
        row matches.
    """
    if userID != '':
        rows = [row for row in ratings if row[0] == userID]
    else:
        # Stable sort keeps the users in id order in the returned dict.
        rows = sorted(ratings, key=lambda row: row[0])

    # Mean rating per user as { user1: avg }. Each rating is counted exactly once.
    ratingSum = {}
    ratingCount = {}
    for row in rows:
        ratingSum[row[0]] = ratingSum.get(row[0], 0.0) + float(row[2])
        ratingCount[row[0]] = ratingCount.get(row[0], 0) + 1
    userAvg = {user: ratingSum[user] / ratingCount[user] for user in ratingSum}

    weighted_profiles = {}

    # Iterating over all user ratings and then the item vector for each movie the user has rated
    for user, movie, rate in rows:
        userTagVector = weighted_profiles.setdefault(user, {})

        if movie in itemVectors:
            # How far this rating sits above or below the user's own average
            offset = float(rate) - userAvg[user]

            # User value of a tag = sum over rated movies of tf-idf * offset
            for tname, qt in itemVectors[movie].items():
                userTagVector[tname] = userTagVector.get(tname, 0) + qt * offset

    return weighted_profiles

In [523]:
def getWeighUserPredictions(userID):
    """Print the rating-weighted item scores of one user, best match first.

    Builds the weighted profile of `userID` with buildWightedProfile, scores it
    against every entry of the module-level `itemVectors` with calculateScores,
    and prints a DataFrame of (movie, score) records sorted by descending score.
    Returns None.
    """
    # Profile the user that was asked for, not a fixed id.
    up = buildWightedProfile(userID)
    weightedScores = calculateScores(up, itemVectors, '')

    # Scores of the first (only) profiled user; an empty result prints an
    # empty frame instead of failing on a missing first element.
    scores = next(iter(weightedScores.values()), {})

    sorted_x = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    d = pd.DataFrame.from_records(sorted_x)
    print(d)

Test the Results


In [527]:
# Driver cell: one call is active and the commented-out lines are the
# alternatives; enable one by removing its leading '#' and re-run the cell.
getItemVector('2231')
#getUserPredictions('320')
#getWeighUserPredictions('320')


                   value
poker           0.597000
Edward Norton   0.552893
Matt Damon      0.315939
John Turturro   0.260617
gambling        0.245714
John Malkovich  0.239701
card games      0.125276
John Dahl       0.114175
cards           0.107681
2.5             0.067883
watched 2006    0.061096
library vhs     0.060524