In [364]:
    
'''
Created on Feb 21, 2017
@author: mmoham12
'''
import csv
import collections as c
import math 
import operator
import numpy as np
import pandas as pd
import collections as c
from scipy import spatial
    
In [439]:
    
# ---- Configuration and data loading ----------------------------------------
# Default user id used by the commented-out debugging statements further down.
gUserID = '30838'

# Directory prefix for the input files; empty means the current working
# directory (e.g. "E:\\workspaces\\RecommSys_A2\\data\\").
basePath = ""
tagsFile = basePath + "movie-tags.csv"
titlesFile = basePath + "movie-titles.csv"
ratingFile = basePath + "ratings.csv"

# Every file is read inside a `with` block so the handle is closed as soon as
# its rows are materialised. newline="" is what the csv module asks for, so
# that newlines embedded in quoted fields are parsed correctly.

# Rows of the tag file; later cells unpack each row as (movie, tag).
with open(tagsFile, encoding="ISO-8859-1", newline="") as csvFile:
    reader = csv.reader(csvFile)
    items = list(reader)

# The same tag file as a DataFrame indexed by movie id.
itemspd = pd.read_csv(tagsFile, encoding='iso-8859-1', header=None,
                      index_col='movie', names=['movie', 'tag'])

# Rows of the titles file.
with open(titlesFile, encoding="ISO-8859-1", newline="") as csvFile:
    reader = csv.reader(csvFile)
    titles = list(reader)

# Rows of the ratings file; later cells unpack each row as
# (userID, movieID, rating).
with open(ratingFile, encoding="ISO-8859-1", newline="") as csvFile:
    reader = csv.reader(csvFile)
    ratings = list(reader)

print(len(ratings))
print(len(items))
    
    
In [440]:
    
# Raw term frequencies per movie:
#   itemCounts = { movieID: { tag: count, ... }, ... }
itemCounts = {}

# Document frequency per tag, i.e. how many distinct movies carry the tag:
#   docCount = { tag: numberOfMovies, ... }
docCount = {}

for movie, tag in items:
    # Fetch this movie's tag vector, creating an empty one on first sight.
    tagVector = itemCounts.setdefault(movie, {})

    # A tag contributes to the document frequency only the first time it is
    # applied to a given movie; repeats just raise the term frequency.
    add = 0 if tag in tagVector else 1

    # Term frequency of the tag within the current movie.
    tagVector[tag] = tagVector.get(tag, 0.0) + 1

    # Document frequency of the tag across all movies.
    docCount[tag] = docCount.get(tag, 0.0) + add

# log(N), where N is the number of distinct movies that have at least one tag.
logN = math.log(len(itemCounts))
print (len(itemCounts))
print(logN)
    
    
In [441]:
    
# Inverse document frequency per tag:
#   idf(tag) = log(N) - log(df(tag)) = log(N / df(tag))
docFreq = {tag: logN - math.log(count) for tag, count in docCount.items()}

# Testing item 4: print the IDF of a few hand-picked tags.
print('item 4==========================')
for sampleTag in ('CLV', 'characters', 'chick flick', 'revenge'):
    print( docFreq[sampleTag])
    
    
In [515]:
    
# Build a unit-length TF-IDF vector for every movie:
#   itemVectors = { movieID: { tag: normalized tf-idf, ... }, ... }
itemVectors = {}
for movie, tagVector in itemCounts.items():
    # TF-IDF = TF * IDF, where count is the TF and docFreq[tag] the IDF.
    # Computed once and reused for both the norm and the normalization.
    rawWeights = {tag: count * docFreq[tag] for tag, count in tagVector.items()}

    # Euclidean length of the raw TF-IDF vector.
    vectLen = 0.0
    for weight in rawWeights.values():
        vectLen += math.pow(weight, 2)
    vectLen = math.sqrt(vectLen)

    if vectLen > 0.0:
        itemVectors[movie] = {tag: weight / vectLen
                              for tag, weight in rawWeights.items()}
    else:
        # Every tag of this movie has IDF 0 (the tag is carried by all
        # movies), so the vector has no direction. Keep an all-zero vector
        # instead of dividing by zero.
        itemVectors[movie] = {tag: 0.0 for tag in rawWeights}

print(len(docFreq))
def getItemVector(docID):
    """Print the normalized TF-IDF tag vector of one movie, strongest tag first.

    Parameters
    ----------
    docID : str
        Movie id; looked up as a key of the module-level ``itemVectors`` dict.

    Returns
    -------
    None
        The vector is printed as a one-column DataFrame (column ``value``,
        indexed by tag).

    Raises
    ------
    KeyError
        If ``docID`` has no entry in ``itemVectors``.
    """
    # Use the requested movie (the id used to be hard-coded) and build the
    # frame from that movie's vector only, rather than materialising a dense
    # movies x tags table just to read a single row.
    weights = pd.Series(itemVectors[docID], dtype=float).dropna()
    d = pd.DataFrame({'value': weights}).sort_values('value', ascending=False)
    print(d)
#df
    
    
In [503]:
    
#Building User Profile as { user1 :{ tag1: count1 , tag2:count2, ...} , userID:...}
def buildUserProfiles(userID, minRating=3.5):
    """Build unweighted tag profiles from the movies each user liked.

    A user's profile is the sum of the item vectors (from the module-level
    ``itemVectors``) of every movie the user rated at least ``minRating``.

    Parameters
    ----------
    userID : str
        User to build a profile for, or ``''`` to build profiles for every
        user found in the module-level ``ratings`` rows.
    minRating : float, optional
        Lowest rating that counts as "liked" (default 3.5).

    Returns
    -------
    dict
        ``{ user: { tag: score, ... }, ... }``. A user whose ratings never
        qualify maps to an empty dict; a ``userID`` with no rating rows
        yields an empty result.
    """
    # Filter first, then sort: there is no point ordering rows that are
    # about to be discarded.
    if userID != '':
        rows = [row for row in ratings if row[0] == userID]
    else:
        rows = ratings
    rows = sorted(rows, key=lambda row: row[0])

    userProfiles = {}
    # Each row is expected to unpack as (userID, movieID, rating).
    for user, movie, rate in rows:
        # Register the user even when none of their ratings qualify.
        userTagVector = userProfiles.setdefault(user, {})

        # Only sufficiently liked movies that have a tag vector contribute.
        if float(rate) >= minRating and movie in itemVectors:
            for tname, qt in itemVectors[movie].items():
                userTagVector[tname] = userTagVector.get(tname, 0.0) + float(qt)

    # The previous version looked up userProfiles[userID] here only to build
    # an unused sorted list; that lookup raised KeyError in all-users mode
    # (userID == '') and for users without ratings.
    return userProfiles
    
In [404]:
    
def calculateScores( u_profiles, m_items, docID):
    """Score movies for users by cosine similarity of their tag vectors.

    Parameters
    ----------
    u_profiles : dict
        ``{ userID: { tag: value, ... }, ... }`` user profiles.
    m_items : dict
        ``{ movieID: { tag: weight, ... }, ... }`` item vectors.
    docID : str
        Restrict scoring to this movie id, or ``''`` to score every movie.
        An id that is not in ``m_items`` yields no scores.

    Returns
    -------
    dict
        ``{ userID: { movieID: score, ... }, ... }``. The score is 0.0
        whenever either vector has zero length.
    """
    # Pick the candidate movies up front instead of scanning every movie and
    # skipping the non-matching ones inside the inner loop.
    if docID != '':
        candidates = {docID: m_items[docID]} if docID in m_items else {}
    else:
        candidates = m_items

    # Item norms do not depend on the user, so compute each exactly once.
    itemNorms = {}
    for movieID, itemTagVector in candidates.items():
        qs = 0.0
        for weight in itemTagVector.values():
            qs += math.pow(weight, 2)
        itemNorms[movieID] = math.sqrt(qs)

    m_itemscores = {}
    for userID, userTagVector in u_profiles.items():
        # Likewise the user norm does not depend on the movie.
        ps = 0.0
        for value in userTagVector.values():
            ps += math.pow(value, 2)
        userNorm = math.sqrt(ps)

        userScores = m_itemscores.setdefault(userID, {})

        for movieID, itemTagVector in candidates.items():
            itemNorm = itemNorms[movieID]

            if userNorm <= 0.0 or itemNorm <= 0.0:
                # A zero-length vector has no direction; define the score as 0.
                userScores[movieID] = 0.0
                continue

            # Dot product: only tags present in both vectors contribute.
            # Accumulated in the user's tag order with a plain loop so the
            # floating-point result matches a tag-by-tag summation.
            dot = 0.0
            for tag, value in userTagVector.items():
                if tag in itemTagVector:
                    dot += value * itemTagVector[tag]

            userScores[movieID] = dot / (userNorm * itemNorm)

    return m_itemscores
    
In [520]:
    
def getUserPredictions(userID):
    """Print movie scores for a user's unweighted profile, best match first.

    Builds the profile with ``buildUserProfiles``, scores every movie in the
    module-level ``itemVectors`` with ``calculateScores`` and prints the
    (movie, score) pairs as a DataFrame sorted by descending score.

    Parameters
    ----------
    userID : str
        User to predict for. When several profiles are produced (e.g. for
        ``''``), only the first profile's scores are printed.
    """
    up = buildUserProfiles(userID)
    # The scorer defined in this notebook is `calculateScores`; the
    # `calculateScores2` name used here before is not defined anywhere in it.
    unweightedScores = calculateScores(up, itemVectors, '')
    u = list(unweightedScores.values())

    # No profile at all means nothing to rank: print an empty frame rather
    # than failing on u[0].
    ranked = sorted(u[0].items(), key=lambda x: x[1], reverse=True) if u else []
    d = pd.DataFrame.from_records(ranked)
    print(d)
    
In [430]:
    
def buildWightedProfile(userID):
    """Build rating-weighted tag profiles.

    Every rated movie that has an entry in the module-level ``itemVectors``
    contributes its item vector scaled by ``rating - mean rating of that
    user``, so movies rated below the user's average pull their tags down.

    Parameters
    ----------
    userID : str
        User to build a profile for, or ``''`` to build profiles for every
        user found in the module-level ``ratings`` rows.

    Returns
    -------
    dict
        ``{ user: { tag: weight, ... }, ... }``; empty when no rating row
        matches ``userID``.
    """
    # Filter first, then sort: there is no point ordering rows that are
    # about to be discarded.
    if userID != '':
        rows = [row for row in ratings if row[0] == userID]
    else:
        rows = ratings
    rows = sorted(rows, key=lambda row: row[0])

    # Mean rating per user: { user: average }. Accumulating totals and counts
    # needs no special handling for an empty row list or a change of user.
    totals = {}
    counts = {}
    for row in rows:
        totals[row[0]] = totals.get(row[0], 0.0) + float(row[2])
        counts[row[0]] = counts.get(row[0], 0) + 1
    userAvg = {user: totals[user] / counts[user] for user in totals}

    # One dict under one name: the previous version initialised
    # `weighted_profiles` but then read and wrote an undefined
    # `weightedu_profiles`.
    weighted_profiles = {}
    # Each row is expected to unpack as (userID, movieID, rating).
    for user, movie, rate in rows:
        userTagVector = weighted_profiles.setdefault(user, {})

        if movie in itemVectors:
            # How far this rating sits above or below the user's own mean.
            deviation = float(rate) - userAvg[user]
            for tname, qt in itemVectors[movie].items():
                userTagVector[tname] = userTagVector.get(tname, 0) + qt * deviation

    return weighted_profiles
    
In [523]:
    
def getWeighUserPredictions(userID):
    """Print movie scores for a user's rating-weighted profile, best first.

    Builds the profile with ``buildWightedProfile``, scores every movie in
    the module-level ``itemVectors`` with ``calculateScores`` and prints the
    (movie, score) pairs as a DataFrame sorted by descending score.

    Parameters
    ----------
    userID : str
        User to predict for. When several profiles are produced (e.g. for
        ``''``), only the first profile's scores are printed.
    """
    # Use the requested user; the id used to be hard-coded to '320'.
    up = buildWightedProfile(userID)
    weightedScores = calculateScores(up, itemVectors, '')
    u = list(weightedScores.values())

    # No profile at all means nothing to rank: print an empty frame rather
    # than failing on u[0].
    ranked = sorted(u[0].items(), key=lambda x: x[1], reverse=True) if u else []
    d = pd.DataFrame.from_records(ranked)
    print(d)
    
In [527]:
    
# Driver cell: print the TF-IDF tag vector requested for movie id '2231'.
getItemVector('2231')
# Disabled alternatives: prediction runs for user '320'.
#getUserPredictions('320')
#getWeighUserPredictions('320')