In [364]:
'''
Created on Feb 21, 2017
@author: mmoham12
'''
import csv
import collections as c
import math
import operator
import numpy as np
import pandas as pd
import collections as c
from scipy import spatial
In [439]:
# --- Configuration ----------------------------------------------------------
# Default user of interest and the locations of the three input CSV files.
gUserID = '30838'
basePath = ""  # e.g. "E:\\workspaces\\RecommSys_A2\\data\\"
tagsFile = basePath + "movie-tags.csv"
titlesFile = basePath + "movie-titles.csv"
ratingFile = basePath + "ratings.csv"

# --- Load data --------------------------------------------------------------
# Each file is read inside a `with` block so its handle is closed as soon as
# the rows are in memory (the previous version left all three files open).
# newline="" is the mode the csv module asks for when it is given a file object.

# Rows of the tag file; later cells unpack every row as (movie, tag).
with open(tagsFile, encoding="ISO-8859-1", newline="") as tagsHandle:
    reader = csv.reader(tagsHandle)
    items = list(reader)

# The same tag file as a DataFrame indexed by movie id.
itemspd = pd.read_csv(tagsFile, encoding='iso-8859-1', header=None, index_col='movie', names=['movie', 'tag'])

with open(titlesFile, encoding="ISO-8859-1", newline="") as titlesHandle:
    reader = csv.reader(titlesHandle)
    titles = list(reader)

# Rating rows: userID, movieID, rating (all kept as strings).
with open(ratingFile, encoding="ISO-8859-1", newline="") as ratingsHandle:
    reader = csv.reader(ratingsHandle)
    ratings = list(reader)

# Quick sanity check on how many rows were read.
print(len(ratings))
print(len(items))
In [440]:
# Raw tag counts per movie (term frequency):
#   {movieID: {tag: count, ...}, ...}
itemCounts = {}
# Number of distinct movies each tag was applied to (document frequency):
#   {tag: movieCount, ...}
docCount = {}

for movie, tag in items:
    # Every tag gets a float entry in the document-frequency table.
    docCount.setdefault(tag, 0.0)
    # Fetch (or create) the tag vector of this movie.
    movieTags = itemCounts.setdefault(movie, {})
    seenBefore = tag in movieTags
    # Term frequency: one more application of this tag to this movie.
    movieTags[tag] = movieTags[tag] + 1 if seenBefore else 1.0
    # A tag counts toward the document frequency only the first time it is
    # seen on a given movie.
    docCount[tag] += 0 if seenBefore else 1

# log(N), where N is the number of distinct movies that carry at least one tag.
logN = math.log(len(itemCounts))
print(len(itemCounts))
print(logN)
In [441]:
# Inverse document frequency of every tag:
#   log(N / df) written as log(N) - log(df),
# where logN comes from the previous cell and df is the tag's entry in docCount.
docFreq = {
    tagName: logN - math.log(movieCount)
    for tagName, movieCount in docCount.items()
}

# Spot check: print the IDF of a few known tags.
print('item 4==========================')
for sampleTag in ('CLV', 'characters', 'chick flick', 'revenge'):
    print(docFreq[sampleTag])
In [515]:
# Build the unit-length TF-IDF vector of every movie:
#   itemVectors = {movieID: {tag: tfidf / ||tfidf||, ...}, ...}
itemVectors = {}
for movie, tagVector in itemCounts.items():
    # TF-IDF = TF * IDF, where count is the TF and docFreq[tag] is the IDF.
    tfidfByTag = {tag: count * docFreq[tag] for tag, count in tagVector.items()}

    # Euclidean length of the TF-IDF vector.
    vectLen = 0.0
    for tfidf in tfidfByTag.values():
        vectLen += math.pow(tfidf, 2)
    vectLen = math.sqrt(vectLen)

    if vectLen > 0.0:
        iv = {tag: tfidf / vectLen for tag, tfidf in tfidfByTag.items()}
    else:
        # Every tag of this movie has IDF 0, so there is no direction to
        # normalise. Keep the tags with zero weight instead of dividing by zero.
        iv = {tag: 0.0 for tag in tfidfByTag}
    itemVectors[movie] = iv

print(len(docFreq))
def getItemVector(docID):
    """Print the tag vector of one movie, heaviest tags first.

    Looks up ``docID`` in the notebook-level ``itemVectors`` dictionary and
    prints a one-column DataFrame (column ``value``, indexed by tag) sorted by
    weight in descending order.

    Parameters
    ----------
    docID : str
        Movie id, as used for the keys of ``itemVectors``.

    Raises
    ------
    KeyError
        If ``docID`` has no entry in ``itemVectors``.
    """
    # Use the requested movie (the id used to be hard-coded to '2231'), and
    # read its vector directly instead of building a DataFrame of all movies.
    weights = pd.Series(itemVectors[docID], dtype=float)
    d = pd.DataFrame({'value': weights})
    d = d.sort_values('value', ascending=False)
    print(d)
In [503]:
def buildUserProfiles(userID):
    """Build unweighted tag profiles from the movies each user liked.

    A user's profile is the sum of the item vectors (from the notebook-level
    ``itemVectors``) of every movie that user rated 3.5 or higher. Ratings are
    read from the notebook-level ``ratings`` list of
    ``[userID, movieID, rating]`` rows.

    Parameters
    ----------
    userID : str
        Build the profile of this user only; pass ``''`` to build the
        profiles of all users.

    Returns
    -------
    dict
        ``{userID: {tag: summedWeight, ...}, ...}``. A user with ratings but
        no liked, tagged movie maps to an empty dict; a user with no ratings
        at all is absent.
    """
    # Filter before sorting. The sort is stable, so the resulting row order
    # is the same as sorting everything and filtering afterwards.
    if userID != '':
        rows = [row for row in ratings if row[0] == userID]
    else:
        rows = ratings
    rows = sorted(rows, key=lambda row: row[0])

    userProfiles = {}
    for user, movie, rate in rows:
        userTagVector = userProfiles.setdefault(user, {})
        # Only movies rated 3.5 or above, and only those that have tags.
        if float(rate) >= 3.5 and movie in itemVectors:
            for tname, qt in itemVectors[movie].items():
                userTagVector[tname] = userTagVector.get(tname, 0.0) + float(qt)

    # The former trailing lookup userProfiles[userID] was dead code and raised
    # KeyError in all-users mode (userID == ''), so it has been removed.
    return userProfiles
In [404]:
def calculateScores(u_profiles, m_items, docID):
    """Score movies for users by cosine similarity of their tag vectors.

    Parameters
    ----------
    u_profiles : dict
        ``{userID: {tag: weight, ...}, ...}``
    m_items : dict
        ``{movieID: {tag: weight, ...}, ...}``
    docID : str
        Score only this movie; pass ``''`` to score every movie in
        ``m_items``. An id that is not in ``m_items`` yields no scores.

    Returns
    -------
    dict
        ``{userID: {movieID: score, ...}, ...}``. The score is 0.0 whenever
        the user vector or the item vector has zero length.
    """
    # Decide once which movies are scored, instead of scanning and skipping
    # the whole catalogue for every user.
    if docID == '':
        candidates = m_items
    elif docID in m_items:
        candidates = {docID: m_items[docID]}
    else:
        candidates = {}

    # The squared length of an item vector does not depend on the user, so
    # compute it once per movie.
    itemSquares = {}
    for movieID, itemTagVector in candidates.items():
        qs = 0.0
        for weight in itemTagVector.values():
            qs += math.pow(weight, 2)
        itemSquares[movieID] = qs

    m_itemscores = {}
    for userID, userTagVector in u_profiles.items():
        # Likewise the squared length of the user vector is the same for
        # every movie, so compute it once per user.
        ps = 0
        for weight in userTagVector.values():
            ps += math.pow(weight, 2)

        userScores = {}
        for movieID, itemTagVector in candidates.items():
            qs = itemSquares[movieID]
            if ps <= 0.0 or qs <= 0.0:
                score = 0.0
            else:
                # Dot product over the user's tags; a tag the movie lacks
                # contributes 0.
                p = 0
                for tag, pt in userTagVector.items():
                    p += pt * itemTagVector.get(tag, 0)
                score = p / (math.sqrt(ps) * math.sqrt(qs))
            userScores[movieID] = score
        m_itemscores[userID] = userScores
    return m_itemscores
In [520]:
def getUserPredictions(userID):
    """Print every movie's score for a user, based on the unweighted profile.

    Builds the profile with ``buildUserProfiles``, scores all movies in the
    notebook-level ``itemVectors`` with ``calculateScores`` and prints a
    DataFrame of ``(movieID, score)`` rows, best score first.

    Parameters
    ----------
    userID : str
        User to rank movies for. If the scorer returns several users, only
        the first one is printed.
    """
    up = buildUserProfiles(userID)
    # calculateScores is the scorer defined in this notebook; the previous
    # call target, calculateScores2, is not defined in it.
    unweightedScores = calculateScores(up, itemVectors, '')
    # First user's scores, or nothing when no profile could be built
    # (indexing [0] on an empty result used to raise IndexError).
    firstUserScores = next(iter(unweightedScores.values()), {})
    ranked = sorted(firstUserScores.items(), key=lambda pair: pair[1], reverse=True)
    d = pd.DataFrame.from_records(ranked)
    print(d)
In [430]:
def buildWightedProfile(userID):
    """Build rating-weighted tag profiles.

    Every movie a user rated contributes its item vector (from the
    notebook-level ``itemVectors``) multiplied by
    ``rating - mean rating of that user``. Movies rated above the user's
    average therefore add to a tag's weight and movies rated below it
    subtract from it. Ratings are read from the notebook-level ``ratings``
    list of ``[userID, movieID, rating]`` rows.

    Parameters
    ----------
    userID : str
        Build the profile of this user only; pass ``''`` to build the
        profiles of all users.

    Returns
    -------
    dict
        ``{userID: {tag: weight, ...}, ...}``; empty when no rating row
        matches.
    """
    # Filter before sorting. The sort is stable, so the resulting row order
    # is the same as sorting everything and filtering afterwards.
    if userID != '':
        rows = [row for row in ratings if row[0] == userID]
    else:
        rows = ratings
    rows = sorted(rows, key=lambda row: row[0])

    # Mean rating per user: {userID: average}. Using a plain sum and count
    # fixes the old running mean, which was seeded with the first row and
    # then visited that row again, counting the first rating twice. It also
    # copes with an empty row list.
    totals = {}
    counts = {}
    for row in rows:
        totals[row[0]] = totals.get(row[0], 0.0) + float(row[2])
        counts[row[0]] = counts.get(row[0], 0) + 1
    userAvg = {user: totals[user] / counts[user] for user in totals}

    # One dictionary name throughout: the loop used to refer to an undefined
    # `weightedu_profiles`, which raised NameError.
    weighted_profiles = {}
    for user, movie, rate in rows:
        userTagVector = weighted_profiles.setdefault(user, {})
        if movie in itemVectors:
            # How far this rating lies above or below the user's average.
            deviation = float(rate) - userAvg[user]
            for tname, qt in itemVectors[movie].items():
                userTagVector[tname] = userTagVector.get(tname, 0) + qt * deviation
    return weighted_profiles
In [523]:
def getWeighUserPredictions(userID):
    """Print every movie's score for a user, based on the weighted profile.

    Builds the profile with ``buildWightedProfile``, scores all movies in
    the notebook-level ``itemVectors`` with ``calculateScores`` and prints a
    DataFrame of ``(movieID, score)`` rows, best score first.

    Parameters
    ----------
    userID : str
        User to rank movies for. If the scorer returns several users, only
        the first one is printed.
    """
    # Use the requested user; the id used to be hard-coded to '320'.
    up = buildWightedProfile(userID)
    weightedScores = calculateScores(up, itemVectors, '')
    # First user's scores, or nothing when no profile could be built
    # (indexing [0] on an empty result used to raise IndexError).
    firstUserScores = next(iter(weightedScores.values()), {})
    ranked = sorted(firstUserScores.items(), key=lambda pair: pair[1], reverse=True)
    d = pd.DataFrame.from_records(ranked)
    print(d)
In [527]:
# Driver cell: print the normalised TF-IDF tag vector of movie '2231'.
getItemVector('2231')
# The two prediction entry points below are currently disabled.
#getUserPredictions('320')
#getWeighUserPredictions('320')