In [39]:
from collections import Counter, defaultdict
import numpy as np
In [46]:
def cosin_sim(v, w):
    """Return the cosine similarity of two equal-length numeric vectors.

    Computed as ``dot(v, w) / sqrt(dot(v, v) * dot(w, w))``.

    Args:
        v: first vector (any sequence accepted by ``np.dot``).
        w: second vector, same length as ``v``.

    Returns:
        The cosine of the angle between ``v`` and ``w``, or ``0.0`` when
        either vector has zero length (the angle is undefined there, and a
        plain division would produce nan).

    Note:
        This notebook also defines ``cosine_similiarty`` with the same formula.
    """
    # np.sqrt replaces np.math.sqrt: ``np.math`` was only an undocumented
    # alias of the stdlib ``math`` module and is not available on newer NumPy.
    norm_product = np.sqrt(np.dot(v, v) * np.dot(w, w))
    if norm_product == 0:
        return 0.0
    return np.dot(v, w) / norm_product


# Sanity check: identical vectors have similarity 1.
cosin_sim([1, 1], [1, 1])
Out[46]:
In [6]:
# One list of interest labels per user; a user's id is their index in this list.
# Labels are compared by exact string equality, so spelling must be consistent
# across users ('artificial intelligence' was previously misspelled for user 11,
# which made it count as a separate interest).
users_interests = [
    ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'],
    ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
    ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'],
    ['R', 'Python', 'statistics', 'regression', 'probability'],
    ['machine learning', 'regression', 'decision trees', 'libsvm'],
    ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'],
    ['statistics', 'probability', 'mathematics', 'theory'],
    ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'],
    ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'],
    ['Hadoop', 'Java', 'MapReduce', 'Big Data'],
    ['statistics', 'R', 'statsmodels'],
    ['C++', 'deep learning', 'artificial intelligence', 'probability'],
    ['pandas', 'R', 'Python'],
    ['databases', 'HBase', 'Postgres', 'MySQL', 'MongoDB'],
    ['libsvm', 'regression', 'support vector machines'],
]
In [8]:
# Tally how many users list each interest, then keep the
# (interest, count) pairs ordered from most to least common.
interest_counts = Counter(
    topic
    for interests in users_interests
    for topic in interests
)
popular_interests = interest_counts.most_common()
popular_interests
Out[8]:
In [14]:
def most_popular_new_interests(user_interests, max_results=5):
    """Suggest popular interests that the user does not already list.

    Walks the module-level ``popular_interests`` sequence of
    ``(interest, frequency)`` pairs in its existing order, keeps the pairs
    whose interest is absent from ``user_interests``, and returns the first
    ``max_results`` of them.
    """
    unseen = []
    for name, count in popular_interests:
        if name in user_interests:
            continue
        unseen.append((name, count))
    return unseen[:max_results]
In [15]:
# Top five popular interests that user 1 does not already have.
most_popular_new_interests(users_interests[1], max_results=5)
Out[15]:
In [30]:
def cosine_similiarty(v, w):
    """Return the cosine similarity of two equal-length numeric vectors.

    Computed as ``dot(v, w) / sqrt(dot(v, v) * dot(w, w))``.

    Args:
        v: first vector (any sequence accepted by ``np.dot``).
        w: second vector, same length as ``v``.

    Returns:
        The cosine of the angle between ``v`` and ``w``, or ``0.0`` when
        either vector has zero length (the angle is undefined there, and a
        plain division would produce nan).
    """
    # np.sqrt replaces np.math.sqrt: ``np.math`` was only an undocumented
    # alias of the stdlib ``math`` module and is not available on newer NumPy.
    norm_product = np.sqrt(np.dot(v, v) * np.dot(w, w))
    if norm_product == 0:
        return 0.0
    return np.dot(v, w) / norm_product
In [31]:
# Every distinct interest across all users, in sorted order. The position of
# an interest in this list is its coordinate in make_user_interest_vector.
unique_interests = sorted(
    set(topic for interests in users_interests for topic in interests)
)
unique_interests
Out[31]:
In [32]:
def make_user_interest_vector(user_interests):
    """Encode a list of interests as a 0/1 vector aligned with ``unique_interests``.

    Element ``i`` of the result is 1 when ``unique_interests[i]`` appears in
    ``user_interests`` and 0 otherwise.
    """
    vector = []
    for known_interest in unique_interests:
        vector.append(int(known_interest in user_interests))
    return vector
In [33]:
# One 0/1 interest vector per user, in user-id order. Materialised as a list
# because the similarity computation iterates over it in a nested loop; on
# Python 3 a bare ``map`` is a single-pass iterator and would be exhausted
# after the first inner pass, silently producing a truncated matrix.
user_interest_matrix = list(map(make_user_interest_vector, users_interests))
In [35]:
# Pairwise similarity table: entry [i][j] is the cosine similarity between
# the interest vectors of user i and user j.
user_similarities = [
    [
        cosine_similiarty(row_vector, column_vector)
        for column_vector in user_interest_matrix
    ]
    for row_vector in user_interest_matrix
]
user_similarities
Out[35]:
In [36]:
def most_similar_users_to(user_id):
    """Return the users most similar to ``user_id``, best match first.

    Reads row ``user_id`` of the module-level ``user_similarities`` table.

    Args:
        user_id: index of the user to find neighbours for.

    Returns:
        A list of ``(other_user_id, similarity)`` pairs sorted by similarity
        in descending order. The user itself and any user whose similarity
        is not greater than 0 are left out.
    """
    pairs = [(other_user_id, similarity)
             for other_user_id, similarity in enumerate(user_similarities[user_id])
             if user_id != other_user_id and similarity > 0]
    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
In [40]:
def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for a user based on what similar users like.

    Each interest held by a similar user (as returned by
    ``most_similar_users_to``) is weighted by the sum of the similarities of
    the users who hold it.

    Args:
        user_id: index of the user to produce suggestions for.
        include_current_interests: when true, interests the user already has
            are kept in the result; otherwise they are filtered out.

    Returns:
        A list of ``(interest, weight)`` pairs sorted by weight, highest first.
    """
    # Sum up the similarities per interest.
    weights = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    # Convert to a list sorted by weight. The key indexes into the pair rather
    # than unpacking it in the lambda signature, which Python 3 does not allow.
    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    # Optionally drop interests the user already has.
    if include_current_interests:
        return ranked
    current_interests = users_interests[user_id]
    return [(suggestion, weight)
            for suggestion, weight in ranked
            if suggestion not in current_interests]
In [41]:
# New-interest suggestions for user 0 (current interests excluded by default).
user_based_suggestions(user_id=0)
Out[41]:
In [ ]:
movie_data = [['Superman', 'Walking Dead', 'CSI'],
['Superman', 'Walking Dead', 'CSI']