In [39]:
from collections import Counter, defaultdict
import numpy as np

In [46]:
def cosin_sim(v, w):
    return np.dot(v, w) / np.math.sqrt(np.dot(v, v) * np.dot(w, w))

cosin_sim([1, 1], [1, 1])


Out[46]:
1.0

In [6]:
users_interests = [
    ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'],
    ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
    ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'],
    ['R', 'Python', 'statistics', 'regression', 'probability'],
    ['machine learning', 'regression', 'decision trees', 'libsvm'],
    ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'],
    ['statistics', 'probability', 'mathematics', 'theory'],
    ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'],
    ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'],
    ['Hadoop', 'Java', 'MapReduce', 'Big Data'],
    ['statistics', 'R', 'statsmodels'],
    ['C++', 'deep learning', 'artificial intelligece', 'probability'],
    ['pandas', 'R', 'Python'],
    ['databases', 'HBase', 'Postgres', 'MySQL', 'MongoDB'],
    ['libsvm', 'regression', 'support vector machines']
]

In [8]:
popular_interests = Counter(interest for user_interests in users_interests 
                            for interest in user_interests).most_common()
popular_interests


Out[8]:
[('Python', 4),
 ('R', 4),
 ('Java', 3),
 ('regression', 3),
 ('statistics', 3),
 ('probability', 3),
 ('HBase', 3),
 ('Big Data', 3),
 ('neural networks', 2),
 ('Hadoop', 2),
 ('deep learning', 2),
 ('pandas', 2),
 ('libsvm', 2),
 ('C++', 2),
 ('Postgres', 2),
 ('MongoDB', 2),
 ('scikit-learn', 2),
 ('machine learning', 2),
 ('statsmodels', 2),
 ('Cassandra', 2),
 ('NoSQL', 1),
 ('Mahout', 1),
 ('Storm', 1),
 ('MySQL', 1),
 ('programming languages', 1),
 ('Haskell', 1),
 ('mathematics', 1),
 ('Spark', 1),
 ('numpy', 1),
 ('artificial intelligence', 1),
 ('theory', 1),
 ('decision trees', 1),
 ('MapReduce', 1),
 ('scipy', 1),
 ('databases', 1),
 ('artificial intelligece', 1),
 ('support vector machines', 1)]

In [14]:
def most_popular_new_interests(user_interests, max_results=5):
    suggestions = [(interest, frequency) 
                   for interest, frequency in popular_interests 
                   if interest not in user_interests]
    return suggestions[:max_results]

In [15]:
most_popular_new_interests(users_interests[1], 5)


Out[15]:
[('Python', 4), ('R', 4), ('Java', 3), ('regression', 3), ('statistics', 3)]

In [30]:
def cosine_similiarty(v, w):
    return np.dot(v, w) / np.math.sqrt(np.dot(v, v) * np.dot(w, w))

In [31]:
unique_interests = sorted(list({ interest for user_interests in users_interests
                               for interest in user_interests}))
unique_interests


Out[31]:
['Big Data',
 'C++',
 'Cassandra',
 'HBase',
 'Hadoop',
 'Haskell',
 'Java',
 'Mahout',
 'MapReduce',
 'MongoDB',
 'MySQL',
 'NoSQL',
 'Postgres',
 'Python',
 'R',
 'Spark',
 'Storm',
 'artificial intelligece',
 'artificial intelligence',
 'databases',
 'decision trees',
 'deep learning',
 'libsvm',
 'machine learning',
 'mathematics',
 'neural networks',
 'numpy',
 'pandas',
 'probability',
 'programming languages',
 'regression',
 'scikit-learn',
 'scipy',
 'statistics',
 'statsmodels',
 'support vector machines',
 'theory']

In [32]:
def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose ith element is 1 
    if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in user_interests else 0 for interest in unique_interests]

In [33]:
user_interest_matrix = map(make_user_interest_vector, users_interests)

In [35]:
user_similarities = [[cosine_similiarty(interest_vector_i, interest_vector_j)
                     for interest_vector_j in user_interest_matrix]
                    for interest_vector_i in user_interest_matrix]
user_similarities


Out[35]:
[[1.0,
  0.33806170189140661,
  0.0,
  0.0,
  0.0,
  0.15430334996209191,
  0.0,
  0.0,
  0.1889822365046136,
  0.56694670951384085,
  0.0,
  0.0,
  0.0,
  0.1690308509457033,
  0.0],
 [0.33806170189140661,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.59999999999999998,
  0.0],
 [0.0,
  0.0,
  1.0,
  0.18257418583505536,
  0.0,
  0.16666666666666666,
  0.0,
  0.20412414523193154,
  0.0,
  0.0,
  0.23570226039551587,
  0.0,
  0.47140452079103173,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.18257418583505536,
  1.0,
  0.22360679774997896,
  0.36514837167011072,
  0.44721359549995793,
  0.0,
  0.0,
  0.0,
  0.5163977794943222,
  0.22360679774997896,
  0.5163977794943222,
  0.0,
  0.2581988897471611],
 [0.0,
  0.0,
  0.0,
  0.22360679774997896,
  1.0,
  0.0,
  0.0,
  0.25,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.57735026918962584],
 [0.15430334996209191,
  0.0,
  0.16666666666666666,
  0.36514837167011072,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.20412414523193154,
  0.23570226039551587,
  0.20412414523193154,
  0.47140452079103173,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.44721359549995793,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.28867513459481292,
  0.25,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.20412414523193154,
  0.0,
  0.25,
  0.0,
  0.0,
  1.0,
  0.25,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.1889822365046136,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.25,
  1.0,
  0.25,
  0.0,
  0.25,
  0.0,
  0.0,
  0.0],
 [0.56694670951384085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.20412414523193154,
  0.0,
  0.0,
  0.25,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.23570226039551587,
  0.5163977794943222,
  0.0,
  0.23570226039551587,
  0.28867513459481292,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.33333333333333331,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.22360679774997896,
  0.0,
  0.20412414523193154,
  0.25,
  0.0,
  0.25,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.47140452079103173,
  0.5163977794943222,
  0.0,
  0.47140452079103173,
  0.0,
  0.0,
  0.0,
  0.0,
  0.33333333333333331,
  0.0,
  1.0,
  0.0,
  0.0],
 [0.1690308509457033,
  0.59999999999999998,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.2581988897471611,
  0.57735026918962584,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0]]

In [36]:
def most_similar_users_to(user_id):
    pairs = [(other_user_id, similarity)
            for other_user_id, similarity in
                enumerate(user_similarities[user_id])
            if user_id != other_user_id and similarity > 0]
    
    return sorted(pairs, key=lambda (_, similarity): similarity, reverse=True)

In [40]:
def user_based_suggestions(user_id, include_current_interests=False):
    # sum up the similarities
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity
    
    # convert them to a sorted list
    suggestions = sorted(suggestions.items(), key=lambda(_, weight): weight, reverse=True)
    
    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight)
               for suggestion, weight in suggestions 
                   if suggestion not in users_interests[user_id]]

In [41]:
user_based_suggestions(0)


Out[41]:
[('MapReduce', 0.56694670951384085),
 ('MongoDB', 0.50709255283710997),
 ('Postgres', 0.50709255283710997),
 ('NoSQL', 0.33806170189140661),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('programming languages', 0.15430334996209191),
 ('Python', 0.15430334996209191),
 ('Haskell', 0.15430334996209191),
 ('C++', 0.15430334996209191),
 ('R', 0.15430334996209191)]

Movie Data


In [ ]:
movie_data = [['Superman', 'Walking Dead', 'CSI'], 
              ['Superman', 'Walking Dead', 'CSI']