In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
ratings = pd.read_csv("datasets/ml-latest/ratings.csv", usecols=['movieId', 'rating'])

In [4]:
genome_scores = pd.read_csv("datasets/ml-latest/genome-scores.csv")

In [5]:
genome_tags = pd.read_csv("datasets/ml-latest/genome-tags.csv")

In [6]:
movie_names = pd.read_csv("datasets/ml-latest/movies.csv")

In [7]:
links = pd.read_csv("datasets/ml-latest/links.csv")

In [8]:
movie_tags_in_text = pd.merge(genome_scores, genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]

Determine a good tag relevancy score cut-off


In [9]:
%matplotlib inline

In [10]:
genome_scores.head()


Out[10]:
movieId tagId relevance
0 1 1 0.02400
1 1 2 0.02400
2 1 3 0.05475
3 1 4 0.09200
4 1 5 0.14825

In [11]:
genome_scores[:100000].pivot(index='movieId', columns='tagId')['relevance'].head()


Out[11]:
tagId 1 2 3 4 5 6 7 8 9 10 ... 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128
movieId
1 0.02400 0.02400 0.05475 0.09200 0.14825 0.21500 0.06625 0.27025 0.26050 0.03025 ... 0.03650 0.01800 0.04525 0.03275 0.12450 0.04175 0.02000 0.03475 0.08350 0.02525
2 0.03800 0.04175 0.03700 0.04875 0.11075 0.07325 0.04950 0.10775 0.10200 0.02050 ... 0.03900 0.01925 0.01725 0.02425 0.13425 0.02225 0.01600 0.01450 0.09600 0.02025
3 0.04200 0.05250 0.02725 0.07975 0.05625 0.07025 0.05975 0.18275 0.05175 0.02725 ... 0.03950 0.02625 0.02725 0.03450 0.16925 0.03525 0.01725 0.01875 0.09925 0.02000
4 0.03600 0.03850 0.03500 0.03125 0.07100 0.04500 0.02475 0.08300 0.05150 0.02975 ... 0.05375 0.03300 0.02275 0.04025 0.19600 0.05700 0.01550 0.01475 0.06625 0.01400
5 0.04075 0.05125 0.05800 0.03675 0.07575 0.12675 0.02975 0.08175 0.03075 0.02950 ... 0.04000 0.02850 0.02100 0.02650 0.15475 0.02050 0.01700 0.01575 0.11275 0.01975

5 rows × 1128 columns


In [12]:
table = genome_scores[:100000].pivot_table('relevance', index='movieId', columns='tagId', aggfunc='mean')

In [13]:
table.head()


Out[13]:
tagId 1 2 3 4 5 6 7 8 9 10 ... 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128
movieId
1 0.02400 0.02400 0.05475 0.09200 0.14825 0.21500 0.06625 0.27025 0.26050 0.03025 ... 0.03650 0.01800 0.04525 0.03275 0.12450 0.04175 0.02000 0.03475 0.08350 0.02525
2 0.03800 0.04175 0.03700 0.04875 0.11075 0.07325 0.04950 0.10775 0.10200 0.02050 ... 0.03900 0.01925 0.01725 0.02425 0.13425 0.02225 0.01600 0.01450 0.09600 0.02025
3 0.04200 0.05250 0.02725 0.07975 0.05625 0.07025 0.05975 0.18275 0.05175 0.02725 ... 0.03950 0.02625 0.02725 0.03450 0.16925 0.03525 0.01725 0.01875 0.09925 0.02000
4 0.03600 0.03850 0.03500 0.03125 0.07100 0.04500 0.02475 0.08300 0.05150 0.02975 ... 0.05375 0.03300 0.02275 0.04025 0.19600 0.05700 0.01550 0.01475 0.06625 0.01400
5 0.04075 0.05125 0.05800 0.03675 0.07575 0.12675 0.02975 0.08175 0.03075 0.02950 ... 0.04000 0.02850 0.02100 0.02650 0.15475 0.02050 0.01700 0.01575 0.11275 0.01975

5 rows × 1128 columns


In [14]:
%matplotlib inline

In [15]:
table[:1].T.hist()


Out[15]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1065d7550>]], dtype=object)

conclusion: 0.3 seems to be a good cut-off


In [16]:
movie_tags = genome_scores[genome_scores.relevance > 0.3][['movieId', 'tagId']]

Merge in tag and movie names


In [17]:
tags_to_movies = pd.merge(movie_tags, genome_tags, on='tagId', how='left')[['movieId', 'tagId']]

In [18]:
tags_to_movies['tagId'] = tags_to_movies.tagId.astype(str)

In [19]:
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

In [20]:
tags_per_movie = tags_to_movies.groupby('movieId')['tagId'].agg({
    'movie_tags': _concatenate_tags_of_movie
}).reset_index()

In [21]:
avg_ratings = ratings.groupby('movieId')['rating'].agg({
    'rating_mean': 'mean',
    'rating_median': 'median',
    'num_ratings': 'size'
}).reset_index()

In [22]:
movies_with_ratings = pd.merge(movie_names, avg_ratings, how='left', on='movieId')

In [23]:
dataset = pd.merge(movies_with_ratings, tags_per_movie, how='left', on='movieId')

In [24]:
dataset.rename(columns={'median': 'rating_median', 'mean': 'rating_mean', 'tagId': 'movie_tags'}, inplace=True)

Extracting movie year from title


In [25]:
import re

def extract_year_from_movie_title(movie_title):
    matches = re.findall(r'\d{4}', movie_title)
    if len(matches) > 1:
        return int(matches[-1])
    if len(matches) < 1:
        return np.nan
    return int(matches[0])

In [26]:
dataset['year'] = dataset.title.apply(extract_year_from_movie_title)

In [27]:
dataset.head()


Out[27]:
movieId title genres num_ratings rating_median rating_mean movie_tags year
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 63469.0 4.0 3.889300 113 93 1071 745 881 186 1025 464 588 355 942 1... 1995.0
1 2 Jumanji (1995) Adventure|Children|Fantasy 25045.0 3.0 3.229527 113 745 881 186 694 20 464 588 355 314 22 664 ... 1995.0
2 3 Grumpier Old Men (1995) Comedy|Romance 15381.0 3.0 3.178142 1071 374 846 902 919 629 469 609 464 807 1057 ... 1995.0
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance 2961.0 3.0 2.879433 864 374 846 425 602 900 388 807 464 107 726 97... 1995.0
4 5 Father of the Bride Part II (1995) Comedy 15023.0 3.0 3.080410 1040 157 926 1071 204 864 374 334 902 694 919 ... 1995.0

There are movies without tags


In [28]:
dataset[dataset.movie_tags.isnull()].head()


Out[28]:
movieId title genres num_ratings rating_median rating_mean movie_tags year
50 51 Guardian Angel (1994) Action|Drama|Thriller 34.0 3.00 2.588235 NaN 1994.0
107 109 Headless Body in Topless Bar (1995) Comedy|Drama|Thriller 18.0 2.25 2.333333 NaN 1995.0
113 115 Happiness Is in the Field (Bonheur est dans le... Comedy 50.0 4.00 3.380000 NaN 1995.0
122 124 Star Maker, The (Uomo delle stelle, L') (1995) Drama 335.0 4.00 3.489552 NaN 1995.0
125 127 Silences of the Palace, The (Saimt el Qusur) (... Drama 51.0 3.00 3.215686 NaN 1994.0

There are movies without ratings


In [29]:
dataset[dataset.rating_mean.isnull()].head()


Out[29]:
movieId title genres num_ratings rating_median rating_mean movie_tags year
8531 25981 Man on a Tightrope (1953) Drama NaN NaN NaN NaN 1953.0
9322 27396 Gentleman's Game, A (2002) Drama NaN NaN NaN NaN 2002.0
9779 31797 White Banners (1938) Drama NaN NaN NaN NaN 1938.0
13307 65078 Jane Austen in Manhattan (1980) Drama|Romance NaN NaN NaN NaN 1980.0
13510 66622 His Private Secretary (1933) Comedy|Romance NaN NaN NaN NaN 1933.0

Conclusion: These cannot be related to other movies due to lack of features (tags), they could be presented as a "random recommendation" solution

Bag of words vectorization


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
dataset_with_tags = dataset[~dataset.movie_tags.isnull()].reset_index(drop=True)

In [33]:
bag_of_words = CountVectorizer()

In [34]:
tags_as_descriptors = [' '.join(genome_tags.tagId.astype(str))]

In [35]:
movies_described_bag_of_words = bag_of_words.fit_transform(dataset_with_tags.movie_tags)

In [36]:
df_bag_m2m = pd.DataFrame(cosine_similarity(movies_described_bag_of_words))

Tf-Idf Vectorization


In [37]:
tf_idf = TfidfVectorizer()

In [38]:
movies_tf_idf_described = tf_idf.fit_transform(dataset_with_tags.movie_tags)

In [39]:
m2m = cosine_similarity(movies_tf_idf_described)

In [40]:
df_tfidf_m2m = pd.DataFrame(cosine_similarity(movies_tf_idf_described))

In [41]:
df_tfidf_m2m.head()


Out[41]:
0 1 2 3 4 5 6 7 8 9 ... 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673
0 1.000000 0.431993 0.159781 0.141310 0.216984 0.256402 0.219518 0.255039 0.084687 0.251235 ... 0.253995 0.245707 0.298894 0.218980 0.243204 0.228843 0.338929 0.390264 0.527402 0.210074
1 0.431993 1.000000 0.107847 0.120763 0.237059 0.120485 0.181958 0.282949 0.188213 0.234384 ... 0.365564 0.130227 0.221158 0.171833 0.172612 0.160411 0.251052 0.265634 0.299784 0.181168
2 0.159781 0.107847 1.000000 0.260063 0.430876 0.118901 0.369714 0.206708 0.067116 0.175896 ... 0.151341 0.135469 0.134628 0.093538 0.102329 0.123552 0.142192 0.124184 0.133702 0.096047
3 0.141310 0.120763 0.260063 1.000000 0.263465 0.088910 0.374638 0.244020 0.077906 0.075830 ... 0.144143 0.136691 0.119444 0.108307 0.110534 0.113956 0.159768 0.165774 0.165368 0.079979
4 0.216984 0.237059 0.430876 0.263465 1.000000 0.050463 0.446251 0.182284 0.108008 0.137105 ... 0.180975 0.204900 0.114578 0.107630 0.137742 0.105655 0.173343 0.187519 0.182170 0.098338

5 rows × 10674 columns

Match indices to movie IDs


In [42]:
index_to_movie_id = dataset_with_tags['movieId']

In [43]:
index_to_movie_id[0]


Out[43]:
1

In [44]:
index_to_movie_id[10665]


Out[44]:
160980

In [45]:
dataset_with_tags.reset_index(drop=True).ix[50]


Out[45]:
movieId                                                         52
title                                      Mighty Aphrodite (1995)
genres                                        Comedy|Drama|Romance
num_ratings                                                  10277
rating_median                                                    4
rating_mean                                                3.53741
movie_tags       829 335 1071 745 845 297 704 609 464 726 1062 ...
year                                                          1995
Name: 50, dtype: object

In [46]:
df_tfidf_m2m.columns = [str(index_to_movie_id[int(col)]) for col in df_tfidf_m2m.columns]

In [47]:
df_tfidf_m2m.index = [index_to_movie_id[idx] for idx in df_tfidf_m2m.index]

In [48]:
df_tfidf_m2m.head()


Out[48]:
1 2 3 4 5 6 7 8 9 10 ... 160954 160980 161131 161354 161582 161634 162350 162376 162578 162600
1 1.000000 0.431993 0.159781 0.141310 0.216984 0.256402 0.219518 0.255039 0.084687 0.251235 ... 0.253995 0.245707 0.298894 0.218980 0.243204 0.228843 0.338929 0.390264 0.527402 0.210074
2 0.431993 1.000000 0.107847 0.120763 0.237059 0.120485 0.181958 0.282949 0.188213 0.234384 ... 0.365564 0.130227 0.221158 0.171833 0.172612 0.160411 0.251052 0.265634 0.299784 0.181168
3 0.159781 0.107847 1.000000 0.260063 0.430876 0.118901 0.369714 0.206708 0.067116 0.175896 ... 0.151341 0.135469 0.134628 0.093538 0.102329 0.123552 0.142192 0.124184 0.133702 0.096047
4 0.141310 0.120763 0.260063 1.000000 0.263465 0.088910 0.374638 0.244020 0.077906 0.075830 ... 0.144143 0.136691 0.119444 0.108307 0.110534 0.113956 0.159768 0.165774 0.165368 0.079979
5 0.216984 0.237059 0.430876 0.263465 1.000000 0.050463 0.446251 0.182284 0.108008 0.137105 ... 0.180975 0.204900 0.114578 0.107630 0.137742 0.105655 0.173343 0.187519 0.182170 0.098338

5 rows × 10674 columns


In [49]:
df_tfidf_m2m.tail()


Out[49]:
1 2 3 4 5 6 7 8 9 10 ... 160954 160980 161131 161354 161582 161634 162350 162376 162578 162600
161634 0.228843 0.160411 0.123552 0.113956 0.105655 0.384652 0.118860 0.114861 0.092883 0.189042 ... 0.185566 0.168822 0.256561 0.194614 0.274796 1.000000 0.258832 0.450956 0.319277 0.097457
162350 0.338929 0.251052 0.142192 0.159768 0.173343 0.355809 0.191053 0.226337 0.164536 0.232459 ... 0.275451 0.195042 0.404297 0.320394 0.381101 0.258832 1.000000 0.367129 0.316950 0.253319
162376 0.390264 0.265634 0.124184 0.165774 0.187519 0.451733 0.160788 0.161037 0.083897 0.196478 ... 0.269517 0.265497 0.398473 0.213949 0.455381 0.450956 0.367129 1.000000 0.555852 0.204455
162578 0.527402 0.299784 0.133702 0.165368 0.182170 0.340640 0.183740 0.229345 0.099949 0.177231 ... 0.222804 0.279660 0.314855 0.246288 0.333284 0.319277 0.316950 0.555852 1.000000 0.209276
162600 0.210074 0.181168 0.096047 0.079979 0.098338 0.069668 0.069561 0.117470 0.121824 0.128960 ... 0.208857 0.116181 0.166620 0.222607 0.126180 0.097457 0.253319 0.204455 0.209276 1.000000

5 rows × 10674 columns

Check similarities


In [50]:
df_tfidf_m2m.ix[1].sort_values(ascending=False)[:20]


Out[50]:
1         1.000000
3114      0.736535
4886      0.724898
78499     0.720759
2355      0.714265
76093     0.686358
5218      0.672731
68954     0.648925
6377      0.647981
4306      0.641639
50872     0.635019
8961      0.626673
2761      0.608781
81847     0.604853
45517     0.602484
152081    0.601611
42191     0.599953
97913     0.594801
98491     0.591688
108932    0.590483
Name: 1, dtype: float64

In [51]:
dataset_with_tags[dataset_with_tags.movieId == 1]


Out[51]:
movieId title genres num_ratings rating_median rating_mean movie_tags year
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 63469.0 4.0 3.8893 113 93 1071 745 881 186 1025 464 588 355 942 1... 1995.0

In [52]:
dataset_with_tags[dataset_with_tags.movieId == 3114]


Out[52]:
movieId title genres num_ratings rating_median rating_mean movie_tags year
2769 3114 Toy Story 2 (1999) Adventure|Animation|Children|Comedy|Fantasy 26904.0 4.0 3.820844 1071 745 186 464 588 355 1062 664 244 455 128 ... 1999.0

In [53]:
dataset_with_tags[dataset_with_tags.movieId == 4886]


Out[53]:
movieId title genres num_ratings rating_median rating_mean movie_tags year
4331 4886 Monsters, Inc. (2001) Adventure|Animation|Children|Comedy|Fantasy 31089.0 4.0 3.863633 113 1071 745 186 464 588 355 1062 755 22 372 6... 2001.0

In [54]:
dataset_with_tags[dataset_with_tags.movieId == 78499]


Out[54]:
movieId title genres num_ratings rating_median rating_mean movie_tags year
9070 78499 Toy Story 3 (2010) Adventure|Animation|Children|Comedy|Fantasy|IMAX 10963.0 4.0 3.910244 1071 864 745 881 186 845 464 588 355 1062 755 ... 2010.0

The closest movies to Toy Story 1 are the sequels and Monsters Inc! (No sh*t sherlock)

Stacking for writing to DB


In [55]:
m2m_similarity_stacked = df_tfidf_m2m.stack().reset_index()
m2m_similarity_stacked.columns = ['first_movie', 'second_movie', 'similarity_score']

In [56]:
m2m_similarity_stacked.head()


Out[56]:
first_movie second_movie similarity_score
0 1 1 1.000000
1 1 2 0.431993
2 1 3 0.159781
3 1 4 0.141310
4 1 5 0.216984

In [57]:
m2m_similarity_stacked.tail()


Out[57]:
first_movie second_movie similarity_score
113934271 162600 161634 0.097457
113934272 162600 162350 0.253319
113934273 162600 162376 0.204455
113934274 162600 162578 0.209276
113934275 162600 162600 1.000000

Writing to DB


In [58]:
import sqlite3 as db

In [59]:
connection = db.connect('db.sqlite3')

In [60]:
for_db = dataset.rename(columns={
    'movieId': 'movie_id'
})[['movie_id', 'title', 'year', 'genres', 'num_ratings', 'rating_median', 'rating_mean']]

In [61]:
for_db['relatable'] = True

In [62]:
for_db.head()


Out[62]:
movie_id title year genres num_ratings rating_median rating_mean relatable
0 1 Toy Story (1995) 1995.0 Adventure|Animation|Children|Comedy|Fantasy 63469.0 4.0 3.889300 True
1 2 Jumanji (1995) 1995.0 Adventure|Children|Fantasy 25045.0 3.0 3.229527 True
2 3 Grumpier Old Men (1995) 1995.0 Comedy|Romance 15381.0 3.0 3.178142 True
3 4 Waiting to Exhale (1995) 1995.0 Comedy|Drama|Romance 2961.0 3.0 2.879433 True
4 5 Father of the Bride Part II (1995) 1995.0 Comedy 15023.0 3.0 3.080410 True

In [63]:
from tqdm import tqdm

In [64]:
total_length = len(for_db)
step = int(total_length / 100)

with tqdm(total=total_length) as pbar:
    for i in range(0, total_length, step):
        subset = for_db[i: i+step]
        subset.to_sql('movie_time_app_movie', connection, if_exists='append', index=False)
        pbar.update(step)


40501it [00:01, 38515.09it/s]                           

In [65]:
pd.read_sql_query('SELECT * FROM movie_time_app_movie LIMIT 5', connection)


Out[65]:
movie_id title poster year genres num_ratings rating_median rating_mean relatable
0 1 Toy Story (1995) None 1995 Adventure|Animation|Children|Comedy|Fantasy 63469 4.0 3.889300 1
1 2 Jumanji (1995) None 1995 Adventure|Children|Fantasy 25045 3.0 3.229527 1
2 3 Grumpier Old Men (1995) None 1995 Comedy|Romance 15381 3.0 3.178142 1
3 4 Waiting to Exhale (1995) None 1995 Comedy|Drama|Romance 2961 3.0 2.879433 1
4 5 Father of the Bride Part II (1995) None 1995 Comedy 15023 3.0 3.080410 1