In [1]:
    
import pandas as pd
    
In [2]:
    
import numpy as np
    
In [3]:
    
# Load only the columns needed for per-movie rating statistics.
ratings = pd.read_csv(
    "datasets/ml-latest/ratings.csv",
    usecols=['movieId', 'rating'],
)
    
In [4]:
    
# Per-(movie, tag) relevance scores from the MovieLens tag genome.
genome_scores = pd.read_csv(
    "datasets/ml-latest/genome-scores.csv",
)
    
In [5]:
    
# Dictionary mapping tagId -> human-readable tag name.
genome_tags = pd.read_csv(
    "datasets/ml-latest/genome-tags.csv",
)
    
In [6]:
    
# Movie catalogue: movieId, title (with year), genres.
movie_names = pd.read_csv(
    "datasets/ml-latest/movies.csv",
)
    
In [7]:
    
# IMDb / TMDb identifiers per movie (loaded but not used below).
links = pd.read_csv(
    "datasets/ml-latest/links.csv",
)
    
In [8]:
    
# Attach human-readable tag names to each (movie, tag) relevance score.
_scores_with_tag_names = pd.merge(genome_scores, genome_tags, on='tagId')
movie_tags_in_text = _scores_with_tag_names[['movieId', 'tag', 'relevance']]
    
In [9]:
    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
    
In [10]:
    
# Peek at the raw (movieId, tagId, relevance) score rows.
genome_scores.head()
    
    Out[10]:
In [11]:
    
# Preview a wide movie x tag matrix built from the first 100k score rows.
genome_scores.head(100000).pivot(index='movieId', columns='tagId', values='relevance').head()
    
    Out[11]:
In [12]:
    
# Wide movie x tag relevance matrix (first 100k rows; mean resolves duplicates).
table = genome_scores.head(100000).pivot_table(
    values='relevance', index='movieId', columns='tagId', aggfunc='mean'
)
    
In [13]:
    
# Inspect the pivoted relevance matrix.
table.head()
    
    Out[13]:
In [14]:
    
# (duplicate `%matplotlib inline` removed — inline rendering was already enabled above)
    
In [15]:
    
# Distribution of one movie's relevance scores across all tags,
# to eyeball a sensible relevance cut-off.
table.iloc[:1].T.hist()
    
    Out[15]:
    
Conclusion: a relevance score of 0.3 looks like a good cut-off for treating a tag as applicable to a movie.
In [16]:
    
# Keep only (movie, tag) pairs whose relevance clears the 0.3 cut-off chosen above.
movie_tags = genome_scores.loc[genome_scores.relevance > 0.3, ['movieId', 'tagId']]
    
In [17]:
    
# NOTE(review): the merge pulls in tag names, but the column selection drops
# 'tag' again immediately — assuming tagId is unique in genome_tags, this is
# equivalent to movie_tags[['movieId', 'tagId']] and the join adds nothing.
tags_to_movies = pd.merge(movie_tags, genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
    
In [18]:
    
# Tag ids become string tokens so they can be joined into text documents later.
tags_to_movies = tags_to_movies.assign(tagId=tags_to_movies.tagId.astype(str))
    
In [19]:
    
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str
    
In [20]:
    
# One text "document" of tag ids per movie.
# Fix: passing a dict to SeriesGroupBy.agg() for renaming was deprecated in
# pandas 0.20 and removed in 0.25 (raises SpecificationError); aggregate then
# rename the resulting Series instead.
tags_per_movie = (
    tags_to_movies
    .groupby('movieId')['tagId']
    .agg(_concatenate_tags_of_movie)
    .rename('movie_tags')
    .reset_index()
)
    
In [21]:
    
# Per-movie rating statistics.
# Fix: dict-based renaming in SeriesGroupBy.agg() was deprecated in pandas 0.20
# and removed in 0.25 (raises SpecificationError); use named aggregation kwargs.
avg_ratings = (
    ratings
    .groupby('movieId')['rating']
    .agg(rating_mean='mean', rating_median='median', num_ratings='size')
    .reset_index()
)
    
In [22]:
    
# Left join keeps movies that have no ratings (rating columns become NaN).
movies_with_ratings = movie_names.merge(avg_ratings, how='left', on='movieId')
    
In [23]:
    
# Left join keeps movies that have no tags (movie_tags becomes NaN).
dataset = movies_with_ratings.merge(tags_per_movie, how='left', on='movieId')
    
In [24]:
    
# Defensive rename to the canonical column names; rename() silently skips keys
# that are absent, so this is a no-op when the aggregation already named them.
# (Rebinding instead of inplace=True — same result, chain-friendly.)
dataset = dataset.rename(columns={'median': 'rating_median', 'mean': 'rating_mean', 'tagId': 'movie_tags'})
    
In [25]:
    
import re

def extract_year_from_movie_title(movie_title):
    """Return the release year embedded in a movie title, or NaN if none.

    MovieLens titles end in '(YYYY)'; taking the LAST 4-digit run means titles
    that themselves contain a year (e.g. '2046 (2004)') still resolve to the
    release year.
    """
    matches = re.findall(r'\d{4}', movie_title)
    if not matches:
        return np.nan
    return int(matches[-1])
    
In [26]:
    
# Derive the release year column from the title text.
dataset['year'] = dataset['title'].map(extract_year_from_movie_title)
    
In [27]:
    
# Assembled dataset: title, genres, rating stats, tag document, year.
dataset.head()
    
    Out[27]:
There are movies without any genome tags:
In [28]:
    
# Movies that ended up with no tag document after the relevance cut-off.
dataset[dataset['movie_tags'].isna()].head()
    
    Out[28]:
There are movies without any ratings:
In [29]:
    
# Movies that nobody has rated.
dataset[dataset['rating_mean'].isna()].head()
    
    Out[29]:
Conclusion: these movies cannot be related to other movies because they lack tag features; they could instead be served through a "random recommendation" fallback.
In [30]:
    
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    
In [31]:
    
from sklearn.metrics.pairwise import cosine_similarity
    
In [32]:
    
# Restrict to movies that have tag features; rebuild a clean positional index.
dataset_with_tags = dataset[dataset.movie_tags.notnull()].reset_index(drop=True)
    
In [33]:
    
# Raw term-count vectorizer over the space-separated tag-id documents.
bag_of_words = CountVectorizer()
    
In [34]:
    
# NOTE(review): builds a single-element list containing ALL tag ids joined into
# one string; never used afterwards — looks like a leftover experiment and is a
# candidate for removal.
tags_as_descriptors = [' '.join(genome_tags.tagId.astype(str))]
    
In [35]:
    
# Sparse count matrix: one row per movie, one column per tag id token.
movies_described_bag_of_words = bag_of_words.fit_transform(dataset_with_tags.movie_tags)
    
In [36]:
    
# Movie-to-movie cosine similarity in raw count space (for comparison with TF-IDF).
df_bag_m2m = pd.DataFrame(cosine_similarity(movies_described_bag_of_words))
    
In [37]:
    
# TF-IDF vectorizer: down-weights tags shared by many movies.
tf_idf = TfidfVectorizer()
    
In [38]:
    
# Sparse TF-IDF matrix: one row per movie, one column per tag id token.
movies_tf_idf_described = tf_idf.fit_transform(dataset_with_tags.movie_tags)
    
In [39]:
    
# Dense movie-to-movie cosine similarity matrix in TF-IDF space.
m2m = cosine_similarity(movies_tf_idf_described)
    
In [40]:
    
# Wrap the similarity matrix in a DataFrame. Fix: reuse `m2m` computed in the
# previous cell instead of recomputing cosine_similarity over the full TF-IDF
# matrix a second time (an identical O(n^2 * d) computation).
df_tfidf_m2m = pd.DataFrame(m2m)
    
In [41]:
    
# Similarity matrix still uses positional indices at this point.
df_tfidf_m2m.head()
    
    Out[41]:
In [42]:
    
# Series mapping positional row index -> movieId (same order as the matrix rows).
index_to_movie_id = dataset_with_tags['movieId']
    
In [43]:
    
# movieId of the first matrix row.
index_to_movie_id[0]
    
    Out[43]:
In [44]:
    
# Spot check: movieId at an arbitrary position (10665).
index_to_movie_id[10665]
    
    Out[44]:
In [45]:
    
# Fix: .ix was deprecated in pandas 0.20 and removed in 1.0. The frame has a
# fresh RangeIndex after reset_index, so positional .iloc reproduces the lookup.
dataset_with_tags.reset_index(drop=True).iloc[50]
    
    Out[45]:
In [46]:
    
# Relabel columns from positional indices to the corresponding movieIds (as strings).
df_tfidf_m2m.columns = [str(index_to_movie_id[int(position)]) for position in df_tfidf_m2m.columns]
    
In [47]:
    
# Relabel rows from positional indices to the corresponding movieIds.
df_tfidf_m2m.index = [index_to_movie_id[position] for position in df_tfidf_m2m.index]
    
In [48]:
    
# Matrix now labelled by movieId on both axes.
df_tfidf_m2m.head()
    
    Out[48]:
In [49]:
    
# Check the relabelling at the other end of the matrix too.
df_tfidf_m2m.tail()
    
    Out[49]:
In [50]:
    
# Top 20 movies most similar to movieId 1.
# Fix: .ix was deprecated in pandas 0.20 and removed in 1.0; the index now
# holds movieId labels, so label-based .loc is the correct replacement.
df_tfidf_m2m.loc[1].sort_values(ascending=False)[:20]
    
    Out[50]:
In [51]:
    
# Which movie is movieId 1? (Toy Story, per the note below.)
dataset_with_tags[dataset_with_tags.movieId == 1]
    
    Out[51]:
In [52]:
    
# Look up one of the top similar movieIds from the ranking above.
dataset_with_tags[dataset_with_tags.movieId == 3114]
    
    Out[52]:
In [53]:
    
# Look up another top similar movieId.
dataset_with_tags[dataset_with_tags.movieId == 4886]
    
    Out[53]:
In [54]:
    
# Look up another top similar movieId.
dataset_with_tags[dataset_with_tags.movieId == 78499]
    
    Out[54]:
The movies closest to Toy Story are its sequels and Monsters, Inc. — exactly what we'd expect.
In [55]:
    
# Long format: one row per (movie, movie) pair with its similarity score.
pair_similarities = df_tfidf_m2m.stack().reset_index()
pair_similarities.columns = ['first_movie', 'second_movie', 'similarity_score']
m2m_similarity_stacked = pair_similarities
    
In [56]:
    
# Inspect the long-format similarity table.
m2m_similarity_stacked.head()
    
    Out[56]:
In [57]:
    
# And its other end.
m2m_similarity_stacked.tail()
    
    Out[57]:
In [58]:
    
import sqlite3 as db
    
In [59]:
    
# SQLite database file consumed by the movie-time app.
connection = db.connect('db.sqlite3')
    
In [60]:
    
# Columns expected by the movie_time_app_movie table, in schema order.
db_columns = ['movie_id', 'title', 'year', 'genres', 'num_ratings', 'rating_median', 'rating_mean']
for_db = dataset.rename(columns={'movieId': 'movie_id'})[db_columns]
    
In [61]:
    
# Mark every movie as relatable for the app.
# NOTE(review): movies with no tags cannot actually be related to others (see
# the conclusion after cell 29) — confirm whether this should instead be
# dataset.movie_tags.notnull().
for_db['relatable'] = True
    
In [62]:
    
for_db.head()
    
    Out[62]:
In [63]:
    
from tqdm import tqdm
    
In [64]:
    
# Write the movies to SQLite in ~100 chunks with a progress bar.
# Fixes: int(total/100) is 0 when the frame has < 100 rows, which makes
# range() raise ValueError ("step argument must not be zero") — clamp to 1.
# Also update the bar by the actual chunk size so the final (possibly
# shorter) chunk does not overcount.
total_length = len(for_db)
step = max(1, total_length // 100)
with tqdm(total=total_length) as pbar:
    for i in range(0, total_length, step):
        subset = for_db[i: i + step]
        subset.to_sql('movie_time_app_movie', connection, if_exists='append', index=False)
        pbar.update(len(subset))
    
    
In [65]:
    
# Sanity check: read back a few rows from the table just written.
pd.read_sql_query('SELECT * FROM movie_time_app_movie LIMIT 5', connection)
    
    Out[65]: