In [1]:
import pandas as pd
In [2]:
import numpy as np
In [3]:
ratings = pd.read_csv("datasets/ml-latest/ratings.csv", usecols=['movieId', 'rating'])
In [4]:
genome_scores = pd.read_csv("datasets/ml-latest/genome-scores.csv")
In [5]:
genome_tags = pd.read_csv("datasets/ml-latest/genome-tags.csv")
In [6]:
movie_names = pd.read_csv("datasets/ml-latest/movies.csv")
In [7]:
links = pd.read_csv("datasets/ml-latest/links.csv")
In [8]:
# Human-readable relevance table: attach tag names to the genome scores via tagId.
movie_tags_in_text = genome_scores.merge(genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]
In [9]:
%matplotlib inline
In [10]:
genome_scores.head()
Out[10]:
In [11]:
genome_scores[:100000].pivot(index='movieId', columns='tagId')['relevance'].head()
Out[11]:
In [12]:
table = genome_scores[:100000].pivot_table('relevance', index='movieId', columns='tagId', aggfunc='mean')
In [13]:
table.head()
Out[13]:
In [14]:
%matplotlib inline
In [15]:
table[:1].T.hist()
Out[15]:
Conclusion: a relevance score of 0.3 seems to be a good cut-off.
In [16]:
# Keep only (movie, tag) pairs whose relevance clears the 0.3 cut-off
# chosen from the histogram above.
movie_tags = genome_scores.loc[genome_scores.relevance > 0.3, ['movieId', 'tagId']]
In [17]:
# Left-join tag names onto the filtered pairs, then keep only the id columns —
# note the joined 'tag' column is immediately dropped, so this is effectively a
# pass-through of movie_tags (assuming tagId is unique in genome_tags —
# TODO confirm; duplicated tagIds would multiply rows here).
tags_to_movies = pd.merge(movie_tags, genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
In [18]:
# Cast tag ids to strings so they can be space-joined into "documents"
# for the text vectorizers further below.
tags_to_movies['tagId'] = tags_to_movies.tagId.astype(str)
In [19]:
def _concatenate_tags_of_movie(tags):
tags_as_str = ' '.join(set(tags))
return tags_as_str
In [20]:
# Aggregate each movie's tag ids into a single space-separated "document".
# The dict-based agg-rename ({'movie_tags': func}) was deprecated in pandas
# 0.20 and removed in 1.0 (raises SpecificationError); agg + rename is the
# supported equivalent.
tags_per_movie = (
    tags_to_movies.groupby('movieId')['tagId']
    .agg(_concatenate_tags_of_movie)
    .rename('movie_tags')
    .reset_index()
)
In [21]:
# Per-movie rating summary (mean, median, count). Named aggregation
# replaces the dict-based agg-rename, which was removed in pandas 1.0.
avg_ratings = ratings.groupby('movieId')['rating'].agg(
    rating_mean='mean',
    rating_median='median',
    num_ratings='size',
).reset_index()
In [22]:
# Attach the rating summary to the movie catalogue; left join keeps
# movies that have no ratings (their stats become NaN).
movies_with_ratings = movie_names.merge(avg_ratings, how='left', on='movieId')
In [23]:
# Attach the tag "documents"; left join keeps movies without genome tags.
dataset = movies_with_ratings.merge(tags_per_movie, how='left', on='movieId')
In [24]:
# Defensive rename — NOTE(review): with the current upstream column names
# (rating_mean/rating_median/movie_tags already set) none of these keys
# should exist, making this a no-op remnant of an earlier agg output;
# candidate for removal once confirmed.
dataset.rename(columns={'median': 'rating_median', 'mean': 'rating_mean', 'tagId': 'movie_tags'}, inplace=True)
In [25]:
import re
def extract_year_from_movie_title(movie_title):
    """Pull the release year out of a movie title.

    MovieLens titles end in "(YYYY)"; when several 4-digit runs appear
    (e.g. "2001: A Space Odyssey (1968)") the last one is taken.
    Returns NaN when no 4-digit run is present.
    """
    years = re.findall(r'\d{4}', movie_title)
    return int(years[-1]) if years else np.nan
In [26]:
dataset['year'] = dataset.title.apply(extract_year_from_movie_title)
In [27]:
dataset.head()
Out[27]:
There are movies without any genome tags:
In [28]:
dataset[dataset.movie_tags.isnull()].head()
Out[28]:
There are movies without any ratings:
In [29]:
dataset[dataset.rating_mean.isnull()].head()
Out[29]:
Conclusion: these movies cannot be related to other movies because they lack the tag features; they could instead be served through a "random recommendation" fallback.
In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
In [31]:
from sklearn.metrics.pairwise import cosine_similarity
In [32]:
# Drop movies with no genome tags — they cannot be vectorized below.
dataset_with_tags = dataset.dropna(subset=['movie_tags']).reset_index(drop=True)
In [33]:
bag_of_words = CountVectorizer()
In [34]:
# One-element list holding every tag id as a single space-separated string.
# NOTE(review): appears unused in the rest of the notebook — candidate for removal.
tags_as_descriptors = [' '.join(genome_tags.tagId.astype(str))]
In [35]:
movies_described_bag_of_words = bag_of_words.fit_transform(dataset_with_tags.movie_tags)
In [36]:
df_bag_m2m = pd.DataFrame(cosine_similarity(movies_described_bag_of_words))
In [37]:
tf_idf = TfidfVectorizer()
In [38]:
movies_tf_idf_described = tf_idf.fit_transform(dataset_with_tags.movie_tags)
In [39]:
m2m = cosine_similarity(movies_tf_idf_described)
In [40]:
# Wrap the similarity matrix computed in the previous cell — reuse `m2m`
# instead of running cosine_similarity over the whole corpus a second time.
df_tfidf_m2m = pd.DataFrame(m2m)
In [41]:
df_tfidf_m2m.head()
Out[41]:
In [42]:
index_to_movie_id = dataset_with_tags['movieId']
In [43]:
index_to_movie_id[0]
Out[43]:
In [44]:
index_to_movie_id[10665]
Out[44]:
In [45]:
# Positional row lookup; .ix was removed in pandas 1.0. After
# reset_index(drop=True) labels equal positions, so .iloc is equivalent.
dataset_with_tags.reset_index(drop=True).iloc[50]
Out[45]:
In [46]:
# Relabel similarity-matrix columns from positional indices to movieId strings.
new_columns = []
for col in df_tfidf_m2m.columns:
    new_columns.append(str(index_to_movie_id[int(col)]))
df_tfidf_m2m.columns = new_columns
In [47]:
# Relabel the rows the same way: positional index -> movieId.
new_index = []
for row_pos in df_tfidf_m2m.index:
    new_index.append(index_to_movie_id[row_pos])
df_tfidf_m2m.index = new_index
In [48]:
df_tfidf_m2m.head()
Out[48]:
In [49]:
df_tfidf_m2m.tail()
Out[49]:
In [50]:
# Top-20 most similar movies to movieId 1 (Toy Story). .ix was removed in
# pandas 1.0; the index now holds movieId labels, so .loc is the label-based
# equivalent.
df_tfidf_m2m.loc[1].sort_values(ascending=False)[:20]
Out[50]:
In [51]:
dataset_with_tags[dataset_with_tags.movieId == 1]
Out[51]:
In [52]:
dataset_with_tags[dataset_with_tags.movieId == 3114]
Out[52]:
In [53]:
dataset_with_tags[dataset_with_tags.movieId == 4886]
Out[53]:
In [54]:
dataset_with_tags[dataset_with_tags.movieId == 78499]
Out[54]:
The movies closest to Toy Story are its sequels and Monsters, Inc.! (No sh*t, Sherlock.)
In [55]:
# Long-format edge list: one row per (movie, movie) pair with its cosine similarity.
stacked = df_tfidf_m2m.stack().reset_index()
stacked.columns = ['first_movie', 'second_movie', 'similarity_score']
m2m_similarity_stacked = stacked
In [56]:
m2m_similarity_stacked.head()
Out[56]:
In [57]:
m2m_similarity_stacked.tail()
Out[57]:
In [58]:
import sqlite3 as db
In [59]:
connection = db.connect('db.sqlite3')
In [60]:
# Shape the dataset to the columns (and order) the app's movie table expects.
db_columns = ['movie_id', 'title', 'year', 'genres', 'num_ratings', 'rating_median', 'rating_mean']
for_db = dataset.rename(columns={'movieId': 'movie_id'})[db_columns]
In [61]:
for_db['relatable'] = True
In [62]:
for_db.head()
Out[62]:
In [63]:
from tqdm import tqdm
In [64]:
# Insert in ~100 chunks so tqdm can show progress.
total_length = len(for_db)
# max(1, ...) guards against step == 0 for small tables, which would make
# range() raise "arg 3 must not be zero".
step = max(1, total_length // 100)
with tqdm(total=total_length) as pbar:
    for start in range(0, total_length, step):
        chunk = for_db.iloc[start:start + step]
        chunk.to_sql('movie_time_app_movie', connection, if_exists='append', index=False)
        # Update by the actual chunk size so the bar does not overshoot
        # on the final partial chunk.
        pbar.update(len(chunk))
In [65]:
pd.read_sql_query('SELECT * FROM movie_time_app_movie LIMIT 5', connection)
Out[65]: