- See here for more information.
- Author: Lilian Besson.
- License: MIT License.
In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
In [2]:
# List every CSV file in the working directory (long format, human-readable
# sizes, sorted by modification time with the most recent file last).
!ls -larth *.csv
In [3]:
# Keep a copy of the current submission.csv as submission.csv.old
# (-f overwrites an existing backup, -v prints what was copied).
!cp -vf submission.csv submission.csv.old
In [57]:
# Load the four CSV files of the challenge from the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")
titles = pd.read_csv(filepath_or_buffer="titles.csv")
watched = pd.read_csv(filepath_or_buffer="watched.csv")
In [56]:
# Distinct values found in the `category` column of the titles table.
np.unique(titles["category"])
Out[56]:
Just to check they have correctly been read:
In [58]:
# Sanity check of the training set: first rows, row count, and id ranges.
train.head(5)
train.shape[0]
min(train.user_id), max(train.user_id)
min(train.work_id), max(train.work_id)
Out[58]:
Out[58]:
Out[58]:
Out[58]:
In [6]:
# Sanity check of the test set: first rows, row count, and id ranges.
test.head(5)
test.shape[0]
min(test.user_id), max(test.user_id)
min(test.work_id), max(test.work_id)
Out[6]:
Out[6]:
Out[6]:
Out[6]:
In [7]:
# Sanity check of the watched set: first rows, row count, and id ranges.
watched.head(5)
watched.shape[0]
min(watched.user_id), max(watched.user_id)
min(watched.work_id), max(watched.work_id)
Out[7]:
Out[7]:
Out[7]:
Out[7]:
In [32]:
# Start the submission from an independent (deep) copy of the test table.
submission = test.copy(deep=True)
In [33]:
# Mean of the `rating` column over the whole training set.
total_average_rating = train["rating"].mean()
In [34]:
# First rows and row count of the submission table.
submission.head(5)
submission.shape[0]
Out[34]:
Out[34]:
In [35]:
# Sorted array of every work id that appears in the test set or the train set.
works_id = np.union1d(test.work_id.values, train.work_id.values)
In [36]:
# One row per work id, initialised to NaN rather than 0: a work without any
# train rating must stay missing so that the later
# `fillna(value=total_average_rating)` step can replace it by the global mean.
# With a 0 placeholder those works would silently be predicted as 0 and the
# fillna would never apply. NaN also makes the column float from the start.
mean_ratings = pd.DataFrame(data={'mean_rating': np.nan}, index=works_id)
mean_ratings[:5]
len(mean_ratings)
Out[36]:
Out[36]:
In [37]:
# Per-work mean of the train ratings, re-indexed on every known work id
# (works absent from `train` get NaN).
# The `rating` column is selected *before* aggregating so that only this
# column is averaged: it avoids computing means of unrelated columns and does
# not fail if `train` carries a non-numeric column.
computed_means = pd.DataFrame(
    data={'mean_rating': train.groupby('work_id')['rating'].mean()},
    index=works_id,
)
computed_means[:5]
len(computed_means)
Out[37]:
Out[37]:
In [38]:
# Copy the computed means into `mean_ratings`, in place and aligned on the
# index (DataFrame.update leaves a cell untouched when the new value is NaN).
mean_ratings.update(other=computed_means)
In [39]:
# First rows and row count of the per-work mean ratings.
mean_ratings.head(10)
mean_ratings.shape[0]
Out[39]:
Out[39]:
In [41]:
# Attach to every test row the mean rating of its work and expose it under the
# column name `prob_willsee`.
# - `rename(columns=...)` is the label-renaming API; `rename_axis` with a dict
#   only did this in old pandas releases and is rejected by current ones.
# - The chain starts from `test` (of which `submission` was a plain copy), so
#   re-running this cell does not stack a second `mean_rating` column.
submission = (
    test
    .join(mean_ratings, on='work_id')
    .rename(columns={'mean_rating': 'prob_willsee'})
)
In [43]:
# Wherever a value is still missing (e.g. a mean over no rating at all),
# fall back to the global average rating.
submission = submission.fillna(value=total_average_rating)
In [51]:
# Preview the first ten rows of the submission.
submission.head(10)
Out[51]:
Let's save it to submission_naive1.csv:
In [52]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf="submission_naive1.csv", index=False)
In [49]:
# Show the file that was just written (long format, human-readable size).
!ls -larth submission_naive1.csv
The bonus data set watched can give a lot of information. There are 200000 entries in it and only 100000 in test.csv.
In [66]:
# Row counts of the test set and of the watched set.
test.shape[0], watched.shape[0]
Out[66]:
In [68]:
# Sorted list of the distinct values of the `rating` column of `watched`.
ratings = np.unique(watched["rating"]).tolist()
ratings
Out[68]:
In [67]:
# First rows of the watched table.
watched.head(5)
Out[67]:
In [84]:
# Rename the `rating` column of `watched` to `strrating` so that it does not
# collide with `train.rating` in the merges below.
# `rename(columns=...)` is the label-renaming API; `rename_axis` with a dict
# only did this in old pandas releases and is rejected by current ones.
# Re-running the cell is harmless: a missing `rating` label is simply ignored.
watched = watched.rename(columns={'rating': 'strrating'})
In [85]:
# First rows of the watched table, after the column rename.
watched.head(5)
Out[85]:
In [69]:
# First rows of the training table, for comparison with `watched`.
train.head(5)
Out[69]:
Are there pairs (user, work) for which both train data and watched data are available (i.e., both see/notsee and liked/disliked)?
In [109]:
# Inner join: (user, work) pairs present in both `train` and `watched`.
pd.merge(train, watched, on=['user_id', 'work_id'])
Out[109]:
And what about test data?
In [108]:
# Inner join: (user, work) pairs present in both `test` and `watched`.
pd.merge(test, watched, on=['user_id', 'work_id'])
Out[108]:
In [144]:
# Inner join on the work only, ignoring which user each row belongs to.
pd.merge(test, watched, on='work_id')
Out[144]:
No! So we can forget about the user_id, and we will learn how to map liked/disliked to see/notsee for each movie.
In [105]:
# Pair every `watched` row with every `train` row of the same work.
# Both tables have a `user_id` column, so the result carries `user_id_x`
# (from `watched`) and `user_id_y` (from `train`).
all_train = pd.merge(watched, train, on='work_id')
all_train.head(5)
Out[105]:
In [106]:
# Drop the two user id columns created by the merge.
all_train = all_train.drop(['user_id_x', 'user_id_y'], axis=1)
We can delete the two user_id columns.
In [107]:
# First rows of the merged table, now without the user id columns.
all_train.head(5)
Out[107]:
We can first get the average rating of each work:
In [130]:
# Mean `rating` of each work in the merged table (first ten works only).
all_train.groupby('work_id')['rating'].mean().head(10)
Out[130]:
This table now contains, for each work, a list of mappings from strrating to rating.
It can be condensed into a concise mapping, of this form:
In [80]:
# Hand-written guess of the probability of "will see" for each textual rating.
mapping_strrating_probwillsee = dict(
    dislike=0,
    neutral=0.50,
    like=0.75,
    love=1,
)
Manually, for instance for one movie:
In [129]:
# Rows of work 8025 whose textual rating is 'dislike'.
all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'dislike')]
Out[129]:
In [133]:
# Mean `rating` over every row of work 8025.
all_train.loc[all_train['work_id'] == 8025, 'rating'].mean()
Out[133]:
In [134]:
# Work 8025, textual rating 'dislike': number of rows, then mean `rating`.
len(all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'dislike'), 'rating'])
all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'dislike'), 'rating'].mean()
Out[134]:
Out[134]:
In [135]:
# Work 8025, textual rating 'neutral': number of rows, then mean `rating`.
len(all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'neutral'), 'rating'])
all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'neutral'), 'rating'].mean()
Out[135]:
Out[135]:
In [141]:
# Work 8025, textual rating 'like': number of rows, then mean `rating`.
len(all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'like'), 'rating'])
all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'like'), 'rating'].mean()
Out[141]:
Out[141]:
In [142]:
# Work 8025, textual rating 'love': number of rows, then mean `rating`.
len(all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'love'), 'rating'])
all_train.loc[(all_train['work_id'] == 8025) & (all_train['strrating'] == 'love'), 'rating'].mean()
Out[142]:
Out[142]:
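That's weird! If the four means above are all equal to the overall mean of work 8025, this is an artefact of the merge rather than a property of the data: merging watched and train on work_id alone pairs every watched row of a work with every train row of the same work, so each strrating subset contains the same rating values (just repeated) and therefore has the same mean.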
In [63]:
# Sorted list of the distinct values of the `category` column of `titles`.
categories = np.unique(titles["category"]).tolist()
categories
Out[63]:
In [132]:
# Print, for each category, how many titles belong to it.
for cat in categories:
    print("There is {:>5} work(s) in category '{}'.".format((titles.category == cat).sum(), cat))
One category is alone, let's rewrite it to 'anime'.
In [65]:
# Integer code of each category; 'album' shares the code of 'anime'.
categories = dict(
    anime=0,
    album=0,
    manga=1,
)
TODO !
TODO !