In [6]:
import pandas as pd
# unzip ml-20.zip into cwd
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
In [7]:
df = ratings.join(movies, on='movieId', rsuffix='drop').drop(columns=['movieId', 'movieIddrop'])
df.head()
Out[7]:
In [8]:
len(movies), len(ratings), len(ratings) / len(movies)
Out[8]:
In [9]:
df.title.to_pickle('title')
In [1]:
import pandas as pd
titles = pd.read_pickle('title').fillna('null')
In [2]:
titles.head()
Out[2]:
In [3]:
len(titles)
Out[3]:
In [4]:
cat_titles = titles.astype(
pd.api.types.CategoricalDtype(
pd.unique(titles)))
len(cat_titles.cat.categories)
Out[4]:
In [5]:
len(cat_titles.cat.codes)
Out[5]:
In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
counts_desc = cat_titles.value_counts().values
assert sorted(counts_desc, reverse=True)
plt.plot(counts_desc[1:]) # 0-th is too large
plt.xlabel("movie index, by popularity")
plt.ylabel("log # times movie appears")
plt.title("movie apperance count among ratings")
plt.show()
In [7]:
import numpy as np
cdf = counts_desc.cumsum() / counts_desc.sum()
np.searchsorted(cdf, [.95, .99, .999, 1])
Out[7]:
In [1]:
import numpy as np
from collections import Counter
from scipy.stats import truncnorm
%load_ext memory_profiler
d = 10000
e = 1000
n = 100000000
if d < n:
dindices = np.random.geometric(p=0.01, size=(n - d)) - 1
dindices = np.concatenate([dindices, np.arange(d)])
dcounts = np.bincount(dindices)
selected = dcounts.argsort()[::-1][:e]
else:
dindices = np.random.choice(d, n // 2)
frequent = np.random.choice(n, n - n // 2)
dindices = np.concatenate([dindices, frequent])
c = Counter(dindices)
selected = np.asarray(sorted(c, key=c.get, reverse=True)[:e])
In [2]:
%%memit
searched = np.searchsorted(selected, dindices)
selected2 = np.append(selected, [-1])
searched[selected2[searched] != dindices] = -1
searched[searched == -1] = e
result = searched
In [3]:
%%timeit
searched = np.searchsorted(selected, dindices)
selected2 = np.append(selected, [-1])
searched[selected2[searched] != dindices] = -1
searched[searched == -1] = e
result = searched
In [3]:
%%memit
mapping = np.full(d, e)
mapping[selected] = np.arange(e)
dindices = np.take(mapping, dindices)
In [3]:
%%timeit
mapping = np.full(d, e)
mapping[selected] = np.arange(e)
dindices = np.take(mapping, dindices)