In [6]:
import pandas as pd
# unzip ml-20m.zip (MovieLens 20M) into the cwd
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [7]:
df = ratings.join(movies, on='movieId', rsuffix='drop').drop(columns=['movieId', 'movieIddrop'])
df.head()


Out[7]:
   userId  rating   timestamp  title                                              genres
0       1     3.5  1112486027  Grumpier Old Men (1995)                            Comedy|Romance
1       1     3.5  1112484676  Shanghai Triad (Yao a yao yao dao waipo qiao) ...  Crime|Drama
2       1     3.5  1112484819  Wings of Courage (1995)                            Adventure|Romance|IMAX
3       1     3.5  1112484727  Pocahontas (1995)                                  Animation|Children|Drama|Musical|Romance
4       1     3.5  1112484580  Guardian Angel (1994)                              Action|Drama|Thriller

In [8]:
len(movies), len(ratings), len(ratings) / len(movies)


Out[8]:
(27278, 20000263, 733.2012244299435)

In [9]:
df.title.to_pickle('title')

In [1]:
import pandas as pd
titles = pd.read_pickle('title').fillna('null')

In [2]:
titles.head()


Out[2]:
0                              Grumpier Old Men (1995)
1    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
2                              Wings of Courage (1995)
3                                    Pocahontas (1995)
4                                Guardian Angel (1994)
Name: title, dtype: object

In [3]:
len(titles)


Out[3]:
20000263

In [4]:
cat_titles = titles.astype(
    pd.api.types.CategoricalDtype(
        pd.unique(titles)))
len(cat_titles.cat.categories)


Out[4]:
9260

In [5]:
len(cat_titles.cat.codes)


Out[5]:
20000263
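The categorical dtype stores each of the 9,260 distinct titles exactly once in .cat.categories and represents the full 20,000,263-element series as small integer codes (.cat.codes), which is far more compact than the original object series of Python strings. A quick way to check, as a sketch (output omitted; the exact figures depend on the pandas version and the strings themselves):

titles.memory_usage(deep=True), cat_titles.memory_usage(deep=True)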

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
# value_counts() sorts descending, so this is each title's rating count,
# most popular first
counts_desc = cat_titles.value_counts().values
assert (counts_desc[:-1] >= counts_desc[1:]).all()  # confirm descending order

plt.plot(counts_desc[1:])  # 0-th is too large
plt.yscale('log')
plt.xlabel("movie index, by popularity")
plt.ylabel("# times movie appears (log scale)")
plt.title("movie appearance count among ratings")
plt.show()
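[figure: rating count per title, titles ordered by popularity, log-scaled y axis]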



In [7]:
import numpy as np
# fraction of all ratings covered by the k most popular titles
cdf = counts_desc.cumsum() / counts_desc.sum()
# index at which cumulative coverage first reaches each level
np.searchsorted(cdf, [.95, .99, .999, 1])


Out[7]:
array([3204, 5575, 7918, 9259])
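So of the 9,260 distinct titles, roughly the 3,200 most popular account for 95% of all ratings, about 5,600 for 99%, and about 7,900 for 99.9%. That skew motivates the exercise below: keep dedicated codes only for a modest set of frequent categories, collapse everything else into a single overflow bucket, and benchmark two ways of doing that remapping on a large synthetic index array.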

In [1]:
import numpy as np
from collections import Counter
from scipy.stats import truncnorm
%load_ext memory_profiler
d = 10000      # vocabulary size (number of distinct index values)
e = 1000       # how many of the most frequent indices get their own code
n = 100000000  # number of index observations to remap
if d < n:
    # long-tailed synthetic popularity: geometric draws skew heavily toward
    # small indices; the appended arange(d) guarantees every index appears
    dindices = np.random.geometric(p=0.01, size=(n - d)) - 1
    dindices = np.concatenate([dindices, np.arange(d)])
    dcounts = np.bincount(dindices)
    selected = dcounts.argsort()[::-1][:e]
else:
    dindices = np.random.choice(d, n // 2)
    frequent = np.random.choice(n, n - n // 2)
    dindices = np.concatenate([dindices, frequent])
    c = Counter(dindices)
    selected = np.asarray(sorted(c, key=c.get, reverse=True)[:e])
# np.searchsorted below requires its first argument to be sorted
selected = np.sort(selected)
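The setup above produces n = 100,000,000 observed indices over a vocabulary of d = 10,000 possible values, along with the e = 1,000 most frequent values in sorted order. The task is to remap every observation so that the i-th entry of selected becomes code i and anything outside selected becomes the overflow code e. The cells below compare two ways of doing this: a binary-search lookup with np.searchsorted against the sorted selected array, and a dense length-d lookup table gathered with np.take.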

In [2]:
%%memit

# approach 1: binary-search each index's position in the sorted selected
# array, then send anything that is not actually in selected to the
# overflow code e
searched = np.searchsorted(selected, dindices)
selected2 = np.append(selected, [-1])  # sentinel so position e never matches
searched[selected2[searched] != dindices] = -1
searched[searched == -1] = e
result = searched


peak memory: 2401.04 MiB, increment: 1546.69 MiB

In [3]:
%%timeit

searched = np.searchsorted(selected, dindices)
selected2 = np.append(selected, [-1])
searched[selected2[searched] != dindices] = -1
searched[searched == -1] = e
result = searched


5.07 s ± 55.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [3]:
%%memit
# approach 2: dense lookup table of length d sending the i-th selected
# index to code i and everything else to the overflow code e
mapping = np.full(d, e)
mapping[selected] = np.arange(e)
result = np.take(mapping, dindices)


peak memory: 1501.95 MiB, increment: 647.29 MiB

In [3]:
%%timeit
mapping = np.full(d, e)
mapping[selected] = np.arange(e)
result = np.take(mapping, dindices)


330 ms ± 6.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
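On this workload the lookup-table approach is roughly 15x faster (330 ms vs. 5.07 s) and peaks at noticeably less memory (about 1500 MiB vs. 2400 MiB), because it does one gather per element instead of a binary search plus several temporary boolean masks. A minimal sketch of the table-based remapping packaged as a reusable helper (remap_to_top is an illustrative name, not something defined above):

import numpy as np

def remap_to_top(indices, selected, vocab_size):
    # the i-th entry of selected maps to code i; every other value in
    # [0, vocab_size) maps to the overflow code len(selected)
    e = len(selected)
    mapping = np.full(vocab_size, e)
    mapping[selected] = np.arange(e)
    return np.take(mapping, indices)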