In [2]:
# Third-party stack for the anime-recommendation EDA below.
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import pandas as pd
import scipy
import csv

from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors

# Plot/display configuration for the whole notebook.
matplotlib.style.use('ggplot')
pd.options.display.float_format = '{:20,.2f}'.format  # thousands separators, 2 decimals
pd.set_option('display.max_columns', 50)

In [3]:
data_rating = pd.io.parsers.read_csv('raw/rating.csv')
#data_rating = data_rating.loc[data_rating['user_id'] != 48766]
data_anime = pd.io.parsers.read_csv('raw/anime.csv')

train_rating = pd.io.parsers.read_csv('omer/rating_train.csv')
test_rating = pd.io.parsers.read_csv('omer/rating_test.csv')
final_profiles = pd.io.parsers.read_csv('raw/user_profiles_final.csv')

print "Animes: "
print data_anime.describe()

print "\nRatings: "
print data_rating.describe()


Animes: 
                  anime_id               rating              members
count            12,294.00            12,064.00            12,294.00
mean             14,058.22                 6.47            18,071.34
std              11,455.29                 1.03            54,820.68
min                   1.00                 1.67                 5.00
25%               3,484.25                 5.88               225.00
50%              10,260.50                 6.57             1,550.00
75%              24,794.50                 7.18             9,437.00
max              34,527.00                10.00         1,013,917.00

Ratings: 
                   user_id             anime_id               rating
count         7,813,737.00         7,813,737.00         7,813,737.00
mean             36,727.96             8,909.07                 6.14
std              20,997.95             8,883.95                 3.73
min                   1.00                 1.00                -1.00
25%              18,974.00             1,240.00                 6.00
50%              36,791.00             6,213.00                 7.00
75%              54,757.00            14,093.00                 9.00
max              73,516.00            34,519.00                10.00

In [241]:
# Distribution of per-anime average ratings.
data_anime['rating'].plot.hist(bins=40, figsize=(15, 4))
plt.show()



In [3]:
data_rating.groupby('user_id').size().to_frame().sort_values(by=0).plot(kind="density", logx=True, figsize=(15,4))
print data_rating.groupby('user_id').size().to_frame().sort_values(by=0).quantile(0.01)
plt.show()


0                   1.00
Name: 0.01, dtype: float64

In [4]:
print 'Anime rating:'
print data_anime['rating'].describe(include='all')

for t in ['TV', 'OVA', 'Movie', 'Special', 'ONA', 'Music']:
    print '\nRating: ' + t
    print data_anime.loc[data_anime['type'] == t]['rating'].describe(include='all')
    data_anime.loc[data_anime['type'] == t]['rating'].plot(kind="density", figsize=(15,8))

L=plt.legend()
for i, t in enumerate(['TV', 'OVA', 'Movie', 'Special', 'ONA', 'Music']):
    L.get_texts()[i].set_text(t)

plt.show()


Anime rating:
count              12,064.00
mean                    6.47
std                     1.03
min                     1.67
25%                     5.88
50%                     6.57
75%                     7.18
max                    10.00
Name: rating, dtype: float64

Rating: TV
count               3,671.00
mean                    6.90
std                     0.86
min                     2.67
25%                     6.44
50%                     6.94
75%                     7.46
max                     9.60
Name: rating, dtype: float64

Rating: OVA
count               3,285.00
mean                    6.38
std                     0.86
min                     2.00
25%                     5.85
50%                     6.38
75%                     6.92
max                     9.25
Name: rating, dtype: float64

Rating: Movie
count               2,297.00
mean                    6.32
std                     1.21
min                     1.92
25%                     5.42
50%                     6.47
75%                     7.25
max                    10.00
Name: rating, dtype: float64

Rating: Special
count               1,671.00
mean                    6.52
std                     0.89
min                     1.67
25%                     6.08
50%                     6.63
75%                     7.11
max                     8.66
Name: rating, dtype: float64

Rating: ONA
count                 652.00
mean                    5.64
std                     1.13
min                     2.58
25%                     4.87
50%                     5.75
75%                     6.41
max                     8.26
Name: rating, dtype: float64

Rating: Music
count                 488.00
mean                    5.59
std                     0.96
min                     3.28
25%                     5.01
50%                     5.62
75%                     6.15
max                     8.38
Name: rating, dtype: float64

In [5]:
_genre = data_anime['genre']
_genre_list = []
genre_count = []
unique_genre = []
for g in _genre:
    try:
        gs = g.split(',')
        if len(gs) == 1:
            unique_genre.extend(gs)
        _genre_list.extend(map(lambda s: s.strip(), gs))
        genre_count.append(gs)
    except:
        pass
    
#print genre_count / len(data_anime) 

print sorted(set(unique_genre))
print len(sorted(set(unique_genre)))

print len(data_anime) 
print len(sorted(set(_genre_list)))
print sorted(set(_genre_list))
print
print set(_genre_list) - set(unique_genre)

pd.DataFrame(map(lambda x: len(x), genre_count)).plot(kind='hist', bins=15)
plt.show()


['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space', 'Sports', 'Supernatural', 'Thriller', 'Vampire', 'Yaoi']
37
12294
43
['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri']

set(['Josei', 'Police', 'Yuri', 'Shounen Ai', 'Super Power', 'Shoujo Ai'])

In [6]:
# Bar chart of genre frequency, most common first.
genre_count = pd.DataFrame(_genre_list).groupby(0)
genre_sizes = genre_count.size().sort_values(ascending=False)
genre_sizes.plot(kind="bar", width=0.9, figsize=(15, 4))
plt.show()



In [7]:
# Percentile curve of the per-anime member counts (fan-base size).
# Removed an unused `members = data_anime['members'].cumsum()` — the result
# was never referenced anywhere in the notebook.
data_anime['members'].quantile(np.arange(0.0, 1.0, 0.01)).plot(kind="line")
plt.show()



In [8]:
# Box plot of member counts on a log scale (heavy right tail).
data_anime['members'].plot.box(logy=True)
plt.show()



In [9]:
movies = data_rating.groupby('user_id').size().to_frame().sort_values(by=0)
movies.loc[movies[0] < 314].describe()
movies.loc[movies[0] < 314].plot(kind="hist", bins=313)
plt.show()



In [4]:
def normalize(df_user_profiles):
    """Min-max scale a user-profile frame's feature columns to [0, 1].

    Takes every column except the first as the feature matrix, scales each
    feature column independently, then re-attaches user_id and a /10-scaled
    genre_count.

    NOTE(review): the scaled columns are relabelled with
    columns.difference(['user_id','rating','genre']), which returns an
    ALPHABETICALLY sorted index — if that order differs from the original
    iloc[:,1:] column order, values end up under the wrong genre labels.
    Verify the two orderings match before trusting the output.
    """
    x = df_user_profiles.iloc[:,1:].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    
    # Scales each column (feature) to [0, 1] across all users.
    x_scaled = min_max_scaler.fit_transform(x)
    
    df_scaled = pd.DataFrame(x_scaled, columns=df_user_profiles.columns.difference(['user_id','rating','genre']))
    
    df_scaled['user_id'] = df_user_profiles['user_id'].values
    # genre_count is rescaled by a fixed /10 rather than min-max
    # (presumably its known range — TODO confirm; the file-based variant uses /13).
    df_scaled['genre_count'] = map(lambda x: x /10.0, df_user_profiles['genre_count'].values)
    #df_scaled['rating'] = 1.0
    
    return df_scaled

def normalize_prof_from_file(df_user_profiles):
    """Normalize profiles loaded from user_profiles_final.csv.

    Unlike normalize(), this scales x TRANSPOSED: MinMaxScaler scales each
    column of its input independently, and the columns of x.T are individual
    user rows — so each user's feature vector is min-max scaled across its own
    genre features (per-user scaling), not per-feature across users.
    Re-attaches user_id, a /13-scaled genre_count, and a constant rating of 1.0.

    NOTE(review): same alphabetical-relabelling caveat as normalize() — the
    columns.difference(...) order must match the iloc[:,:-3] column order.
    """
    # Drop the last three columns (presumably user_id / genre_count / rating
    # trailers in the CSV — TODO confirm against the file header).
    x = df_user_profiles.iloc[:,:-3].values #returns a numpy array
    print len(x.T)  # debug: number of feature columns
    min_max_scaler = preprocessing.MinMaxScaler()
    
    # Transpose so each USER (not each feature) is scaled to [0, 1].
    x_scaled = min_max_scaler.fit_transform(x.T)
    print len(df_user_profiles.columns.difference(['user_id','rating','genre_count']))  # debug: label count
    df_scaled = pd.DataFrame(x_scaled.T, columns=df_user_profiles.columns.difference(['user_id','rating','genre_count']))
    
    df_scaled['user_id'] = df_user_profiles['user_id'].values
    df_scaled['genre_count'] = map(lambda x: x /13.0, df_user_profiles['genre_count'].values)
    df_scaled['rating'] = 1.0  # constant query-side rating feature
    
    return df_scaled

def get_user_profile(user_id, df_rating, data_anime):
    """Build a one-row genre profile for user_id from their rated anime.

    Joins the user's ratings to per-anime genre indicator vectors, counts how
    often each genre appears (squared), and records the average number of
    genres per rated anime in 'genre_count'.
    """
    df_anime_genres = pd.get_dummies(data_anime['genre'].str.get_dummies(sep=", ")) # creates genre vectors
    df_anime_vector = pd.concat([data_anime['anime_id'], df_anime_genres], axis=1)
    
    # All ratings by this user, joined with the genre indicators of each anime.
    df_user = df_rating.loc[df_rating['user_id'] == user_id]
    df_merged = pd.merge(df_user, df_anime_vector, how='left', left_on='anime_id', right_on='anime_id' 
                        ).drop(['anime_id', 'rating'], axis=1)

    
    # Per-anime genre tag count (row sums over the indicator columns).
    avg_genre = df_merged[df_merged.columns.difference(['user_id'])].sum(axis=1)
    
    # Count only 1's
    # NOTE(review): value_counts indexes the result by VALUE, and the mask is
    # built from df_merged's row index — selecting the "1" row this way only
    # works if the two indexes line up. Looks coincidental; verify.
    df_user_sum = df_merged.apply(pd.Series.value_counts).loc[df_merged.index == 1]
    df_user_sum.fillna(0, inplace = True)
    # Square the per-genre counts — presumably to emphasize dominant genres
    # in nearest-neighbour distance; TODO confirm intent.
    df_user_sum = df_user_sum.apply(func=lambda x: x**2,axis=0)

    # Mean number of genre tags per rated anime.
    df_user_sum['genre_count'] = avg_genre.sum() / float(len(avg_genre))
    df_user_sum['user_id'] = user_id
   # df_user_sum['rating'] = 10.0

    return df_user_sum

def build_user_profiles(user_ids):
    """Build one profile row per user in user_ids (uses global data_rating/data_anime).

    Collects the per-user frames in a list and concatenates once — the
    original appended to a growing DataFrame inside the loop, which is
    quadratic in the number of users. (Also avoids shadowing builtin `id`.)
    """
    profiles = [get_user_profile(uid, data_rating, data_anime) for uid in user_ids]
    if not profiles:
        return pd.DataFrame()  # preserve empty-input behavior of the original
    return pd.concat(profiles, ignore_index=True)

def build_knn(n, id, rating=False):
    """Fit a ball-tree NearestNeighbors model over anime genre vectors.

    Excludes anime the user (`id`) already rated in the training set, builds
    a genre-indicator vector per remaining anime plus a /13-scaled genre
    count, and optionally a /10-scaled average rating feature.
    """
    # Anime the user has already seen in training — excluded from candidates.
    filter_out = train_rating.loc[train_rating['user_id'] == id]['anime_id']
    filter_anime = data_anime.loc[~data_anime['anime_id'].isin(set(filter_out))]
    
    filter_anime_genres = pd.get_dummies(filter_anime['genre'].str.get_dummies(sep=", ")) # creates genre vectors
    df_anime_vector = pd.concat([filter_anime['anime_id'], filter_anime_genres], axis=1) # anime_id + genre vector
    # Same /13 scale as normalize_prof_from_file's genre_count, so the
    # query profile and the anime vectors are comparable on this feature.
    df_anime_vector['genre_count'] =  df_anime_vector[df_anime_vector.columns.difference(['anime_id'])].sum(axis=1).apply(lambda x: x / 13.0)
    if rating:
        # NOTE(review): this assignment mutates filter_anime_genres AFTER it
        # was concatenated (copied) into df_anime_vector — it appears to have
        # no effect on the fitted data. Likely leftover; verify before removing.
        filter_anime_genres['rating'] = 0
        df_anime_vector['rating'] =  filter_anime['rating'].apply(lambda x: x / 10.0)
        df_anime_vector.fillna(0, inplace = True)

    # Fit on every column except anime_id (first column).
    return NearestNeighbors(n_neighbors=n, algorithm='ball_tree').fit(df_anime_vector.iloc[:,1:])

def get_n_closest_users(norm_profile, n, rating):
    """Find the n anime nearest to a normalized user profile.

    Returns (distances, indices, feature_profile) where feature_profile is
    the query profile with its user_id stripped.
    """
    model = build_knn(n, norm_profile.user_id, rating=rating)

    # The model was fitted on feature columns only, so drop user_id
    # from the query before searching.
    query = norm_profile.drop('user_id')
    distances, indices = model.kneighbors(query)

    return distances, indices, query

In [11]:
# Sanity-check profile construction for two users, then normalize the
# precomputed profiles loaded from file.
profile1 = get_user_profile(1, train_rating, data_anime)
profile2 = get_user_profile(2, train_rating, data_anime)

# Instance-method call; the original used the unbound form
# pd.DataFrame.append(profile1, profile2), which is equivalent but obscure.
profiles = profile1.append(profile2)
# print normalize(profiles)

final_normalized = normalize_prof_from_file(final_profiles)
final_normalized.head(5)


43
43
Out[11]:
Action Adventure Cars Comedy Dementia Demons Drama Ecchi Fantasy Game Harem Hentai Historical Horror Josei Kids Magic Martial Arts Mecha Military Music Mystery Parody Police Psychological Romance Samurai School Sci-Fi Seinen Shoujo Shoujo Ai Shounen Shounen Ai Slice of Life Space Sports Super Power Supernatural Thriller Vampire Yaoi Yuri user_id genre_count rating
0 0.78 0.14 0.00 1.00 0.00 0.16 0.34 0.67 0.50 0.08 0.72 0.00 0.03 0.08 0.00 0.00 0.18 0.04 0.09 0.01 0.01 0.07 0.03 0.00 0.08 0.91 0.00 0.74 0.28 0.21 0.00 0.03 0.37 0.00 0.08 0.00 0.00 0.16 0.62 0.00 0.08 0.00 0.00 1.00 0.41 1.00
1 1.00 0.75 0.00 0.65 0.00 0.06 0.48 0.08 0.81 0.12 0.00 0.00 0.00 0.10 0.00 0.27 0.21 0.06 0.02 0.02 0.00 0.17 0.00 0.02 0.17 0.17 0.00 0.19 0.15 0.06 0.02 0.00 0.71 0.00 0.08 0.00 0.19 0.15 0.44 0.06 0.02 0.00 0.00 3.00 0.38 1.00
2 1.00 0.41 0.00 0.78 0.00 0.04 0.19 0.15 0.81 0.07 0.07 0.00 0.19 0.07 0.00 0.00 0.37 0.04 0.00 0.07 0.00 0.22 0.00 0.04 0.11 0.30 0.15 0.30 0.04 0.07 0.07 0.00 0.56 0.00 0.15 0.00 0.04 0.19 0.44 0.07 0.15 0.00 0.00 4.00 0.35 1.00
3 0.62 0.46 0.05 1.00 0.00 0.11 0.39 0.07 0.39 0.01 0.04 0.00 0.12 0.01 0.02 0.12 0.11 0.05 0.01 0.01 0.04 0.07 0.10 0.00 0.04 0.13 0.10 0.30 0.33 0.20 0.00 0.00 0.67 0.00 0.28 0.02 0.29 0.13 0.34 0.07 0.05 0.00 0.00 5.00 0.32 1.00
4 0.83 0.08 0.00 1.00 0.00 0.17 0.67 0.33 0.33 0.17 0.33 0.00 0.08 0.33 0.17 0.00 0.00 0.17 0.00 0.00 0.08 0.50 0.00 0.17 0.42 0.75 0.00 0.75 0.17 0.42 0.08 0.00 0.58 0.00 0.58 0.00 0.17 0.33 1.00 0.17 0.00 0.00 0.00 6.00 0.33 1.00

In [13]:
# Spot-check: every raw rating recorded for user 102.
data_rating.loc[data_rating['user_id'] == 102]


Out[13]:
user_id anime_id rating
8280 102 24 8
8281 102 30 7
8282 102 45 10
8283 102 48 8
8284 102 66 8
8285 102 71 7
8286 102 72 7
8287 102 73 7
8288 102 79 7
8289 102 98 6
8290 102 99 7
8291 102 121 6
8292 102 145 5
8293 102 166 7
8294 102 167 8
8295 102 169 5
8296 102 189 7
8297 102 190 7
8298 102 199 9
8299 102 202 7
8300 102 205 7
8301 102 226 6
8302 102 237 8
8303 102 355 8
8304 102 356 8
8305 102 357 7
8306 102 371 6
8307 102 431 9
8308 102 534 8
8309 102 846 8
... ... ... ...
8357 102 10020 9
8358 102 10080 8
8359 102 10464 7
8360 102 10491 6
8361 102 10521 8
8362 102 10578 6
8363 102 10620 8
8364 102 10711 7
8365 102 10719 8
8366 102 10721 8
8367 102 10793 8
8368 102 11111 7
8369 102 11433 9
8370 102 11757 8
8371 102 11759 7
8372 102 11785 6
8373 102 13659 9
8374 102 13859 5
8375 102 13939 7
8376 102 14741 8
8377 102 14829 8
8378 102 14967 8
8379 102 15699 6
8380 102 16498 9
8381 102 18857 9
8382 102 19815 9
8383 102 22297 8
8384 102 27821 7
8385 102 28701 8
8386 102 30276 7

107 rows × 3 columns


In [10]:
# profiles = build_user_profiles([10203,43202,1300])


usdf = pd.DataFrame()

with open('content_recommendations', 'ab') as file:
    writer = csv.writer(file)
    for idx in [5,6,7,8,14,17,21,23,25,26,27]:    
        distances, indices, us = get_n_closest_users(final_normalized.drop([], axis=1).iloc[idx], 10, True)

        usdf = usdf.append(final_normalized.iloc[idx], ignore_index=True)
        test_movies = test_rating.loc[test_rating['user_id'] == final_normalized.iloc[idx]['user_id']]
        for ind in indices:
            print "-----------------------"
            print final_normalized.iloc[idx]['user_id']
            # print data_anime.loc[ind][['anime_id','genre', 'rating']]
            print len(data_anime.loc[ind]['anime_id'])
            print set(data_anime.loc[ind]['anime_id']).intersection(set(test_movies['anime_id']))
            writer.writerow([final_normalized.iloc[idx]['user_id'], ])


c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
7.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
8.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
11.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
12.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
19.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
22.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
26.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
28.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
30.0
10
set([])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
-----------------------
31.0
10
set([])
-----------------------
32.0
10
set([249])
c:\python27\lib\site-packages\sklearn\utils\validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)