In [1]:
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import pandas as pd
import scipy
# matplotlib.style.use('ggplot')
# pd.options.display.float_format = '{:20,.2f}'.format
In [2]:
# Load the full rating and anime tables plus the pre-split train/test ratings.
# All paths are relative to the notebook's working directory.
data_rating = pd.read_csv('raw/rating.csv')
# data_rating = data_rating.loc[data_rating['user_id'] != 48766]
data_anime = pd.read_csv('raw/anime.csv')
train_rating = pd.read_csv('omer/rating_train.csv')
test_rating = pd.read_csv('omer/rating_test.csv')

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Animes: ")
print(data_anime.describe())
print("\nRatings: ")
print(data_rating.describe())
In [18]:
# Show the single anime row at position 60 (positional slice, end exclusive).
data_anime.iloc[60:61]
Out[18]:
In [16]:
# Show the three rating rows at positions 153, 154 and 155.
data_rating.iloc[153:156]
Out[16]:
In [38]:
# Histogram of the 'rating' column of data_anime (missing values dropped),
# drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(
    data_anime['rating'].dropna(),
    bins=50,
    label="Average rating of movies",
    color="b",
    alpha=0.6,
    edgecolor="k",
)
ax.set_xlabel('Rating')
ax.set_ylabel('Number of movies')
ax.legend()
# One tick per integer value from 0 to 10.
ax.set_xticks(np.arange(0, 11))
plt.show()
In [25]:
# Same 'rating' column plotted with the pandas plotting accessor, 40 bins,
# on a wide 15x4 canvas.
data_anime['rating'].plot.hist(bins=40, figsize=(15, 4))
plt.show()
In [26]:
# Density of the number of rating rows per user, with a log-scaled x axis.
(
    data_rating
    .groupby('user_id')
    .size()              # rows per user_id
    .to_frame()          # single column named 0
    .sort_values(by=0)
    .plot.density(logx=True, figsize=(15, 4))
)
plt.show()
In [27]:
# Summary statistics of the rating overall and per anime type, with one
# density curve per type overlaid on a shared figure.
anime_types = ['TV', 'OVA', 'Movie', 'Special', 'ONA', 'Music']

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print('Anime rating:')
# Series.describe ignores `include`, so it is not passed.
print(data_anime['rating'].describe())

for t in anime_types:
    # Row mask and column selected in a single .loc call (no chained indexing).
    type_ratings = data_anime.loc[data_anime['type'] == t, 'rating']
    print('\nRating: ' + t)
    print(type_ratings.describe())
    # Label each curve with its type here so the legend does not have to be
    # patched afterwards by position.
    type_ratings.plot(kind="density", figsize=(15, 8), label=t)

L = plt.legend()
plt.show()
In [163]:
# Genre bookkeeping over the comma-separated 'genre' column of data_anime.
_genre = data_anime['genre']
_genre_list = []    # every genre occurrence, whitespace-stripped
genre_count = []    # one list of raw genre tokens per title
unique_genre = []   # genres of titles that list exactly one genre

# Missing genres (NaN) are skipped explicitly instead of being swallowed by a
# bare `except`, which would also have hidden unrelated errors.
for g in _genre.dropna():
    gs = g.split(',')
    if len(gs) == 1:
        unique_genre.extend(gs)
    _genre_list.extend(s.strip() for s in gs)
    genre_count.append(gs)

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(sorted(set(unique_genre)))
print(len(set(unique_genre)))
print(len(data_anime))
print(len(set(_genre_list)))
print(sorted(set(_genre_list)))
print("")
# Genres that never appear as the only genre of a title.
print(set(_genre_list) - set(unique_genre))

# Histogram of the number of genres per title. A list is built eagerly because
# `map` returns a lazy iterator on Python 3.
pd.DataFrame([len(tokens) for tokens in genre_count]).plot(kind='hist', bins=15)
plt.show()
In [29]:
# Bar chart of how often each genre occurs, most frequent first.
# Note: this rebinds `genre_count` to a GroupBy object keyed on the genre name.
genre_count = pd.DataFrame(_genre_list).groupby(0)
(
    genre_count
    .size()
    .sort_values(ascending=False)
    .plot.bar(width=0.9, figsize=(15, 4))
)
plt.show()
In [30]:
# Running total of the 'members' column; the plot below does not use it.
members = data_anime['members'].cumsum()
# Line plot of the 'members' value at each quantile 0.00, 0.01, ..., 0.99.
data_anime['members'].quantile(np.arange(0.0, 1.0, 0.01)).plot.line()
plt.show()
In [32]:
# Box plot of the 'members' column on a log-scaled y axis.
data_anime['members'].plot.box(logy=True)
plt.show()
In [44]:
# Number of rating rows per user, as a one-column frame (column 0) sorted ascending.
movies = (
    data_rating
    .groupby('user_id')
    .size()
    .to_frame()
    .sort_values(by=0)
)
# Summary of users with fewer than 314 ratings; the value is discarded because
# it is not the last expression of the cell.
movies.loc[movies[0] < 314].describe()
# Histogram of the same users with 313 bins.
movies.loc[movies[0] < 314].plot.hist(bins=313)
plt.show()
In [196]:
def normalize(df_user_profiles):
    """Min-max scale each user's genre scores to [0, 1] and divide ``avg_genre`` by 10.

    Parameters
    ----------
    df_user_profiles : pandas.DataFrame
        One row per user. The first column must be ``user_id``, the last
        column must be ``avg_genre``, and every column in between is treated
        as a numeric genre score (the layout built by ``get_user_profile``).

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and column order. Genre scores are
        rescaled per row as ``(x - row_min) / (row_max - row_min)``; a row
        whose scores are all equal becomes all zeros. The input frame is not
        modified.
    """
    genre_columns = df_user_profiles.columns[1:-1]
    x = df_user_profiles.iloc[:, 1:-1].values.astype(float)

    if x.size:
        # Scale per row (per user), replacing the sklearn MinMaxScaler that the
        # original applied to the transposed matrix.
        row_min = x.min(axis=1, keepdims=True)
        row_range = x.max(axis=1, keepdims=True) - row_min
        # Avoid 0/0 for constant rows: treat a zero range as 1 so they map to 0.
        row_range[row_range == 0] = 1.0
        x_scaled = (x - row_min) / row_range
    else:
        # No genre columns or no rows: nothing to scale.
        x_scaled = x

    # Reuse the input index so the concat below aligns row for row instead of
    # producing NaN-padded rows when the index is not 0..n-1.
    scaled = pd.DataFrame(x_scaled, columns=genre_columns,
                          index=df_user_profiles.index)
    _df_user_profiles = pd.concat([df_user_profiles[['user_id']], scaled], axis=1)
    _df_user_profiles['avg_genre'] = df_user_profiles['avg_genre'] / 10.0
    return _df_user_profiles
def get_user_profile(user_id, df_rating, data_anime):
    """Build a one-row genre profile for a single user.

    Parameters
    ----------
    user_id : scalar
        Value matched against ``df_rating['user_id']``.
    df_rating : pandas.DataFrame
        Ratings table; the ``user_id`` and ``anime_id`` columns are read.
    data_anime : pandas.DataFrame
        Anime table; the ``anime_id`` column and the ``", "``-separated
        ``genre`` column are read.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``user_id``, one column per genre, and
        ``avg_genre``. Each genre column holds the squared number of the
        user's rated titles carrying that genre. ``avg_genre`` is the mean
        number of genres per rated title, or 0.0 when the user has no ratings.
    """
    # One 0/1 indicator column per genre. Wrapping this in pd.get_dummies, as
    # the original did, was a no-op because the indicators are already numeric.
    genre_vectors = data_anime['genre'].str.get_dummies(sep=", ")
    genre_columns = list(genre_vectors.columns)
    anime_vectors = pd.concat([data_anime['anime_id'], genre_vectors], axis=1)

    # Left join keeps rated titles that are missing from data_anime; their
    # genre cells are NaN and contribute nothing to the sums below.
    rated = df_rating.loc[df_rating['user_id'] == user_id, ['anime_id']]
    user_genres = pd.merge(rated, anime_vectors, how='left', on='anime_id')[genre_columns]

    # Count the 1's per genre directly. The original masked the value_counts
    # frame with a mask built from a different index, so the shapes disagreed.
    genre_counts = (user_genres == 1).sum(axis=0).astype(float)

    genres_per_title = user_genres.sum(axis=1)
    if len(genres_per_title):
        avg_genre = genres_per_title.sum() / float(len(genres_per_title))
    else:
        # A user with no ratings: avoid dividing by zero.
        avg_genre = 0.0

    df_user_sum = (genre_counts ** 2).to_frame().T
    df_user_sum.insert(0, 'user_id', user_id)
    df_user_sum['avg_genre'] = avg_genre
    return df_user_sum
In [195]:
# Build the genre profile of user 1 from the training ratings, then show it
# before and after normalization.
profile = get_user_profile(1, train_rating, data_anime)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(profile)
print(normalize(profile))
In [ ]: