In [2]:
import sys
import pandas as pd
import csv
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
import Orange
def get_user_profile(user_id, df_rating, df_a_fatures):
    """Build a genre-count profile for one user.

    Parameters
    ----------
    user_id : id of the user to profile.
    df_rating : DataFrame with 'user_id', 'anime_id' and 'rating' columns.
    df_a_fatures : DataFrame of anime features — 'anime_id' plus 0/1 genre
        dummy columns. (Parameter name is a typo of "features" but is kept
        to preserve the public signature.)

    Returns
    -------
    One-row DataFrame with, per genre column, the number of rated animes
    carrying that genre, plus a 'user_id' column.
    """
    df_user = df_rating.loc[df_rating['user_id'] == user_id]
    # Attach each rated anime's genre vector, then drop the join keys.
    df_merged = pd.merge(df_user, df_a_fatures, how='left',
                         left_on='anime_id', right_on='anime_id').drop(['anime_id', 'rating'], axis=1)
    # Count only 1's. The original `apply(pd.Series.value_counts).loc[df_merged.index == 1]`
    # masked the value_counts frame with a boolean array sized to df_merged —
    # a length mismatch whenever the user's rating count differs from the
    # number of distinct cell values. Counting equality to 1 directly is the
    # intended computation and is always well-defined (never produces NaN,
    # so the old fillna(0) is no longer needed).
    df_user_sum = (df_merged == 1).sum().to_frame().T
    # Explicit column assignment instead of attribute-style `.user_id = ...`.
    df_user_sum['user_id'] = user_id
    return df_user_sum
#
def get_user_profiles(df_animes_vector, df_rating, n_users=50):
    """Build genre-count profiles for the first `n_users` unique users.

    Parameters
    ----------
    df_animes_vector : DataFrame of 'anime_id' + 0/1 genre dummy columns.
    df_rating : DataFrame with 'user_id', 'anime_id', 'rating' columns.
    n_users : how many of the first unique user ids to profile
        (default 50, matching the previous hard-coded value).

    Returns
    -------
    DataFrame with one row per user (fresh RangeIndex).
    """
    users = list(df_rating['user_id'].unique())[:n_users]
    # Collect one-row frames and concatenate once: DataFrame.append was
    # removed in pandas 2.x and append-in-a-loop is quadratic anyway.
    profiles = [get_user_profile(u, df_rating, df_animes_vector) for u in users]
    if not profiles:
        return pd.DataFrame()
    return pd.concat(profiles, ignore_index=True)
#
def normalize(df_user_profiles):
    """Min-max scale each user's profile row to [0, 1] across its genre columns.

    The first column ('user_id') is passed through untouched; every other
    column is treated as part of the per-row profile.

    Returns a new DataFrame; the input is not modified.
    """
    # The original transposed into sklearn's MinMaxScaler and back, then
    # concatenated a default-RangeIndex frame with the original-index
    # 'user_id' column — which silently misaligns whenever the input index
    # is not 0..n-1. Per-row min-max in pandas preserves the index and
    # drops the sklearn round-trip.
    x = df_user_profiles.iloc[:, 1:].astype(float)
    row_min = x.min(axis=1)
    # A zero range (constant row) maps to all zeros, matching
    # MinMaxScaler's handling of zero-variance features.
    row_range = (x.max(axis=1) - row_min).replace(0, 1)
    x_scaled = x.sub(row_min, axis=0).div(row_range, axis=0)
    return pd.concat([df_user_profiles['user_id'], x_scaled], axis=1)
#
def get_userids_by_indices(indices, df_user_prof_norm):
    """Map row indices (e.g. from NearestNeighbors.kneighbors) back to user ids.

    Parameters
    ----------
    indices : iterable of row labels into `df_user_prof_norm`.
    df_user_prof_norm : DataFrame containing a 'user_id' column.

    Returns
    -------
    List of the 'user_id' values at the given rows, in order.
    """
    return [df_user_prof_norm.loc[idx]['user_id'] for idx in indices]
#
In [3]:
# --- Load raw data and build a 0/1 genre-vector representation per anime ---
# Data files are expected one directory up (Kaggle anime recommendation dataset).
file_anime = "../anime.csv"
file_rating = "../rating.csv"
df_rating = pd.read_csv(file_rating)
df_animes = pd.read_csv(file_anime)
# 'genre' holds a comma-separated string; str.get_dummies expands it into one
# 0/1 column per genre. NOTE(review): the outer pd.get_dummies is redundant —
# str.get_dummies already returns dummy columns — but it is harmless here.
df_animes_genres = pd.get_dummies(df_animes['genre'].str.get_dummies(sep=", ")) # creates genre vectors
df_animes_vector = pd.concat([df_animes['anime_id'], df_animes_genres], axis=1) # anime_id + genre vector
# Get user profiles; then normalize
df_user_profiles = get_user_profiles(df_animes_vector, df_rating)
df_user_prof_norm = normalize(df_user_profiles)
In [4]:
# --- Find the k most similar user profiles via nearest neighbours ---
# find closest k user profiles
k = 10
# Fit on all genre columns (everything except the leading 'user_id' column).
nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(df_user_prof_norm.iloc[:,1:])
user_id = 1
user_prof = df_user_prof_norm[df_user_prof_norm['user_id'] == user_id]
user_prof = user_prof.drop('user_id', axis=1)
# Get closest neighbours
# NOTE(review): the query user's own profile is part of the fitted data, so
# presumably the first neighbour returned is the user himself — confirm
# that is intended before using the neighbour list downstream.
distances, indices = nbrs.kneighbors(user_prof)
# get user_ids
# indices[0] are positional row indices into the fitted matrix; mapping them
# through df_user_prof_norm.loc assumes its index is the default RangeIndex.
uids = get_userids_by_indices(indices[0], df_user_prof_norm)
# Python 2 print statement — this notebook targets Python 2 / Orange 2.x.
print uids
In [19]:
# --- Export each neighbour's rated-anime list as one basket "transaction" ---
u_animes = []
for uid in uids:
    # one transaction per neighbour: every anime_id that user has rated
    u_animes.append(df_rating[df_rating['user_id'] == uid]['anime_id'].tolist())
# with open('anime_trans.csv', 'wb') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerows(u_animes)
# NOTE(review): the transaction lists have different lengths, so DataFrame
# pads short rows with NaN and writes the ids as floats (e.g. "20.0") —
# verify Orange's .basket reader tolerates this; the commented csv.writerows
# approach above would write ragged rows without padding.
df = pd.DataFrame(u_animes)
df.to_csv('anime_trans.basket', index=False, header=False)
In [68]:
# --- Mine association rules from the neighbours' baskets (Orange 2.x API) ---
# Orange.associate.AssociationRulesInducer.max_item_sets = 20000
data = Orange.data.Table("anime_trans.basket") #Orange.data.Table("anime_trans.basket")
# support / confidence are the rule-mining thresholds; max_item_sets caps the
# number of candidate itemsets the sparse inducer will generate.
rules = Orange.associate.AssociationRulesSparseInducer(data, support = 0.7, confidence = 0.6,
                                                       max_item_sets = 1000000)
# Print one rule per line: support, confidence, antecedent -> consequent.
print "%4s %4s %s" % ("Supp", "Conf", "Rule")
for r in rules:
    print "%4.2f %4.1f %s %s" % (r.support, r.confidence, r.left, r.right)
# Orange.associate.AssociationRulesSparseInducer.get_itemsets(rules)
In [70]:
# Pretty-print the mined rules with Orange's built-in helper.
Orange.associate.print_rules(rules
)
In [ ]: