In [1]:
import sys
import pandas as pd
import numpy as np
import csv
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
import Orange

def get_user_profile(user_id, df_rating, df_a_fatures):
    """Build one user's genre-preference profile.

    Sums the genre indicator vectors of every anime the user has rated,
    and records the average number of genre flags per rated anime.

    Parameters:
        user_id      -- id of the user to profile
        df_rating    -- ratings frame with user_id / anime_id / rating columns
        df_a_fatures -- anime genre-vector frame keyed by anime_id

    Returns a Series: one count per genre column, plus user_id,
    rating (fixed at 10.0) and genre_count.
    """
    # To be used only if the user profiles file is not already created
    df_user = df_rating.loc[df_rating['user_id'] == user_id]
    df_merged = pd.merge(df_user, df_a_fatures, how='left', left_on='anime_id', right_on='anime_id').drop(['anime_id', 'rating'], axis=1)

    # Per-anime count of genre flags. anime_id/rating were already dropped
    # above; keeping them in the difference() list is defensive.
    genre_cols = df_merged.columns.difference(['user_id', 'anime_id', 'rating'])
    avg_genre = df_merged[genre_cols].sum(axis=1)

    # Column-wise sum turns per-anime 0/1 flags into per-genre counts.
    df_user_sum = df_merged.sum(axis=0)
    # Explicit label indexing instead of `df_user_sum.user_id = ...`:
    # attribute assignment on a Series silently creates a plain Python
    # attribute when the label is missing instead of setting the value.
    df_user_sum['user_id'] = user_id
    df_user_sum['rating'] = 10.0
    # Mean number of genres per rated anime; 0.0 for a user with no
    # ratings (the original divided by zero here).
    df_user_sum['genre_count'] = avg_genre.sum() / float(len(avg_genre)) if len(avg_genre) else 0.0

    return df_user_sum
#
def get_user_profiles(df_animes_vector, df_rating, n_users=50):
    """Build profiles for the first n_users distinct users in df_rating.

    Parameters:
        df_animes_vector -- anime_id + genre indicator columns
        df_rating        -- ratings frame with a user_id column
        n_users          -- how many users (in first-seen order) to profile

    Returns a DataFrame with one profile row per user.
    """
    # To be used only if the user profiles file is not already created

    # first n_users
    users = list(df_rating['user_id'].unique())[:n_users]

    # Collect the profile rows in a plain list and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and grew the frame
    # quadratically when called inside a loop.
    rows = []
    for i, u in enumerate(users, start=1):
        rows.append(get_user_profile(u, df_rating, df_animes_vector))
        # Progress report every 100 users, as before.
        if i % 100 == 0:
            print("Completed users:", i)

    # reset_index(drop=True) reproduces the old ignore_index=True behaviour.
    return pd.DataFrame(rows).reset_index(drop=True)
#
def normalize(df_user_profiles):
    """Min-max scale each user's profile row to the [0, 1] range.

    Parameters:
        df_user_profiles -- profiles frame whose LAST TWO columns are
            user_id and rating (all preceding columns are the genre
            counts plus genre_count).

    Returns a new frame with scaled genre columns, original user_id,
    genre_count divided by 10, and rating pinned to 1.0.
    """
    x = df_user_profiles.iloc[:,0:-2].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()

    # Transposed so each USER (row), not each genre, is scaled to [0, 1].
    x_scaled = min_max_scaler.fit_transform(x.T)

    # NOTE(review): columns.difference() returns a SORTED Index; the labels
    # only line up with the positional columns of x because the profile CSV
    # stores the genres alphabetically — TODO confirm against the header.
    # ('genre' in the exclusion list looks like a typo for 'genre_count',
    # but genre_count is overwritten below anyway.)
    df_scaled = pd.DataFrame(x_scaled.T, columns=df_user_profiles.columns.difference(['user_id','rating','genre']))

    df_scaled['user_id'] = df_user_profiles['user_id'].values
    # Vectorised division instead of map(): under Python 3 map() returns a
    # lazy iterator, not a list, which breaks this column assignment.
    df_scaled['genre_count'] = df_user_profiles['genre_count'].values / 10.0
    df_scaled['rating'] = 1.0

    return df_scaled
#
def get_userids_by_indices(indices, df_user_prof_norm):
    """Map positional neighbour indices back to their user_id values.

    Parameters:
        indices          -- iterable of row index labels
        df_user_prof_norm -- frame containing a 'user_id' column

    Returns the user ids in the same order the indices were supplied.
    """
    # Single-step .loc[row, col] lookup instead of chained indexing.
    return [df_user_prof_norm.loc[idx, 'user_id'] for idx in indices]
#

In [2]:
def get_collaborative_recommendations_per_user(user_id, k, df_user_prof_norm, df_rating):
    """Recommend up to 10 anime ids for user_id.

    Pipeline: (1) find the k nearest user profiles by genre vector,
    (2) write those neighbours' rated-anime lists to a .basket file,
    (3) mine association rules over the neighbours' transactions with
    Orange, (4) return right-hand-side animes the target user has not
    already rated, best rules first.

    NOTE(review): Python 2 only — relies on dict.iteritems(),
    indexable dict.keys(), binary-mode csv files, and the legacy
    Orange 2 `Orange.associate` API.
    """

    # find closest k user profiles
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(df_user_prof_norm.drop(['user_id','rating','genre_count'], axis=1))
    user_prof = df_user_prof_norm[df_user_prof_norm['user_id'] == user_id]
    user_prof = user_prof.drop(['user_id','rating','genre_count'], axis=1)

    # Get closest neighbours
    # The query row is itself in the fitted data, so it is expected to come
    # back as its own nearest match — callers pass k = wanted + 1
    # (presumably; TODO confirm with the k=11 call site).
    distances, indices = nbrs.kneighbors(user_prof)
    print("Closest neighbours identified!")

    # get user_ids
    uids = get_userids_by_indices(indices[0], df_user_prof_norm)

    # ------------------------------------------------------------
    # One basket line per neighbour: every anime id that user rated.
    # 'wb' is the Python 2 csv mode; Python 3 needs 'w', newline=''.
    u_animes = []
    for uid in uids:
        u_animes.append(df_rating[df_rating['user_id'] == uid]['anime_id'].tolist())
    with open('anime_trans.basket', 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(u_animes)
    # ------------------------------------------------------------

    # !!!!! Get the transactions directly from the list, not from the .basket file !!!!!

    # Get all training transactions

    data = Orange.data.Table("anime_trans.basket") #Orange.data.Table("anime_trans.basket")

    # This is the user we would like to recommend something for
    # (row 0 of the basket should be the query user, since the neighbour
    # order is preserved and the query matches itself first — TODO confirm)
    target_user = data[0]
    target_user_animes = data[0].get_metas(str).keys()

    # Drop the user's data from the transactions list
    data = data.get_items(range(1,len(data)))

    # Generate recommendation rules
    # Start strict and progressively relax until the inducer succeeds:
    # raise confidence towards 1.0 first, then raise support.
    support_threshold = 0.5
    confidence_threshold = 0.8
    rulesOK = False
    while rulesOK is False:
        try:
            rules = Orange.associate.AssociationRulesSparseInducer(data, support = support_threshold, confidence = confidence_threshold,
                                                                   max_item_sets = 100000)
            rulesOK = True
        except:
            # NOTE(review): a bare except also hides unrelated failures
            # (typos, KeyboardInterrupt), not just itemset-limit errors.
            print(support_threshold, confidence_threshold)
            if confidence_threshold == 1:
                support_threshold += 0.1
            else:
                confidence_threshold += 0.1
            

    # print "%4s\t %4s  %s %s" % ("AnimeId", "Lift", "Support", "Conf")

    # Bucket candidate rules by antecedent length (r.n_left), keeping only
    # single-consequent rules whose anime the target user has not rated.
    recommendations = {}
    for r in rules:

        # Compare the generated rules with a specific instance from the transactions list
        if(r.n_right==1):
            recommendation = str(r.right.get_metas(str).keys()[0])
            if recommendation not in target_user_animes:
                #if r.applies_left(target_user):
                try:
                    recommendations[r.n_left].append(r)
                except:
                    # First rule seen for this antecedent length.
                    recommendations[r.n_left] = []
                    recommendations[r.n_left].append(r)
                    # print "%4.2f %4.4f %s %s" % (r.support, r.confidence, r, r.lift)

    # Sort each bucket best-first: lift, then support, then confidence.
    user_recommendations = []
    for i, r in recommendations.iteritems():
        recommendations[i].sort(key=lambda x: (x.lift, x.support, x.confidence), reverse=True)

    # Prefer rules with longer antecedents (more specific evidence) and
    # stop once 10 distinct anime ids have been collected.
    for recommendation_length in sorted(recommendations.keys(), reverse=True):
        if len(user_recommendations) == 10:
            break
        for recommendation in recommendations[recommendation_length]:
            anime_id = str(recommendation.right.get_metas(str).keys()[0])
    #         print recommendation
    #         print anime_id, "\t", recommendation.lift, recommendation.support, recommendation.confidence
            if anime_id not in user_recommendations:
                user_recommendations.append(anime_id)
            if len(user_recommendations) == 10:
                break
    return user_recommendations
    # Orange.associate.AssociationRulesSparseInducer.get_itemsets(rules)
    # Orange.associate.AssociationRulesSparseInducer.get_itemsets(rules)

In [3]:
# Read the user profiles from the user_profiles_final.csv file
# (generated once by get_user_profiles; see the commented-out call in the
#  raw-data cell near the end of this notebook).

user_profiles = "raw/user_profiles_final.csv"
df_user_profiles = pd.read_csv(user_profiles)

# Training split of the user -> anime ratings.
file_rating = "raw/rating_train.csv"
df_rating = pd.read_csv(file_rating)

# Full id list, used by the batch-recommendation loop below.
users_ids = list(df_user_profiles['user_id'].unique())
df_user_prof_norm = normalize(df_user_profiles)

In [19]:
def clust(df_user_prof_norm, n_clusters=100, random_state=None):
    """Cluster the normalised user profiles with k-means.

    Parameters:
        df_user_prof_norm -- frame with genre columns plus
            user_id / rating / genre_count (dropped before fitting)
        n_clusters        -- number of clusters (default 100, as before)
        random_state      -- optional seed; the original call was unseeded,
            so labels varied between runs

    Prints the cluster centres and returns the per-row labels.
    """
    from sklearn.cluster import KMeans
    user_prof = df_user_prof_norm.drop(['user_id','rating','genre_count'], axis=1)

    # n_jobs and algorithm='auto' were removed from KMeans in recent
    # scikit-learn releases; the defaults perform the same clustering.
    # n_init=1 keeps the original single-restart behaviour.
    kmeans = KMeans(n_clusters=n_clusters, n_init=1, random_state=random_state)
    kmeans.fit(user_prof)
    print(kmeans.cluster_centers_)
    return(kmeans.labels_)

In [20]:
clust(df_user_prof_norm)


[[  9.09128263e-01   4.57965925e-01   1.07265688e-02 ...,   3.22109893e-02
    9.99599169e-04   8.41595709e-04]
 [  5.78211079e-01   5.10521080e-01   2.88038679e-03 ...,   2.56229343e-02
    4.04975032e-03   5.53003865e-04]
 [  9.19007630e-01   4.24750777e-01   1.65608468e-02 ...,   4.25312528e-02
    6.52309804e-03   3.94227103e-04]
 ..., 
 [  9.97384935e-01   4.08449522e-01   8.66169658e-03 ...,   6.82887755e-02
    3.30599247e-03   9.25300723e-04]
 [  9.99822758e-01   3.79100585e-01   5.53230179e-03 ...,   6.07131735e-02
    1.01148592e-03   2.46692039e-04]
 [  9.16414808e-01   3.49484270e-01   1.44481115e-03 ...,   7.33968629e-02
    4.32256152e-04   1.00597956e-03]]
Out[20]:
array([23, 11, 11, ..., 83, 34, 15])

In [22]:
test = Out[20]

In [24]:
test.shape


Out[24]:
(60785L,)

In [29]:
df_user_prof_norm["ClustLabel"] = pd.Series(test, index=df_user_prof_norm.index)

In [31]:
df_user_prof_norm.columns


Out[31]:
Index([u'Action', u'Adventure', u'Cars', u'Comedy', u'Dementia', u'Demons',
       u'Drama', u'Ecchi', u'Fantasy', u'Game', u'Harem', u'Hentai',
       u'Historical', u'Horror', u'Josei', u'Kids', u'Magic', u'Martial Arts',
       u'Mecha', u'Military', u'Music', u'Mystery', u'Parody', u'Police',
       u'Psychological', u'Romance', u'Samurai', u'School', u'Sci-Fi',
       u'Seinen', u'Shoujo', u'Shoujo Ai', u'Shounen', u'Shounen Ai',
       u'Slice of Life', u'Space', u'Sports', u'Super Power', u'Supernatural',
       u'Thriller', u'Vampire', u'Yaoi', u'Yuri', u'genre_count', u'user_id',
       u'rating', u'ClustLabel'],
      dtype='object')

In [ ]:
# Batch-generate collaborative recommendations for the first 100 users and
# append one row per user ([user_id, recommendation_list]) to collaborative.csv.
recommendations = {}

# NOTE(review): 'ab' is the Python 2 binary-append mode required by
# csv.writer; under Python 3 this would need open(..., 'a', newline='').
with open('collaborative.csv', 'ab') as csv_file:
    writer = csv.writer(csv_file)
    for i in users_ids[:100]:
        print ("Results for user %4d\t " % (i))
        # k=11 so the user's own profile (its nearest match) still leaves
        # 10 real neighbours — presumably; TODO confirm.
        rec = get_collaborative_recommendations_per_user(user_id=i, k=11, df_user_prof_norm=df_user_prof_norm, df_rating=df_rating)
        recommendations[i] = rec
        # The whole recommendation list is serialised into one CSV field.
        writer.writerow([i, recommendations[i]])


Results for user    1	 
Closest neighbours identified!
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
(0.6, 1.0)
Results for user    3	 
Closest neighbours identified!
Results for user    4	 
Closest neighbours identified!
Results for user    5	 
Closest neighbours identified!
Results for user    6	 
Closest neighbours identified!
Results for user    7	 
Closest neighbours identified!
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
Results for user    8	 
Closest neighbours identified!
Results for user   11	 
Closest neighbours identified!
Results for user   12	 
Closest neighbours identified!
Results for user   13	 
Closest neighbours identified!
Results for user   14	 
Closest neighbours identified!
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
Results for user   16	 
Closest neighbours identified!
Results for user   17	 
Closest neighbours identified!
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
Results for user   18	 
Closest neighbours identified!
Results for user   19	 
Closest neighbours identified!
Results for user   20	 
Closest neighbours identified!
Results for user   21	 
Closest neighbours identified!
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
(0.6, 1.0)
Results for user   22	 
Closest neighbours identified!
Results for user   23	 
Closest neighbours identified!
Results for user   24	 
Closest neighbours identified!
Results for user   25	 
Closest neighbours identified!
Results for user   26	 
Closest neighbours identified!
Results for user   27	 
Closest neighbours identified!
Results for user   28	 
Closest neighbours identified!
Results for user   29	 
Closest neighbours identified!
Results for user   30	 
Closest neighbours identified!
Results for user   31	 
Closest neighbours identified!
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
Results for user   32	 
Closest neighbours identified!
Results for user   33	 
Closest neighbours identified!
Results for user   34	 
Closest neighbours identified!
Results for user   35	 
Closest neighbours identified!
Results for user   37	 
Closest neighbours identified!
Results for user   38	 
Closest neighbours identified!
Results for user   39	 
Closest neighbours identified!
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
Results for user   40	 
Closest neighbours identified!
Results for user   41	 

In [ ]:
# Generate the user profiles from the raw data

file_anime = "raw/anime.csv"
file_rating = "raw/rating_train.csv"

df_rating = pd.read_csv(file_rating)
df_animes = pd.read_csv(file_anime)
# Series.str.get_dummies already yields the 0/1 genre indicator frame;
# the outer pd.get_dummies() wrapper was a no-op on these all-numeric
# columns and has been dropped.
df_animes_genres = df_animes['genre'].str.get_dummies(sep=", ") # creates genre vectors
users_ids = list(df_rating['user_id'].unique())
df_animes_vector = pd.concat([df_animes['anime_id'], df_animes_genres], axis=1) # anime_id + genre vector

# Get user profiles; then normalize 

# df_user_profiles = get_user_profiles(df_animes_vector, df_rating, n_users=len(users_ids))
# df_user_profiles.to_csv("user_profiles_final.csv", index=False, encoding='UTF-8')

In [12]:
test_reading = pd.io.parsers.read_csv('collaborative_test.csv', header=None)

In [17]:
type(test_reading[1])


Out[17]:
pandas.core.series.Series

In [ ]: