In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 16:48:11 2015

@author: ruifpmaia
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import os

%matplotlib inline

In [2]:
def processOutput(path, dataset, train_epo):
    print "*********************************"
    print "Analysing dataset [%s] results on [%s]" % (dataset, path )
    print "*********************************"
    rmse = {}
    mae = {}
    min_rmse = {}
    rmse_train = {}
    min_k = 0 # minimun RMSE on the test
    # For each k...
    #for k in [5,10,15,20,25]:
    #for k in [5,25,50,100]:
    for k in [5,25]:
        #print "Getting RSME for k=" + str(k)
        extension = "*.k" + str(k) + ".csv"
        test_count = 0;
        # print extension
        rmse[k] = 0.0
        mae[k] = 0.0
        rmse_train[k] = {}
        min_rmse[k] = {'min':0, 'idx':0, 'file':''}
        for ix in range(train_epo):
            rmse_train[k][ix]=0            
        # Analyse each file...
        for filename in glob.glob(os.path.join(path, extension)):
            #print filename
            if (filename.find(dataset) >= 0):        
                # read results file
                df = pd.read_csv(filename, usecols=[0], sep='\t')
                mae_col = pd.read_csv(filename, usecols=[1], sep='\t')
                # get last RMSE and add it to the total rmse
                rmse[k] += df.values[train_epo-1]
                mae[k] += mae_col.values[train_epo-1]
                train_iter_idx = 0
                test_count += 1
                # add iteration values to dic                 
                for iter_val in range(train_epo):
                    rmse_train[k][train_iter_idx] += df.values[train_iter_idx]
                    if (min_rmse[k]['min'] == 0):
                        min_rmse[k]['min'] = df.values[train_iter_idx]
                        min_rmse[k]['idx'] = train_iter_idx
                        min_rmse[k]['file'] = filename
                    elif (min_rmse[k]['min'] > df.values[train_iter_idx]):
                        min_rmse[k]['min'] = df.values[train_iter_idx]   
                        min_rmse[k]['idx'] = train_iter_idx
                        min_rmse[k]['file'] = filename
                    train_iter_idx += 1     
        #get the average of RMSE for K = n
        rmse[k] /= test_count
        mae[k] /= test_count
        if (k == 5):
            min_k = 5
        elif (rmse[k] < rmse[min_k]):
            min_k = k
        train_iter_idx = 0
        for iter_val in rmse_train[k]:
            rmse_train[k][train_iter_idx] /= test_count
            train_iter_idx += 1        
        print "Average RMSE for k=" + str(k) + ": %.4f" % rmse[k]
    return (rmse_train, rmse, mae, min_k)


def Analyse(path, dataset_list, train_epo):
    legend = []
    res_dic = {}
    plt.gca().set_color_cycle(['red', 'green', 'blue', 'yellow', 'gray'])
    x = np.arange(train_epo)
    for dataset in dataset_list:
        avgres = 0.0
        res_dic[dataset] = {}
        (rmse_train, rmse, mae, min_k) = processOutput(path, dataset, train_epo)
        plt.plot(x, rmse_train[min_k].values())
        legend.append("%s[k%d] RMSE:%.4f MAE:%.4f" % (dataset, min_k, rmse[min_k], mae[min_k]))
        res_dic[dataset][str('FM-k%d'% min_k)] = rmse[min_k][0]
    fig = plt.gcf()
    fig.set_size_inches(13,9)
    plt.grid(True)
    plt.legend(legend, loc='upper left')
    plt.show()
    plt.draw()
    return 0


def Baseline(dataset):
    try:
        avgres_fn = dataset + ".rmse.avg"
        with open(avgres_fn) as resf:
            avgres = float(resf.readline())
    except IOError:
        print "No RMSE result using AVG for Prediction" 
    try:
        maeres_fn = dataset + ".mae.avg"
        with open(maeres_fn) as resf:
            maeres = float(resf.readline())
    except IOError:
        print "No MAE result using AVG for Prediction"
    print ("%s\nRMSE-AVG:%.4f\nMAE-AVG:%.4f" % (dataset, float(avgres), float(maeres)))

Kochbar73k Classification


In [3]:
with open('kochbar\\Data73k\\kochbar_classification', 'r') as fin:
    print fin.read()
print '-------------------'
print 'kochbar86k_ds - Rating + Users + Item\nkochbar86k_ds2 - Rating + Users + Item + AvgRatUser\nkochbar86k_ds3 - Rating + Users + Item + AvgRatUser + StdDevUser\nkochbar86k_ds4 - Rating + Users + Item + AvgRatItem\nkochbar86k_ds5 - Rating + Users + Item + AvgRatItem + StdDevItem\nkochbar86k_ds6 - Rating + Users + Item + AvgRatUser + AvgRatItem\nkochbar86k_ds7 - Rating + Users + Item + AvgRatUser + StdDevUser + AvgRatItem + StdDevItem\n'
print '------------------- BASELINE (AVG) -----------'
Baseline('kochbar\\Output73k\\kochbar_ds')
print '-------------------'


Number of users:1447
Number of items:15170
Number of events:72922
Number of ingredients:36979
Number of cuisine types:146
Number of dietary groups:1
Avg. rating value:4.99
Avg. number of ratings per user:50.40
Avg. number of ratings per item:4.81
Avg. number of ingredients per item:57.59
Avg. number of cuisine type per item:4.81
Avg. number of dietary groups per item:4.81
Sparsity on the ratings matrix:0.332%

-------------------
kochbar86k_ds - Rating + Users + Item
kochbar86k_ds2 - Rating + Users + Item + AvgRatUser
kochbar86k_ds3 - Rating + Users + Item + AvgRatUser + StdDevUser
kochbar86k_ds4 - Rating + Users + Item + AvgRatItem
kochbar86k_ds5 - Rating + Users + Item + AvgRatItem + StdDevItem
kochbar86k_ds6 - Rating + Users + Item + AvgRatUser + AvgRatItem
kochbar86k_ds7 - Rating + Users + Item + AvgRatUser + StdDevUser + AvgRatItem + StdDevItem

------------------- BASELINE (AVG) -----------
kochbar\Output73k\kochbar_ds
RMSE-AVG:0.1369
MAE-AVG:0.0137
-------------------

Kochbar 73k Results Analysis - MCMC - 100 iter-5Fold


In [4]:
Analyse('kochbar\\Output73k', ['kochbar_ds.','kochbar_ds2.','kochbar_ds3.','kochbar_ds4.','kochbar_ds5.','kochbar_ds6.','kochbar_ds7.','kochbar_ds8.','kochbar_ds9.','kochbar_ds10.','kochbar_ds11.','kochbar_ds12.','kochbar_ds13.','kochbar_ds14.','kochbar_ds15.','kochbar_ds16.','kochbar_ds17.' ], 100)
print 'kochbar_ds - Rating + Users + Item\nkochbar_ds2 - Rating + Users + Item + AvgRatUser\nkochbar_ds3 - Rating + Users + Item + AvgRatUser + StdDevUser\nkochbar_ds4 - Rating + Users + Item + AvgRatItem\nkochbar_ds5 - Rating + Users + Item + AvgRatItem + StdDevItem\nkochbar_ds6 - Rating + Users + Item + AvgRatUser + AvgRatItem\nkochbar_ds7 - Rating + Users + Item + AvgRatUser + StdDevUser + AvgRatItem + StdDevItem\nkochbar_ds8	- Rating + Users + Item + Ingredients\nkochbar_ds9	- Rating + Users + Item + Ingredients + Categories\nkochbar_ds10 - Rating + Users + Item + Ingredients + Categories + Groups\nkochbar_ds11 - Rating + Users + Item + Ingredients + AvgRatUser + AvgRatItem\nkochbar_ds12 - Rating + Users + Item + Ingredients + Categories + AvgRatUser + AvgRatItem\nkochbar_ds13 - Rating + Users + Item + Ingredients + Categories + Groups + AvgRatUser + AvgRatItem\nkochbar_ds14 - Rating + Users + Item + Categories\nkochbar_ds15 - Rating + Users + Item + Groups\nkochbar_ds16 - Rating + Users + Item + Categories + AvgRatUser + AvgRatItem\nkochbar_ds17 - Rating + Users + Item + Groups + AvgRatUser + AvgRatItem\n'


*********************************
Analysing dataset [kochbar_ds.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.5997
Average RMSE for k=25: 0.6074
*********************************
Analysing dataset [kochbar_ds2.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1351
Average RMSE for k=25: 0.1344
*********************************
Analysing dataset [kochbar_ds3.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1356
Average RMSE for k=25: 0.1358
*********************************
Analysing dataset [kochbar_ds4.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1398
Average RMSE for k=25: 0.1396
*********************************
Analysing dataset [kochbar_ds5.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1447
Average RMSE for k=25: 0.1454
*********************************
Analysing dataset [kochbar_ds6.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1355
Average RMSE for k=25: 0.1384
*********************************
Analysing dataset [kochbar_ds7.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1403
Average RMSE for k=25: 0.1420
*********************************
Analysing dataset [kochbar_ds8.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.2228
Average RMSE for k=25: 0.2393
*********************************
Analysing dataset [kochbar_ds9.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1540
Average RMSE for k=25: 0.2399
*********************************
Analysing dataset [kochbar_ds10.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1555
Average RMSE for k=25: 0.2814
*********************************
Analysing dataset [kochbar_ds11.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1713
Average RMSE for k=25: 0.2115
*********************************
Analysing dataset [kochbar_ds12.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1538
Average RMSE for k=25: 0.2461
*********************************
Analysing dataset [kochbar_ds13.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1550
Average RMSE for k=25: 0.3600
*********************************
Analysing dataset [kochbar_ds14.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1321
Average RMSE for k=25: 0.1293
*********************************
Analysing dataset [kochbar_ds15.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1622
Average RMSE for k=25: 0.1615
*********************************
Analysing dataset [kochbar_ds16.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1341
Average RMSE for k=25: 0.1385
*********************************
Analysing dataset [kochbar_ds17.] results on [kochbar\Output73k]
*********************************
Average RMSE for k=5: 0.1444
Average RMSE for k=25: 0.1460
kochbar_ds - Rating + Users + Item
kochbar_ds2 - Rating + Users + Item + AvgRatUser
kochbar_ds3 - Rating + Users + Item + AvgRatUser + StdDevUser
kochbar_ds4 - Rating + Users + Item + AvgRatItem
kochbar_ds5 - Rating + Users + Item + AvgRatItem + StdDevItem
kochbar_ds6 - Rating + Users + Item + AvgRatUser + AvgRatItem
kochbar_ds7 - Rating + Users + Item + AvgRatUser + StdDevUser + AvgRatItem + StdDevItem
kochbar_ds8	- Rating + Users + Item + Ingredients
kochbar_ds9	- Rating + Users + Item + Ingredients + Categories
kochbar_ds10 - Rating + Users + Item + Ingredients + Categories + Groups
kochbar_ds11 - Rating + Users + Item + Ingredients + AvgRatUser + AvgRatItem
kochbar_ds12 - Rating + Users + Item + Ingredients + Categories + AvgRatUser + AvgRatItem
kochbar_ds13 - Rating + Users + Item + Ingredients + Categories + Groups + AvgRatUser + AvgRatItem
kochbar_ds14 - Rating + Users + Item + Categories
kochbar_ds15 - Rating + Users + Item + Groups
kochbar_ds16 - Rating + Users + Item + Categories + AvgRatUser + AvgRatItem
kochbar_ds17 - Rating + Users + Item + Groups + AvgRatUser + AvgRatItem

<matplotlib.figure.Figure at 0xcdd8710>

User Rating Average and Standard Deviation Histograms


In [5]:
user_stats = pd.read_csv('kochbar\\Data73k\\kochbar_ds.user.avgstd', usecols=[0,1], sep='\t')
user_stats.hist(alpha=0.5, bins=100)
fig = plt.gcf()
fig.set_size_inches(13,9)
plt.show()


Item Rating Average and Standard Deviation Histograms


In [6]:
user_stats = pd.read_csv('kochbar\\Data73k\\kochbar_ds.item.avgstd', usecols=[0,1], sep='\t')
user_stats.hist(alpha=0.5, bins=100)
fig = plt.gcf()
fig.set_size_inches(13,9)
plt.show()



In [ ]: