In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 17 16:48:11 2015
@author: ruifpmaia
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
%matplotlib inline
In [2]:
def processOutput(path, dataset, train_epo):
print "*********************************"
print "Analysing dataset [%s] results on [%s]" % (dataset, path )
print "*********************************"
rmse = {}
mae = {}
min_rmse = {}
rmse_train = {}
min_k = 0 # minimun RMSE on the test
# For each k...
#for k in [5,10,15,20,25]:
#for k in [5,25,50,100]:
for k in [5,25]:
#print "Getting RSME for k=" + str(k)
extension = "*.k" + str(k) + ".csv"
test_count = 0;
# print extension
rmse[k] = 0.0
mae[k] = 0.0
rmse_train[k] = {}
min_rmse[k] = {'min':0, 'idx':0, 'file':''}
for ix in range(train_epo):
rmse_train[k][ix]=0
# Analyse each file...
for filename in glob.glob(os.path.join(path, extension)):
#print filename
if (filename.find(dataset) >= 0):
# read results file
df = pd.read_csv(filename, usecols=[0], sep='\t')
mae_col = pd.read_csv(filename, usecols=[1], sep='\t')
# get last RMSE and add it to the total rmse
rmse[k] += df.values[train_epo-1]
mae[k] += mae_col.values[train_epo-1]
train_iter_idx = 0
test_count += 1
# add iteration values to dic
for iter_val in range(train_epo):
rmse_train[k][train_iter_idx] += df.values[train_iter_idx]
if (min_rmse[k]['min'] == 0):
min_rmse[k]['min'] = df.values[train_iter_idx]
min_rmse[k]['idx'] = train_iter_idx
min_rmse[k]['file'] = filename
elif (min_rmse[k]['min'] > df.values[train_iter_idx]):
min_rmse[k]['min'] = df.values[train_iter_idx]
min_rmse[k]['idx'] = train_iter_idx
min_rmse[k]['file'] = filename
train_iter_idx += 1
#get the average of RMSE for K = n
rmse[k] /= test_count
mae[k] /= test_count
if (k == 5):
min_k = 5
elif (rmse[k] < rmse[min_k]):
min_k = k
train_iter_idx = 0
for iter_val in rmse_train[k]:
rmse_train[k][train_iter_idx] /= test_count
train_iter_idx += 1
print "Average RMSE for k=" + str(k) + ": %.4f" % rmse[k]
return (rmse_train, rmse, mae, min_k)
def Analyse(path, dataset_list, train_epo):
legend = []
res_dic = {}
plt.gca().set_color_cycle(['red', 'green', 'blue', 'yellow', 'gray'])
x = np.arange(train_epo)
for dataset in dataset_list:
avgres = 0.0
res_dic[dataset] = {}
(rmse_train, rmse, mae, min_k) = processOutput(path, dataset, train_epo)
plt.plot(x, rmse_train[min_k].values())
legend.append("%s[k%d] RMSE:%.4f MAE:%.4f" % (dataset, min_k, rmse[min_k], mae[min_k]))
res_dic[dataset][str('FM-k%d'% min_k)] = rmse[min_k][0]
fig = plt.gcf()
fig.set_size_inches(13,9)
plt.grid(True)
plt.legend(legend, loc='upper left')
plt.show()
plt.draw()
return 0
def Baseline(dataset):
try:
avgres_fn = dataset + ".rmse.avg"
with open(avgres_fn) as resf:
avgres = float(resf.readline())
except IOError:
print "No RMSE result using AVG for Prediction"
try:
maeres_fn = dataset + ".mae.avg"
with open(maeres_fn) as resf:
maeres = float(resf.readline())
except IOError:
print "No MAE result using AVG for Prediction"
print ("%s\nRMSE-AVG:%.4f\nMAE-AVG:%.4f" % (dataset, float(avgres), float(maeres)))
In [3]:
with open('kochbar\\Data73k\\kochbar_classification', 'r') as fin:
print fin.read()
print '-------------------'
print 'kochbar86k_ds - Rating + Users + Item\nkochbar86k_ds2 - Rating + Users + Item + AvgRatUser\nkochbar86k_ds3 - Rating + Users + Item + AvgRatUser + StdDevUser\nkochbar86k_ds4 - Rating + Users + Item + AvgRatItem\nkochbar86k_ds5 - Rating + Users + Item + AvgRatItem + StdDevItem\nkochbar86k_ds6 - Rating + Users + Item + AvgRatUser + AvgRatItem\nkochbar86k_ds7 - Rating + Users + Item + AvgRatUser + StdDevUser + AvgRatItem + StdDevItem\n'
print '------------------- BASELINE (AVG) -----------'
Baseline('kochbar\\Output73k\\kochbar_ds')
print '-------------------'
In [4]:
Analyse('kochbar\\Output73k', ['kochbar_ds.','kochbar_ds2.','kochbar_ds3.','kochbar_ds4.','kochbar_ds5.','kochbar_ds6.','kochbar_ds7.','kochbar_ds8.','kochbar_ds9.','kochbar_ds10.','kochbar_ds11.','kochbar_ds12.','kochbar_ds13.','kochbar_ds14.','kochbar_ds15.','kochbar_ds16.','kochbar_ds17.' ], 100)
print 'kochbar_ds - Rating + Users + Item\nkochbar_ds2 - Rating + Users + Item + AvgRatUser\nkochbar_ds3 - Rating + Users + Item + AvgRatUser + StdDevUser\nkochbar_ds4 - Rating + Users + Item + AvgRatItem\nkochbar_ds5 - Rating + Users + Item + AvgRatItem + StdDevItem\nkochbar_ds6 - Rating + Users + Item + AvgRatUser + AvgRatItem\nkochbar_ds7 - Rating + Users + Item + AvgRatUser + StdDevUser + AvgRatItem + StdDevItem\nkochbar_ds8 - Rating + Users + Item + Ingredients\nkochbar_ds9 - Rating + Users + Item + Ingredients + Categories\nkochbar_ds10 - Rating + Users + Item + Ingredients + Categories + Groups\nkochbar_ds11 - Rating + Users + Item + Ingredients + AvgRatUser + AvgRatItem\nkochbar_ds12 - Rating + Users + Item + Ingredients + Categories + AvgRatUser + AvgRatItem\nkochbar_ds13 - Rating + Users + Item + Ingredients + Categories + Groups + AvgRatUser + AvgRatItem\nkochbar_ds14 - Rating + Users + Item + Categories\nkochbar_ds15 - Rating + Users + Item + Groups\nkochbar_ds16 - Rating + Users + Item + Categories + AvgRatUser + AvgRatItem\nkochbar_ds17 - Rating + Users + Item + Groups + AvgRatUser + AvgRatItem\n'
In [5]:
user_stats = pd.read_csv('kochbar\\Data73k\\kochbar_ds.user.avgstd', usecols=[0,1], sep='\t')
user_stats.hist(alpha=0.5, bins=100)
fig = plt.gcf()
fig.set_size_inches(13,9)
plt.show()
In [6]:
user_stats = pd.read_csv('kochbar\\Data73k\\kochbar_ds.item.avgstd', usecols=[0,1], sep='\t')
user_stats.hist(alpha=0.5, bins=100)
fig = plt.gcf()
fig.set_size_inches(13,9)
plt.show()
In [ ]: