In [86]:
import numpy as np
import matplotlib.pyplot as plt
from csv import DictReader
Ocene
In [87]:
# Read the ratings sample into a float array. The first line of the file is
# skipped (presumably the CSV header) so it never reaches the numeric parser.
# Later cells treat column index 2 of this array as the rating.
data = np.loadtxt(
    "../ratingSAMPLE.csv",
    skiprows=1,
    delimiter=",",
)
Žanri
In [88]:
# Read the genre statistics table eagerly and close the file straight away.
# A bare DictReader(open(...)) never closes its file handle and can only be
# iterated once, so re-running a later cell would silently see no rows.
# genresCSV is now a list of row dicts (one per CSV record); iterating it
# yields the same rows the reader produced.
# newline="" is what the csv module asks for when it is handed a file object.
with open("../zanri/statistikaZanrov.csv", "rt", encoding="utf-8", newline="") as genresFile:
    genresCSV = list(DictReader(genresFile))
Pretvorim v lažje uporabno obliko
In [89]:
# Collect the "zanr" field of every row of the genre table into a plain list,
# preserving the row order of the CSV.
genres = [genreRow["zanr"] for genreRow in genresCSV]
Animeji (v surovi obliki)
In [90]:
# Load the raw anime table eagerly and close the file straight away.
# A bare DictReader(open(...)) leaks the file handle and is exhausted after a
# single pass, so the cell that consumes it could not be re-run on its own.
# animeDataReader is now a list of row dicts; iterating it yields the same
# rows the reader produced.
# newline="" is what the csv module asks for when it is handed a file object.
with open("../anime.csv", "rt", encoding="utf-8", newline="") as animeFile:
    animeDataReader = list(DictReader(animeFile))
Dodeljevanje žanrov animejem
In [91]:
# Build animeGenre: anime_id -> {genre name: bool}, with one flag per entry of
# `genres` telling whether that anime carries the genre.
#
# The original tested `genr in row["genre"]`, which is a substring test on the
# raw field text: a genre whose name is contained in another genre's name
# (e.g. "Shounen" inside "Shounen Ai") was flagged as present too. The field
# is now split into individual names and compared exactly.
# NOTE(review): assumes the "genre" field is a comma-separated list of names;
# confirm against anime.csv.
animeGenre = {}
for row in animeDataReader:
    # `or ""` guards against a missing field (DictReader fills short rows with None).
    rowGenreNames = {name.strip() for name in (row["genre"] or "").split(",")}
    animeGenre[row["anime_id"]] = {genr: genr in rowGenreNames for genr in genres}
Štetje žanrov
In [92]:
# Count, for every genre, how many rows of anime.csv carry it according to the
# animeGenre flags built earlier.
# The file is re-read inside a `with` block so the handle is closed afterwards
# (the original left it open); the counting itself is unchanged.
genreNum = {genr: 0 for genr in genres}
with open("../anime.csv", "rt", encoding="utf-8", newline="") as animeFile:
    animeDataReader = DictReader(animeFile)
    for row in animeDataReader:
        genreFlags = animeGenre[row["anime_id"]]
        for genr in genres:
            if genreFlags[genr]:
                genreNum[genr] += 1
In [184]:
# Keep a reference to the unfiltered ratings, then drop every row whose
# column 2 equals -1 (presumably the marker for a missing rating; confirm).
dataUnclean = data
data = data[data[:, 2] != -1]

# Learn/test split points per size label: the first quarter ('S'), half ('M')
# or two thirds ('L') of the rows form the learn set, the remainder the test
# set. Floor division is required: in Python 3 `len(data)/4` is a float, and a
# float slice index raises TypeError.
rowCount = len(data)
splitPoints = {
    'S': rowCount // 4,
    'M': rowCount // 2,
    'L': 2 * rowCount // 3,
}

dataLearn = {}
dataTest = {}
for lrnSize, splitAt in splitPoints.items():
    # np.array(...) copies the slice so the subsets do not alias `data`.
    dataLearn[lrnSize] = np.array(data[:splitAt])
    dataTest[lrnSize] = np.array(data[splitAt:])
In [185]:
# Mean of column 2 (the rating) over each learn set, printed as it is computed.
avgRating = {}
for lrnSize in ('S', 'M', 'L'):
    learnRatings = dataLearn[lrnSize][:, 2]
    avgRating[lrnSize] = sum(learnRatings) / len(learnRatings)
    print(avgRating[lrnSize])
Prva metrika: MSE
In [186]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import brier_score_loss
In [187]:
# Constant baseline: predict the learn-set mean rating for every test row and
# record the mean squared error of that prediction per split size.
pred = {}
mseR = {}
brierS = {}  # stays empty; the brier_score_loss call was already disabled
for lrnSize in ('S', 'M', 'L'):
    testRatings = dataTest[lrnSize][:, 2]
    pred[lrnSize] = avgRating[lrnSize] * np.ones(len(dataTest[lrnSize]))
    mseR[lrnSize] = mean_squared_error(testRatings, pred[lrnSize])
In [188]:
# Show the baseline MSE of every split size, one value per line.
print(*(mseR[sizeKey] for sizeKey in ('S', 'M', 'L')), sep="\n")
In [189]:
from sklearn.naive_bayes import GaussianNB
In [197]:
# Gaussian naive Bayes on the selected learn/test splits.
#
# Fixes relative to the original cell:
#   * the model was fitted on learn-set rows paired with TEST-set ratings;
#     it is now fitted on learn-set features and learn-set ratings;
#   * the rating column (index 2) was part of the feature matrix, both when
#     fitting and when predicting, which leaks the target; it is now excluded;
#   * the final MSE was computed from the constant baseline `pred` and written
#     back into mseR (re-storing the value already there); the model's own
#     error is now stored in the previously unused predMSE and mseR is left
#     untouched.
allSizes = ['S', 'M', 'L']
selectSizes = ['M']
predGNB = {}
predMSE = {}
for lrnSize in selectSizes:
    learnSet = dataLearn[lrnSize]
    testSet = dataTest[lrnSize]
    # Every column except the rating serves as a feature.
    featureCols = [col for col in range(learnSet.shape[1]) if col != 2]

    # A fresh estimator per split, so entries of predGNB never share a model.
    gnb = GaussianNB()
    predGNB[lrnSize] = gnb.fit(learnSet[:, featureCols], learnSet[:, 2])

    testPrediction = predGNB[lrnSize].predict(testSet[:, featureCols])
    print(testPrediction)
    print(testSet[:, 2])
    predMSE[lrnSize] = mean_squared_error(testSet[:, 2], testPrediction)
In [181]:
from scipy.stats import beta
In [182]:
# Draw a sample from Beta(a, b) and show it next to the true density.
a = 8
b = 2
n = 1000
# Fixed seed: without it the sample, the histogram and the fit computed from
# `sample` in the following cell change on every run.
sample = beta.rvs(a, b, size=n, random_state=42)
xr = np.linspace(0, 1, 100)  # evaluation grid for X
P = beta.pdf(xr, a, b)  # density on the grid, evaluated in one vectorised call

plt.figure(figsize=(10, 4))

# Left panel: histogram of the random samples drawn according to P(x)
plt.subplot(1, 2, 1)
plt.title("Vzorec")
plt.hist(sample, color="red")
plt.xlabel("X")
plt.ylabel("Število primerov")

# Right panel: curve of the density function
plt.subplot(1, 2, 2)
plt.title("Graf porazdelitve")
plt.plot(xr, P, color="red")  # draw P(x)
plt.ylabel("P(x)")
plt.xlabel("X")
plt.show()
In [183]:
# Estimate Beta parameters from the sample with beta.fit; the returned tuple
# is unpacked straight into beta.pdf to get the fitted density on the grid.
parameters = beta.fit(sample)
P_fit = beta.pdf(xr, *parameters)  # one vectorised call instead of a per-point loop

plt.figure()
# `normed` was removed from hist() in Matplotlib 3.1; `density=True` is its
# replacement and scales the histogram so it is comparable with the pdf curves.
plt.hist(sample, label="Vzorec", density=True)
plt.plot(xr, P, label="P(X) resnična", linewidth=2.0)
plt.plot(xr, P_fit, label="P(X) ocenjena", linewidth=2.0)  # the estimated distribution is the model
plt.legend()
plt.show()
In [ ]:
In [ ]:
In [ ]: