In [1]:
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Load the ratings table from the sample CSV (use '../rating.csv' instead to
# analyse the full dataset). skiprows=1 discards the first line of the file.
# NOTE(review): if the sample file was written by the np.savetxt call in the
# commented-out cell further down (which emits no header line), skiprows=1
# drops the first data row rather than a header - confirm against the file.
data = np.loadtxt(
    '../ratingSAMPLE.csv',
    delimiter=",",
    skiprows=1,
)
data
Out[2]:
Koda za pridobitev dejanskih ratingov (brez ogledov brez ocene)
In [3]:
# Column 2 holds the score; keep only the entries whose score is not -1.
data[:, 2][data[:, 2] != -1]
Out[3]:
Število vseh ratingov
In [4]:
# Number of rows in the ratings table (scored and unscored alike).
data.shape[0]
Out[4]:
Število ratingov z dejansko oceno
In [5]:
# Number of rows whose score (column 2) differs from -1.
int(np.count_nonzero(data[:, 2] != -1))
Out[5]:
Število neocenjenih ratingov
In [6]:
# Number of rows whose score (column 2) is exactly -1, i.e. all rows minus
# the rows that carry a score.
int(np.count_nonzero(data[:, 2] == -1))
Out[6]:
Povprečni rating
In [7]:
# Arithmetic mean of the scores, ignoring the -1 entries.
data[:, 2][data[:, 2] != -1].mean()
Out[7]:
Standardni odklon
In [8]:
# Standard deviation of the scores (NumPy default, ddof=0), ignoring -1 entries.
data[:, 2][data[:, 2] != -1].std()
Out[8]:
Varianca
In [9]:
# Variance of the scores (NumPy default, ddof=0), ignoring -1 entries.
data[:, 2][data[:, 2] != -1].var()
Out[9]:
Minimum in maksimum
In [10]:
# Print the smallest and the largest score (one per line), ignoring -1 entries.
print(
    data[:, 2][data[:, 2] != -1].min(),
    data[:, 2][data[:, 2] != -1].max(),
    sep="\n",
)
In [11]:
# Count how often each score occurs and draw the distribution as a bar chart.
#
# The upper end of the score axis is taken from the data instead of being
# hard-coded: the previous np.arange(1, 10) stopped at 9, so any score of 10
# was silently left out of both the counts and the chart. `initial=9` keeps
# the axis at least as wide as before (1..9) and also covers an empty table.
# NOTE(review): assumes scores are whole numbers starting at 1 - confirm.
highestScore = int(data[:, 2].max(initial=9))
scoreValues = np.arange(1, highestScore + 1)

# ratingsNum[k] is the number of rows whose score equals scoreValues[k].
ratingsNum = [int(np.count_nonzero(data[:, 2] == score)) for score in scoreValues]

fig, ax = plt.subplots()
ax.bar(scoreValues, ratingsNum, 0.8, color="blue")
ax.set_xticks(scoreValues)  # one tick per score so every bar is labelled
ax.set_xlabel("Ocena")
ax.set_ylabel("Število ocen")
ax.set_title("Porazdelitev ocen")
plt.show()
Zaradi velikosti smo naredili sample dataset. Da se ga da dati na GitHub, sem ga omejil na $3 \cdot 10^5$ vrstic.
In [12]:
#ratingsSample=data[:300000,:]
#np.savetxt('../ratingSAMPLE.csv', ratingsSample, delimiter=',')
In [13]:
from csv import DictReader
In [14]:
# Read every row of anime.csv up front and close the file immediately.
# Previously the handle returned by open() was never closed, and the one-shot
# DictReader produced no rows if the consuming cell was run a second time.
# A list of row dicts supports the same `for row in animeDataReader` loop and
# can be iterated repeatedly.
# newline="" is what the csv module documentation asks for on files handed
# to a reader, so quoted fields containing line breaks are parsed correctly.
with open("../anime.csv", "rt", encoding="utf-8", newline="") as animeFile:
    animeDataReader = list(DictReader(animeFile))
In [15]:
# Read the genre statistics file fully and close it right away.
# Previously the handle returned by open() was never closed, and the one-shot
# DictReader yielded nothing when the cell consuming it was re-run.
# A list of row dicts supports the same `for genreLine in genresCSV` loop.
# newline="" is what the csv module documentation asks for on files handed
# to a reader.
with open("../zanri/statistikaZanrov.csv", "rt", encoding="utf-8", newline="") as genresFile:
    genresCSV = list(DictReader(genresFile))
In [16]:
# Genre names taken from the "zanr" column, in the order the rows are read.
genres = [genreLine["zanr"] for genreLine in genresCSV]

# Starts empty; the following cell fills it with one entry per anime id.
animeGenre = {}
In [17]:
# Build animeGenre: anime id -> {genre name: True/False} for every known genre.
for row in animeDataReader:
    # Split the genre field into exact names before matching. The previous
    # substring test (`genr in row["genre"]`) also matched any genre whose
    # name is contained in a longer one (e.g. "Shounen" inside "Shounen Ai"),
    # which inflated the flags and every count derived from them.
    # `or ""` treats a missing field (None) as "no genres" instead of raising.
    # NOTE(review): assumes genres are comma-separated in this column - confirm.
    rowGenreNames = {name.strip() for name in (row["genre"] or "").split(",")}

    # Keys stay exactly as listed in `genres`; only the comparison is made on
    # the whitespace-trimmed name so stray padding in either file is harmless.
    animeGenre[row["anime_id"]] = {
        genr: genr.strip() in rowGenreNames for genr in genres
    }
Narejen je vgnezdeni dict objekt, ki za vsak anime hrani ali je ali ni določenega žanra
In [18]:
# Show the complete genre-flag dict stored under the anime id '123'
# (ids are kept as strings, exactly as read from the CSV).
print(animeGenre['123'])
In [19]:
# Look up a single flag: the value stored for genre 'Romance' on anime '123'.
print(animeGenre['123']['Romance'])
Preštejemo, koliko imamo posameznih žanrov v trenutni izbiri
In [20]:
# Count, for every genre, how many rows of anime.csv are flagged with it.
genreNum = {genr: 0 for genr in genres}

# Re-read anime.csv inside a `with` block so the handle is closed as soon as
# the rows are loaded (previously the file was re-opened and never closed).
# newline="" is what the csv module documentation asks for on reader input.
with open("../anime.csv", "rt", encoding="utf-8", newline="") as animeFile:
    animeDataReader = list(DictReader(animeFile))

for row in animeDataReader:
    animeFlags = animeGenre[row["anime_id"]]  # look the id up once per row
    for genr in genres:
        if animeFlags[genr]:
            genreNum[genr] += 1
In [21]:
# Display the per-genre counts built in the previous cell.
genreNum
Out[21]:
Normalizirano glede na celoto
In [22]:
# Share of anime carrying each genre: the per-genre count divided by the
# number of anime stored in animeGenre.
animeTotal = len(animeGenre)
genreNumNorm = {genr: count / animeTotal for genr, count in genreNum.items()}

# Draw one bar per genre. The dict views are materialised as lists before
# being handed to matplotlib, the chart gets a title and axis labels so it
# can be read on its own, and the genre names are rotated because horizontal
# text labels overlap once there are more than a handful of bars.
genrePositions = list(range(len(genreNumNorm)))
fig, ax = plt.subplots()
ax.bar(genrePositions, list(genreNumNorm.values()))
ax.set_xticks(genrePositions)
ax.set_xticklabels(list(genreNumNorm.keys()), rotation=90)
ax.set_xlabel("Žanr")
ax.set_ylabel("Delež animejev")
ax.set_title("Delež animejev po žanrih")
fig.tight_layout()  # make room for the rotated labels
plt.show()
Povprečno število žanrov na anime
In [23]:
# Adding up the per-genre shares gives the mean number of genres per anime
# (each share is a genre's count divided by the number of anime).
# The shares are accumulated one by one, in dict order, starting from 0.
avgGenresPerAnime = 0
for genreShare in genreNumNorm.values():
    avgGenresPerAnime += genreShare
print(avgGenresPerAnime)
In [ ]: