Pandas je biblioteka za analizu podataka. Ilustrirat ćemo njeno korištenje na jednostavnom primjeru analize podataka iz IMDB baze.
Koristit ćemo i biblioteke requests (za učitavanje web stranica) i BeautifulSoup (za analizu HTML-a).
In [4]:
import requests
from bs4 import BeautifulSoup as bs
Dohvaćamo podatke s IMDB-a o filmovima sa zemljom porijekla Hrvatska, snimljenim između 1945. i 2017.
In [117]:
import time

# Download the IMDB advanced-search result pages for Croatian-language
# feature films from Croatia (1945-2017), ordered by number of votes.
# The `start` offset advances in steps of 50 (presumably the page size).
url = 'http://www.imdb.com/search/title'
params = dict(sort='num_votes,desc', start=1, title_type='feature',
              year='1945,2017', countries='hr', languages='hr')
r = []  # one requests.Response per result page, in page order
for start in range(1, 240, 50):  # there are currently 201 such films
    params['start'] = start
    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, params=params, timeout=30)
    # Stop on an HTTP error instead of silently keeping an error page.
    response.raise_for_status()
    r.append(response)
    time.sleep(10)  # pause between consecutive requests
Parsiranje podataka koje spremamo u datoteku filmovi2.txt.
In [167]:
import re

# Parse every downloaded result page and write one tab-separated line per
# film: title, year, runtime, rating, director, genres.
# The file is written as UTF-8 explicitly so that titles with non-ASCII
# characters are stored the same way on every platform (UTF-8 is also the
# default encoding pandas.read_csv uses to read the file back).
with open('filmovi2.txt', 'w', encoding='utf-8') as f:
    for response in r:
        soup = bs(response.text, 'html.parser')
        for film in soup.find_all('div', class_="lister-item"):
            # Reset the fields that have no other default, so a film that
            # lacks one of them never inherits the previous film's value.
            title = director = genres = '???'

            for a in film.find_all('a', href=True):
                href = a['href']
                # Both substrings must be present. The previous test,
                # `'/title/tt' and 'adv_li_tt' in href`, only checked the
                # second one because a non-empty string is always truthy.
                if '/title/tt' in href and 'adv_li_tt' in href:
                    title = a.contents[0]

            rt = film.find_all('span', class_="runtime")
            runtime = rt[0].contents[0] if rt else '0 mins'

            y = film.find_all('span', class_="lister-item-year")
            if y:
                # Drop any Roman-numeral disambiguation marker such as
                # "(I)", "(II)" or "(III)", keeping only the year part.
                year = re.sub(r'\([IVXLCDM]+\)', '', y[0].contents[0]).strip()
            else:
                year = '???'

            rat = film.find_all('span', class_='global-sprite rating-star imdb-rating')
            if rat:
                # The rating value is the second sibling after the star icon,
                # a <strong> element; strip the tag text around the number.
                rating = str(list(rat[0].next_siblings)[1]).replace('<strong>', '').replace('</strong>', '')
            else:
                # Previously assigned to `rat`, leaving `rating` undefined
                # (or stale from the previous film).
                rating = '0'

            g = film.find_all('span', class_="genre")
            if g:
                genres = ' '.join(g[0].contents).replace('\n', '').strip()

            d = film.find_all(string=re.compile('Director'))
            if d:
                # The element following the "Director" label holds the name.
                director = d[0].next_element.contents[0]

            f.write('\t'.join((title, year, runtime, rating, director, genres)) + '\n')
In [119]:
# IPython shell escape: show the first lines of the generated file.
!head filmovi2.txt
In [174]:
import pandas as pd

# Load the tab-separated film list into a DataFrame, supplying the column
# names explicitly, then report the row count and preview the first rows.
names = [
    'title',
    'year',
    'runtime',
    'rating',
    'director',
    'genres',
]
data = pd.read_csv('filmovi2.txt', sep='\t', names=names)
row_count = len(data)
print("Number of rows: {:d}".format(row_count))
data.head()
Out[174]:
In [175]:
# Convert the text columns to numbers.
# runtime: keep the integer that precedes the first space.
data['runtime'] = data.runtime.map(lambda text: int(text.split(' ')[0]))
# year: drop the first and last character, then parse the rest as an integer.
data['year'] = data.year.map(lambda text: int(text[1:-1]))
# rating: parse as a float.
data['rating'] = data.rating.map(float)
data.head()
Out[175]:
In [176]:
# Show the row labelled 118. `.ix` is deprecated and has been removed from
# current pandas; `.loc` performs the same label-based lookup on the default
# integer index that read_csv creates.
data.loc[118]
Out[176]:
In [177]:
# Summary statistics for the three numeric columns.
numeric_columns = ['year', 'runtime', 'rating']
data[numeric_columns].describe()
Out[177]:
In [217]:
import numpy as np

# In runtime and rating a value of 0 is the placeholder the parsing step
# writes when the field is missing, so convert it to NaN and keep it out of
# the statistics. The replacement is limited to these two columns; the
# previous whole-frame, in-place replace would also have altered a 0 in any
# other column.
for column in ('runtime', 'rating'):
    data[column] = data[column].replace(0, np.nan)
In [218]:
# Summary statistics for runtime and rating only.
measured_columns = ['runtime', 'rating']
data[measured_columns].describe()
Out[218]:
In [219]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(data.year, bins=np.arange(1945, 2017))
plt.xlabel("Godina produkcije");
In [220]:
# Histogram of IMDB ratings in 20 bins, leaving out films with no rating (NaN).
known_ratings = data['rating'].dropna()
plt.hist(known_ratings, bins=20)
plt.xlabel("IMDB ocjena");
In [222]:
# Scatter plot of rating against production year: black markers, no outline.
years = data['year']
ratings = data['rating']
plt.scatter(years, ratings, lw=0, color='k')
plt.xlabel("Godina")
plt.ylabel("IMDB ocjena");
In [223]:
# Every film whose rating equals the lowest rating in the data set.
lowest_rating = data.rating.min()
data.loc[data.rating == lowest_rating, ['title', 'year', 'rating', 'director', 'genres']]
Out[223]:
In [224]:
# Every film whose rating equals the highest rating in the data set.
highest_rating = data.rating.max()
data.loc[data.rating == highest_rating, ['title', 'year', 'rating', 'director', 'genres']]
Out[224]:
In [225]:
# Split each film's comma-separated genre string into a list of names.
# Names are stripped because the separator in the data is ", ": without
# stripping, "Drama" and " Drama" would be treated as two different genres.
# Non-string entries (e.g. NaN from an empty field) yield an empty list.
genre_lists = [
    [name.strip() for name in entry.split(',') if name.strip()]
    if isinstance(entry, str) else []
    for entry in data.genres
]

# Sorted list of every distinct genre; also used as the new column names.
genres = sorted(set().union(*genre_lists))

# One boolean column per genre: True where the film belongs to that genre.
for genre in genres:
    data[genre] = [genre in film_genres for film_genres in genre_lists]
data.head()
Out[225]:
In [226]:
# Number of films per genre: summing a boolean column counts its True values.
genre_count = data[genres].sum()
genre_count.to_frame('Genre Count')
Out[226]:
In [227]:
# Five-year period ("petoljetka") of each film: the production year rounded
# down to a multiple of 5.
petoljetka = (data.year // 5) * 5

# Select the two columns with a list of labels and take an explicit copy, so
# that adding a column below modifies an independent frame rather than a
# slice of `data` (which can trigger pandas' SettingWithCopyWarning).
tyd = data[['title', 'year']].copy()
tyd['petoljetka'] = petoljetka
tyd.head()
Out[227]:
In [228]:
# Mean rating per five-year period, printed and then drawn as a red line
# on top of a faint scatter of the individual film ratings.
pet_mean = data.groupby(petoljetka)['rating'].mean().rename('Petoljetka mean')
print(pet_mean)

mean_line_style = dict(color='r', lw=3, label='Petoljetka prosjek')
plt.plot(pet_mean.index, pet_mean.values, 'o-', **mean_line_style)
plt.scatter(data['year'], data['rating'], alpha=.04, lw=0, color='k')
plt.xlabel("Godina")
plt.ylabel("Ocjena")
plt.legend(frameon=False);
In [229]:
# For every production year, print the title(s) with that year's top rating.
for production_year, films in data.groupby('year'):
    top_rating = films.rating.max()
    best_titles = films.loc[films.rating == top_rating, 'title'].values
    print(production_year, best_titles)
In [230]:
from verzije import *
from IPython.display import HTML

# Render the concatenated output of print_sysinfo() and info_packages() as HTML.
# NOTE(review): both helpers presumably come from the local `verzije` module
# via the star import; confirm before replacing it with explicit names.
system_report = print_sysinfo()
package_report = info_packages('pandas, numpy,requests, beautifulsoup4')
HTML(system_report + package_report)
Out[230]: