In [1]:
import urllib2
import sys
import re
import timeit
import pandas as pd
from bs4 import BeautifulSoup
In [55]:
pages = range(0, 39)  # Metacritic splits the DVD list across 39 pages
# for page in pages:  # TODO: loop over every page -- see the sketch below the output
number = 5  # page number to scrape
site = "http://www.metacritic.com/browse/dvds/release-date/available/date/?page=" + str(number)
hdr = {'User-Agent': 'Mozilla/5.0'}  # a browser-like User-Agent helps avoid a 403 Forbidden error
# Fetch the page and parse the HTML with BeautifulSoup
req = urllib2.Request(site, headers=hdr)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page, 'lxml')  # lxml is a fast HTML parser
# get movie titles: each movie link looks like <a href="/movie/<slug>" ...>
a_tags = soup.find_all('a')  # all anchor tags on the page
a_tags_text = [i.text for i in a_tags]  # link text as unicode
a_tags_str = [unicode(x).encode("utf-8") for x in a_tags]  # serialize each tag's HTML to a UTF-8 string
titles = []
for i in a_tags_str:
    output = re.search('<a href="/movie/(.+?)>', i)  # capture everything between /movie/ and the closing >
    if output:
        titles.append(output.group(1))
titles = [x.split('"')[0] for x in titles][:200]  # keep only the slug before the closing quote; cap at 200
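# An alternative sketch (not in the original): read the slug straight from each
# tag's href attribute instead of regexing the serialized HTML; titles_alt is a
# hypothetical name and is not used below.
titles_alt = [a['href'].split('/movie/')[1] for a in a_tags
              if a.get('href', '').startswith('/movie/')][:200]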
# get metascores: tags with class 'metascore_w'; convert Tag -> text -> int
metascores = soup(attrs={'class': 'metascore_w'})
metascores = [int(x.text) for x in metascores]
# get user scores: tags with class 'textscore'; keep as UTF-8 strings
userscores = soup(attrs={'class': 'textscore'})
userscores = [x.text.encode("utf-8") for x in userscores]  # unicode -> UTF-8 strings
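# Optional (an assumption, not in the original): user scores that are not yet
# available appear as 'tbd'; this maps them to None so the column could be made
# numeric. userscores_num is a hypothetical name and is not used below.
userscores_num = [float(x) if x.replace('.', '', 1).isdigit() else None
                  for x in userscores]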
# zip titles, metascores, and user scores into (title, metascore, userscore) tuples
scores = zip(titles, metascores, userscores)
# Convert to dataframe
df = pd.DataFrame.from_records(scores, columns=['title', 'metascore', 'userscore'])
# Pickle dataframe
df.to_pickle('metacritic_' + str(number) + '.pickle')
df
Out[55]:
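The commented-out loop above hints at the full scrape. A minimal sketch, assuming the cell above is wrapped in a function scrape_page(number) (a hypothetical name) that returns the per-page dataframe:
In [ ]:
import time

frames = []
for page in pages:
    frames.append(scrape_page(page))  # fetch, parse, and pickle one page
    time.sleep(1)                     # small delay to be polite to the server
# stitch the per-page dataframes into one and pickle the combined result
all_scores = pd.concat(frames, ignore_index=True)
all_scores.to_pickle('metacritic_all.pickle')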