In [1]:
import re
import sys
import timeit
import urllib2
import warnings

import pandas as pd
from bs4 import BeautifulSoup


/Users/peter/anaconda/lib/python2.7/site-packages/pandas/computation/__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)

In [55]:
pages = range(0,39)

# TODO: loop over `pages`; for now a single hard-coded page is scraped.
# NOTE: when the loop is enabled, do not name its variable `page` --
# `page` is rebound to the HTTP response further down in this cell.

number = 5 # page number

site= "http://www.metacritic.com/browse/dvds/release-date/available/date/?page=" + str(number)
hdr = {'User-Agent': 'Mozilla/5.0'} # helps avoid 403 forbidden error

# Download the listing page and parse it with BeautifulSoup.
# The response is closed in `finally` so the connection is released even
# when parsing raises.
req = urllib2.Request(site,headers=hdr)
page = urllib2.urlopen(req)
try:
    soup = BeautifulSoup(page, 'lxml') # lxml is supposed to be fast way to parse html
finally:
    page.close()

# Movie titles: walk over every anchor on the page and keep the path that
# follows "/movie/" for anchors whose serialized markup starts with that href.
a_tags = soup.find_all('a')
a_tags_text = [anchor.text for anchor in a_tags]  # anchor texts (not used further in this cell)
a_tags_str = [unicode(anchor).encode("utf-8") for anchor in a_tags]  # serialized markup as utf-8 byte strings

movie_link = re.compile('<a href="/movie/(.+?)>')
link_matches = (movie_link.search(markup) for markup in a_tags_str)

# group(1) still carries the closing quote and any trailing attributes;
# cut at the first '"', then keep only the first 200 entries.
titles = [found.group(1).split('"')[0] for found in link_matches if found][:200]

# Metascores: collect every element with class "metascore_w", take its text
# as a plain str, then parse each one as an integer.
metascore_tags = soup.find_all(attrs={'class': 'metascore_w'})
metascore_strs = [str(tag.text) for tag in metascore_tags]
metascores = [int(value) for value in metascore_strs]

# User scores: text of every element with class "textscore", encoded to
# utf-8 byte strings (kept as text, not parsed to numbers).
userscore_tags = soup.find_all(attrs={'class': 'textscore'})
userscores = [unicode(tag.text).encode("utf-8") for tag in userscore_tags]

# Combine the three parallel lists into one table, one row per list position.
# zip() stops at the shortest input, so if the lists differ in length rows are
# dropped silently and a title can end up next to another entry's scores.
# Surface that instead of hiding it.
counts = (len(titles), len(metascores), len(userscores))
if len(set(counts)) != 1:
    warnings.warn(
        'scraped list lengths differ (titles=%d, metascores=%d, userscores=%d); '
        'rows may be misaligned' % counts)

scores = zip(titles, metascores, userscores)

# Convert to dataframe
df = pd.DataFrame.from_records(scores, columns=['title', 'metascore', 'userscore'])

# Pickle dataframe; the file name carries the page number
df.to_pickle('metacritic_' + str(number) + '.pickle')

df


Out[55]:
title metascore userscore
0 x-men-days-of-future-past 74 8.5
1 4-minute-mile 48 tbd
2 a-million-ways-to-die-in-the-west 44 5.2
3 a-people-uncounted 72 tbd
4 advanced-style 66 tbd
5 all-you-need-is-kill 71 8.6
6 jack-and-the-cuckoo-clock-heart 56 7.4
7 million-dollar-arm 56 7.1
8 obvious-child 76 6.6
9 radio-free-albemuth 35 tbd
10 supermensch-the-legend-of-shep-gordon 64 6.2
11 tasting-menu 29 tbd
12 the-grand-seduction 57 7.3
13 to-be-takei 66 8.3
14 are-you-here 37 5.9
15 chef 68 7.8
16 cold-in-july 73 7.3
17 decoding-annie-parker 56 8.0
18 hellion 55 6.8
19 ivory-tower 65 tbd
20 lucky-them 65 6.4
21 space-station-76 49 5.9
22 third-person 38 5.0
23 thunder-and-the-house-of-magic 47 5.1
24 transformers-4 32 4.5
25 fort-mccoy 47 tbd
26 found 41 tbd
27 free-the-mind 51 tbd
28 ida 91 7.5
29 neighbors 68 6.3
... ... ... ...
170 enemy 61 7.3
171 fracknation 61 6.6
172 pandoras-promise 54 tbd
173 rob-the-mob 63 6.9
174 the-wedding-video 51 tbd
175 winters-tale 31 5.4
176 wolf-creek-2 44 5.9
177 almost-human 40 4.2
178 authors-anonymous 16 tbd
179 ernest-celestine 86 7.4
180 jimmy-p 58 6.3
181 joe 74 7.5
182 test 70 8.0
183 the-final-member 67 tbd
184 the-grand-budapest-hotel 88 8.4
185 the-lego-movie 83 8.3
186 the-machine 52 7.1
187 vic-+-flo-saw-a-bear 76 tbd
188 walk-of-shame 25 4.8
189 a-short-history-of-decay 54 tbd
190 adult-world 61 7.0
191 alan-partridge 66 6.7
192 capital 55 6.2
193 devils-knot 42 5.6
194 haunt 33 4.4
195 jack-ryan-shadow-recruit 57 6.0
196 non-stop-2014 56 7.1
197 omar 75 7.5
198 patrick 48 7.3
199 perfect-sisters 44 6.6

200 rows × 3 columns