Companion to Lecture 4 of Harvard CS109: Data Science
In [1]:
import requests
from pattern import web
from BeautifulSoup import BeautifulSoup
Find and print the movie title, list of genres, runtime, and score of all movies on this page
In [8]:
url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'
r = requests.get(url)
print r.url
#print r.text
In [5]:
url = 'http://www.imdb.com/search/title'
params = dict(sort='num_votes,desc', start=1, title_type='feature', year='1950,2012')
r = requests.get(url, params=params)
print r.url # notice it constructs the full url for you
In [22]:
#selection in pattern follows the rules of CSS
dom = web.Element(r.text)
#print dom.by_tag('td.title')
for movie in dom.by_tag('td.title'):
title = movie.by_tag('a')[0].content
#print title
genres = movie.by_tag('span.genre')[0].by_tag('a')
#print genres[0].content
genres = [g.content for g in genres]
runtime = movie.by_tag('span.runtime')[0].content
rating = movie.by_tag('span.value')[0].content
print title, genres, runtime, rating
In [6]:
#selection in pattern follows the rules of CSS
dom = web.Element(r.text)
for movie in dom.by_tag('td.title'):
title = movie.by_tag('a')[0].content
genres = movie.by_tag('span.genre')[0].by_tag('a')
genres = [g.content for g in genres]
runtime = movie.by_tag('span.runtime')[0].content
rating = movie.by_tag('span.value')[0].content
print title, genres, runtime, rating
In [29]:
bs = BeautifulSoup(r.text)
for movie in bs.findAll('td', 'title'):
title = movie.find('a').contents[0]
genres = movie.find('span', 'genre').findAll('a')
genres = [g.contents[0] for g in genres]
runtime = movie.find('span', 'runtime').contents[0]
rating = movie.find('span', 'value').contents[0]
print title, genres, runtime, rating
In [ ]: