In [1]:
from bs4 import BeautifulSoup
from itertools import islice
import urllib
import requests
import csv
import re
# Build a {cleaned title: release year} lookup from movie_list.csv
movie_list = {}
with open('movie_list.csv', 'rb') as csvfile:
    for record in csv.DictReader(csvfile):
        # Drop punctuation from the title; apostrophes are kept (e.g. Winter's Bone)
        clean_title = re.sub(r'[,:;!?/-]', '', record['movie_title'])
        movie_list[clean_title] = record['release_year']
print(movie_list)
In [37]:
def get_imdbid(movie_list, api_key=None):
    '''
    Retrieve IMDb IDs from movie list via OMDb API

    movie_list -- dict mapping movie title to release year
    api_key    -- optional OMDb API key; sent as the "apikey" query
                  parameter when provided

    Returns a dict mapping title to IMDb ID. Titles the service cannot
    resolve are reported on stdout and left out of the result.
    '''
    imdbid = {}
    for title, year in movie_list.items():
        # Let requests build the query string so spaces, apostrophes, '&'
        # and other reserved characters in a title are URL-encoded
        params = {'t': title, 'y': year, 'plot': 'short', 'r': 'xml'}
        if api_key is not None:
            params['apikey'] = api_key
        response = requests.get('http://www.omdbapi.com/', params=params)
        soup = BeautifulSoup(response.content, 'xml')
        # Handle invalid movie title/year search, and any reply that carries
        # no <movie> element at all (e.g. an outage page instead of XML)
        if soup.error or soup.movie is None:
            print(title + " not found")
        else:
            imdbid[title] = soup.movie['imdbID']
    return imdbid
# Resolve every title loaded into movie_list to its IMDb ID and show the mapping
imdbid = get_imdbid(movie_list)
print(imdbid)
In [2]:
# Workaround as OMDB service is down as of 4-Feb-2017: hard-coded
# title -> IMDb ID mapping used in place of the get_imdbid() lookup.
# Titles containing an apostrophe are double-quoted; writing '' inside a
# single-quoted literal creates two adjacent literals that Python joins,
# silently dropping the apostrophe from the key.
imdbid = {
    'Spotlight': 'tt1895587',
    'The Big Short': 'tt1596363',
    'Bridge of Spies': 'tt3682448',
    'Brooklyn': 'tt2381111',
    'Mad Max Fury Road': 'tt1392190',
    'The Martian': 'tt3659388',
    'The Revenant': 'tt1663202',
    'Room': 'tt3170832',
    'Birdman': 'tt2562232',
    'American Sniper': 'tt2179136',
    'Boyhood': 'tt1065073',
    'The Grand Budapest Hotel': 'tt2278388',
    'The Imitation Game': 'tt2084970',
    'Selma': 'tt1020072',
    'The Theory of Everything': 'tt2980516',
    'Whiplash': 'tt2582802',
    '12 Years a Slave': 'tt2024544',
    'American Hustle': 'tt1800241',
    'Captain Phillips': 'tt1535109',
    'Dallas Buyers Club': 'tt0790636',
    'Gravity': 'tt1454468',
    'Her': 'tt1798709',
    'Nebraska': 'tt1821549',
    'Philomena': 'tt2431286',
    'The Wolf of Wall Street': 'tt0993846',
    'Argo': 'tt1024648',
    'Amour': 'tt1602620',
    'Beasts of the Southern Wild': 'tt2125435',
    'Django Unchained': 'tt1853728',
    'Les Miserables': 'tt1707386',
    'Life of Pi': 'tt0454876',
    'Lincoln': 'tt0443272',
    'Silver Linings Playbook': 'tt1045658',
    'Zero Dark Thirty': 'tt1790885',
    'The Artist': 'tt1655442',
    'The Descendants': 'tt1033575',
    'Extremely Loud and Incredibly Close': 'tt0477302',
    'The Help': 'tt1454029',
    'Hugo': 'tt0970179',
    'Midnight in Paris': 'tt1605783',
    'Moneyball': 'tt1210166',
    'The Tree of Life': 'tt0478304',
    'War Horse': 'tt1568911',
    "The King's Speech": 'tt1504320',
    '127 Hours': 'tt1542344',
    'Black Swan': 'tt0947798',
    'The Fighter': 'tt0964517',
    'Inception': 'tt1375666',
    'The Kids Are All Right': 'tt0842926',
    'The Social Network': 'tt1285016',
    'Toy Story 3': 'tt0435761',
    'True Grit': 'tt1403865',
    "Winter's Bone": 'tt1399683',
    'The Hurt Locker': 'tt0887912',
    'Avatar': 'tt0499549',
    'The Blind Side': 'tt0878804',
    'District 9': 'tt1136608',
    'An Education': 'tt1174732',
    'Inglourious Basterds': 'tt0361748',
    'Precious': 'tt0929632',
    'A Serious Man': 'tt1019452',
    'Up': 'tt1049413',
    'Up in the Air': 'tt1193138',
    'Slumdog Millionaire': 'tt1010048',
    'The Curious Case of Benjamin Button': 'tt0421715',
    'Frost Nixon': 'tt0870111',
    'Milk': 'tt1013753',
    'The Reader': 'tt0976051',
    'No Country for Old Men': 'tt0477348',
    'Atonement': 'tt0783233',
    'Juno': 'tt0467406',
    'Michael Clayton': 'tt0465538',
    'There Will Be Blood': 'tt0469494',
    'The Departed': 'tt0407887',
    'Babel': 'tt0449467',
    'Letters from Iwo Jima': 'tt0498380',
    'Little Miss Sunshine': 'tt0449059',
    'The Queen': 'tt0436697',
    'Crash': 'tt0375679',
    'Brokeback Mountain': 'tt0388795',
    'Capote': 'tt0379725',
    'Good Night and Good Luck': 'tt0433383',
    'Munich': 'tt0408306',
    'Million Dollar Baby': 'tt0405159',
    'The Aviator': 'tt0338751',
    'Finding Neverland': 'tt0308644',
    'Ray': 'tt0350258',
    'Sideways': 'tt0375063',
    # NOTE(review): this key keeps its ':' while the other titles here have
    # punctuation stripped -- confirm which spelling downstream code expects
    'The Lord of the Rings: The Return of the King': 'tt0167260',
    'Lost in Translation': 'tt0335266',
    'Master and Commander': 'tt0311113',
    'Mystic River': 'tt0327056',
    'Seabiscuit': 'tt0329575',
    'Chicago': 'tt0299658',
    'Gangs of New York': 'tt0217505',
    'The Hours': 'tt0274558',
    'The Lord of the Rings The Two Towers': 'tt0167261',
    'The Pianist': 'tt0253474',
    'A Beautiful Mind': 'tt0268978',
    'Gosford Park': 'tt0280707',
    'In The Bedroom': 'tt0247425',
    'The Lord of the Rings Fellowship of the Ring': 'tt0120737',
    'Moulin Rouge': 'tt0203009',
    'Gladiator': 'tt0172495',
    'Chocolat': 'tt0241303',
    'Crouching Tiger Hidden Dragon': 'tt0190332',
    'Erin Brockovich': 'tt0195685',
    'Traffic': 'tt0181865',
}
In [3]:
# Title -> IMDb ID pairs for the films to run predictions on
imdbid_pred = dict([
    ('Moonlight', 'tt4975722'),
    ('Manchester by the Sea', 'tt4034228'),
    ('Fences', 'tt2671706'),
    ('Lion', 'tt3741834'),
    ('Hacksaw Ridge', 'tt2119532'),
    ('Hidden Figures', 'tt4846340'),
    ('La La Land', 'tt3783958'),
    ('Hell or High Water', 'tt2582782'),
    ('Arrival', 'tt2543164'),
])
In [19]:
def get_oscar_status(id):
    '''
    Retrieves Best Picture status for a movie title

    id -- IMDb title ID string, e.g. 'tt1895587'

    Returns "Won" when the first "Best Motion Picture of the Year" entry
    under the "Academy Awards, USA" heading is marked as won; returns None
    when that entry is not marked as won, or when the page has no such
    heading or entry. Raises requests.HTTPError if IMDb answers with an
    error status.
    '''
    to_text = type(u"")  # unicode on Python 2, str on Python 3

    response = requests.get('http://www.imdb.com/title/' + id + '/awards?ref_=tt_awd')
    # Surface HTTP failures instead of reading an error page as "no award"
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    awards = soup.find("div", class_="article listo")
    if awards is None:
        # No awards listing found on the page
        return None

    for a in awards.contents:
        # Search for h3 tags which contain the Oscar best picture award title
        # (elements are converted to plain text strings before comparing)
        if a.name != "h3" or to_text(a.next_element).strip() != u"Academy Awards, USA":
            continue
        result = a.find_next_sibling()
        if result is None:
            continue
        for j in result.find_all(class_="award_description"):
            if to_text(j.next_element).strip() != u"Best Motion Picture of the Year":
                continue
            # The outcome is read from the <b> inside the first <td> of the
            # description's parent; compared as markup, as get_text() was
            # reported not to work here
            outcome_cell = j.parent.td
            if outcome_cell is not None and to_text(outcome_cell.b) == u"<b>Won</b>":
                return "Won"
            return None
    return None
def get_synopsis(imdbid):
    '''
    Scrape movie plot synopsis for each movie title via IMDb

    imdbid -- dict mapping movie title to IMDb title ID

    Returns a dict keyed by title; each value is a dict with
    "Plot Summary" (UTF-8 encoded synopsis text) and
    "Academy Awards, USA" (1 if get_oscar_status() reports "Won", else 0).
    Titles whose synopsis element is missing from the page are reported
    on stdout and left out of the result.
    '''
    dataset = {}
    for title, title_id in imdbid.items():
        r = requests.get('http://www.imdb.com/title/' + title_id + '/synopsis?ref_=tt_stry_pl').text
        soup = BeautifulSoup(r, 'lxml')
        synopsis = soup.find(id="swiki.2.1")
        if synopsis is None:
            # One page without a synopsis must not abort the whole scrape
            print(title + " synopsis not found")
            continue
        # Get Oscar Best Picture status
        best_pic = get_oscar_status(title_id)
        dataset[title] = {
            "Plot Summary": synopsis.get_text().strip().encode('utf-8'),
            "Academy Awards, USA": 1 if best_pic == "Won" else 0,
        }
    return dataset
# Scrape plot synopsis and Best Picture flag for the hard-coded titles in imdbid
dataset = get_synopsis(imdbid)
# Same scrape for the titles in imdbid_pred
dataset_pred = get_synopsis(imdbid_pred)
# Debug aid, left disabled:
#print dataset
In [17]:
# Output scraped data into csv
def output_to_csv(filename, dataset, pred):
    '''
    Write scraped movie data to a CSV file.

    filename -- path of the CSV file to create (overwritten if present)
    dataset  -- dict keyed by movie title. Each value is either the nested
                dict built by get_synopsis() (keys "Plot Summary" and
                "Academy Awards, USA") or, for prediction data only, a bare
                plot-summary string
    pred     -- True for prediction data (columns: Title, Plot Summary);
                False for labelled data (adds a "Won" column)
    '''
    import sys  # local import: only used to choose the file mode below

    is_py2 = sys.version_info[0] == 2
    text_type = type(u"")  # unicode on Python 2, str on Python 3

    def to_cell(value):
        # The csv module writes byte strings on Python 2 and text on Python 3
        if is_py2 and isinstance(value, text_type):
            return value.encode("utf-8")
        if not is_py2 and isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    # csv needs a binary handle on Python 2 and a newline='' text handle on Python 3
    if is_py2:
        to_write = open(filename, "wb")
    else:
        to_write = open(filename, "w", newline="", encoding="utf-8")

    with to_write:
        writer = csv.writer(to_write, delimiter=",")
        # No target variable for prediction data
        if pred:
            writer.writerow(["Title", "Plot Summary"])
        else:
            writer.writerow(["Title", "Plot Summary", "Won"])
        for title, record in dataset.items():
            # Write the summary text itself, not the whole nested record
            if isinstance(record, dict):
                plot = record["Plot Summary"]
            else:
                plot = record
            row = [to_cell(title), to_cell(plot)]
            if not pred:
                row.append(record["Academy Awards, USA"])
            writer.writerow(row)
# Labelled data first, then the prediction set (which has no "Won" column)
output_to_csv(filename='movies_plot.csv', dataset=dataset, pred=False)
output_to_csv(filename='movies_plot_pred.csv', dataset=dataset_pred, pred=True)
In [ ]: