In [1]:
from bs4 import BeautifulSoup
from itertools import islice
import urllib
import requests
import csv
import re

# Open csv file and read rows into a dict
movie_list = {}
with open('movie_list.csv', 'rb') as csvfile:
    reader = csv.DictReader(csvfile)
    for rows in reader:
        # Remove punctuation except "'s" e.g. Winter's Bone
        rows['movie_title'] = re.sub(r'[,:;!?/-]', '', rows['movie_title'])
        movie_list[rows['movie_title']] = rows['release_year']
        
print movie_list


{'Michael Clayton': '2007', 'Chocolat': '2000', 'In The Bedroom': '2001', 'Life of Pi': '2012', 'Sideways': '2004', 'American Sniper': '2014', 'Argo': '2012', 'War Horse': '2011', 'Precious': '2009', 'American Hustle': '2013', 'The Blind Side': '2009', 'An Education': '2009', 'The Wolf of Wall Street': '2013', 'Million Dollar Baby': '2004', 'The Hours': '2002', '12 Years a Slave': '2013', "The King's Speech": '2010', 'Gangs of New York': '2002', 'Milk': '2008', 'Ray': '2004', 'Up in the Air': '2009', 'Silver Linings Playbook': '2012', 'A Serious Man': '2009', 'A Beautiful Mind': '2001', 'Slumdog Millionaire': '2008', 'Gravity': '2013', 'Capote': '2005', 'Moneyball': '2011', 'Good Night and Good Luck': '2005', 'Brokeback Mountain': '2005', 'Seabiscuit': '2003', 'The Theory of Everything': '2014', 'Gosford Park': '2001', 'Letters from Iwo Jima': '2006', 'The Hurt Locker': '2009', 'There Will Be Blood': '2007', 'The Social Network': '2010', 'Up': '2009', 'The Artist': '2011', 'Juno': '2007', 'The Fighter': '2010', 'Whiplash': '2014', 'Crouching Tiger': ' Hidden Dragon', 'Gladiator': '2000', 'Midnight in Paris': '2011', 'Mystic River': '2003', 'Moulin Rouge': '2001', 'Finding Neverland': '2004', 'The Reader': '2008', 'Her': '2013', 'Mad Max Fury Road': '2015', 'Hugo': '2011', 'Captain Phillips': '2013', 'Django Unchained': '2012', 'Atonement': '2007', 'FrostNixon': '2008', "Winter's Bone": '2010', 'The Lord of the Rings The Return of the King': '2003', 'Crash': '2005', 'The Martian': '2015', 'Selma': '2014', 'No Country for Old Men': '2007', 'Brooklyn': '2015', 'Babel': '2006', 'Traffic': '2000', 'Les Miserables': '2012', '127 Hours': '2010', 'Lincoln': '2012', 'The Tree of Life': '2011', 'Inception': '2010', 'The Descendants': '2011', 'Spotlight': '2015', 'The Kids Are All Right': '2010', 'Bridge of Spies': '2015', 'Philomena': '2013', 'The Lord of the Rings Fellowship of the Ring': '2001', 'Little Miss Sunshine': '2006', 'Amour': '2012', 'Zero Dark Thirty': '2012', 'Master and Commander': '2003', 'The Big Short': '2015', 'The Imitation Game': '2014', 'Toy Story 3': '2010', 'Beasts of the Southern Wild': '2012', 'Black Swan': '2010', 'The Revenant': '2015', 'The Curious Case of Benjamin Button': '2008', 'True Grit': '2010', 'Inglourious Basterds': '2009', 'District 9': '2009', 'Munich': '2005', 'The Grand Budapest Hotel': '2014', 'The Help': '2011', 'Room': '2015', 'The Lord of the Rings The Two Towers': '2002', 'Erin Brockovich': '2000', 'Birdman': '2014', 'Nebraska': '2013', 'Avatar': '2009', 'Extremely Loud and Incredibly Close': '2011', 'Lost in Translation': '2003', 'The Aviator': '2004', 'Boyhood': '2014', 'The Pianist': '2002', 'The Departed': '2006', 'Chicago': '2002', 'The Queen': '2006', 'Dallas Buyers Club': '2013'}

In [37]:
def get_imdbid(movie_list):
    '''
    Retrieve IMDb IDs from movie list via OMDb API
    '''
    imdbid = {}
    for title, year in movie_list.iteritems():
        # Add + in between spaces in movie title
        title_query = re.sub(r'\s', '+', title)
        # Extract IMDb IDs
        r = urllib.urlopen('http://www.omdbapi.com/?t=' 
                           + title_query + '&y=' + year + '&plot=short&r=xml').read()
        soup = BeautifulSoup(r, 'xml')
        # Handle invalid movie title/year search
        if soup.error:
            print title + " not found"
        else:
            imdbid[title] = soup.movie['imdbID']
    return imdbid

imdbid = get_imdbid(movie_list)
print imdbid


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-37-d8a4e6305d88> in <module>()
     18     return imdbid
     19 
---> 20 imdbid = get_imdbid(movie_list)
     21 print imdbid

<ipython-input-37-d8a4e6305d88> in get_imdbid(movie_list)
     15             print title + " not found"
     16         else:
---> 17             imdbid[title] = soup.movie['imdbID']
     18     return imdbid
     19 

TypeError: 'NoneType' object has no attribute '__getitem__'

In [2]:
# Workaround as OMDB service is down as of 4-Feb-2017
imdbid = {'Spotlight': 'tt1895587','The Big Short': 'tt1596363','Bridge of Spies': 'tt3682448','Brooklyn': 'tt2381111','Mad Max Fury Road': 'tt1392190',
          'The Martian': 'tt3659388','The Revenant': 'tt1663202','Room': 'tt3170832','Birdman': 'tt2562232','American Sniper': 'tt2179136', 
         'Boyhood': 'tt1065073','The Grand Budapest Hotel': 'tt2278388','The Imitation Game': 'tt2084970','Selma': 'tt1020072','The Theory of Everything': 'tt2980516',
        'Whiplash': 'tt2582802','12 Years a Slave': 'tt2024544','American Hustle': 'tt1800241','Captain Phillips': 'tt1535109','Dallas Buyers Club': 'tt0790636',
          'Gravity': 'tt1454468','Her': 'tt1798709','Nebraska': 'tt1821549','Philomena': 'tt2431286','The Wolf of Wall Street': 'tt0993846',
          'Argo': 'tt1024648','Amour': 'tt1602620','Beasts of the Southern Wild': 'tt2125435','Django Unchained': 'tt1853728','Les Miserables': 'tt1707386',
          'Life of Pi': 'tt0454876','Lincoln': 'tt0443272','Silver Linings Playbook': 'tt1045658','Zero Dark Thirty': 'tt1790885','The Artist': 'tt1655442',
         'The Descendants': 'tt1033575','Extremely Loud and Incredibly Close': 'tt0477302','The Help': 'tt1454029','Hugo': 'tt0970179','Midnight in Paris': 'tt1605783',
         'Moneyball': 'tt1210166','The Tree of Life': 'tt0478304','War Horse': 'tt1568911','The King''s Speech': 'tt1504320','127 Hours': 'tt1542344',
         'Black Swan': 'tt0947798','The Fighter': 'tt0964517','Inception': 'tt1375666','The Kids Are All Right': 'tt0842926',
         'The Social Network': 'tt1285016','Toy Story 3': 'tt0435761','True Grit': 'tt1403865','Winter''s Bone': 'tt1399683',
         'The Hurt Locker': 'tt0887912','Avatar': 'tt0499549','The Blind Side': 'tt0878804','District 9': 'tt1136608','An Education': 'tt1174732',
         'Inglourious Basterds': 'tt0361748','Precious': 'tt0929632','A Serious Man': 'tt1019452','Up': 'tt1049413','Up in the Air': 'tt1193138',
         'Slumdog Millionaire': 'tt1010048','The Curious Case of Benjamin Button': 'tt0421715','Frost Nixon': 'tt0870111','Milk': 'tt1013753',
         'The Reader': 'tt0976051','No Country for Old Men': 'tt0477348','Atonement': 'tt0783233','Juno': 'tt0467406','Michael Clayton': 'tt0465538',
        'There Will Be Blood': 'tt0469494','The Departed': 'tt0407887','Babel': 'tt0449467','Letters from Iwo Jima': 'tt0498380','Little Miss Sunshine': 'tt0449059',
          'The Queen': 'tt0436697','Crash': 'tt0375679','Brokeback Mountain': 'tt0388795','Capote': 'tt0379725',
        'Good Night and Good Luck': 'tt0433383','Munich': 'tt0408306','Million Dollar Baby': 'tt0405159','The Aviator': 'tt0338751',
         'Finding Neverland': 'tt0308644','Ray': 'tt0350258','Sideways': 'tt0375063','The Lord of the Rings: The Return of the King': 'tt0167260',
         'Lost in Translation': 'tt0335266','Master and Commander': 'tt0311113','Mystic River': 'tt0327056','Seabiscuit': 'tt0329575',
         'Chicago': 'tt0299658','Gangs of New York': 'tt0217505','The Hours': 'tt0274558','The Lord of the Rings The Two Towers': 'tt0167261',
         'The Pianist': 'tt0253474','A Beautiful Mind': 'tt0268978','Gosford Park': 'tt0280707','In The Bedroom': 'tt0247425',
         'The Lord of the Rings Fellowship of the Ring': 'tt0120737','Moulin Rouge': 'tt0203009','Gladiator': 'tt0172495','Chocolat': 'tt0241303', 
         'Crouching Tiger Hidden Dragon': 'tt0190332','Erin Brockovich': 'tt0195685','Traffic': 'tt0181865'}

In [3]:
imdbid_pred = {'Moonlight': 'tt4975722','Manchester by the Sea': 'tt4034228','Fences': 'tt2671706','Lion': 'tt3741834','Hacksaw Ridge': 'tt2119532',
          'Hidden Figures': 'tt4846340', 'La La Land': 'tt3783958', 'Hell or High Water': 'tt2582782', 'Arrival': 'tt2543164'}

In [19]:
def get_oscar_status(id):
    '''
    Retrieves Best Picture status for a movie title
    '''
    r = requests.get('http://www.imdb.com/title/' + id + '/awards?ref_=tt_awd').text
    soup = BeautifulSoup(r, 'lxml')
    awards = soup.find("div", class_="article listo")
    for a in awards.contents:
        # Search for h3 tags which contain the Oscar best picture award title
        # NavigableString element is converted to unicode string to save memory
        if a.name == "h3" and unicode(a.next_element).strip() == u"Academy Awards, USA":
            result = a.find_next_sibling()
            res = result.find_all(class_="award_description")
            for j in res:
                if unicode(j.next_element).strip() == u"Best Motion Picture of the Year":
                    if unicode(j.parent.td.b) == u"<b>Won</b>": # Strange, get_text() doesnt work
                        return "Won"
                    else:
                        return None
                    
def get_synopsis(imdbid):
    '''
    Scrape movie plot synopsis for each movie title via IMDb
    '''
    dataset = {}
    for title, id in imdbid.iteritems():
        # Add movie titles in dataset
        dataset[title] = {}
        r = requests.get('http://www.imdb.com/title/' + id + '/synopsis?ref_=tt_stry_pl').text
        soup = BeautifulSoup(r, 'lxml')
        plot = soup.find(id="swiki.2.1").get_text().strip()
        dataset[title]["Plot Summary"] = plot.encode('utf-8')
        # Get Oscar Best Picture status
        best_pic = get_oscar_status(id)
        if best_pic == "Won":
            dataset[title]["Academy Awards, USA"] = 1
        else:
            dataset[title]["Academy Awards, USA"] = 0
    return dataset
    
dataset = get_synopsis(imdbid)
dataset_pred = get_synopsis(imdbid_pred)
#print dataset

In [17]:
# Output scraped data into csv
def output_to_csv(filename, dataset, pred):
    with open(filename, "wb") as toWrite:
        writer = csv.writer(toWrite, delimiter=",")
        # No target variable for prediction data
        if pred: 
            writer.writerow(["Title", "Plot Summary"])
            for i in dataset.keys():
                writer.writerow([i.encode("utf-8"),
                                 dataset[i]])
        else:
            writer.writerow(["Title", "Plot Summary", "Won"])
            for i in dataset.keys():
                writer.writerow([i.encode("utf-8"),
                                 dataset[i]["Plot Summary"],
                                 dataset[i]["Academy Awards, USA"]]) 
        
output_to_csv('movies_plot.csv', dataset, pred=False)
output_to_csv('movies_plot_pred.csv', dataset_pred, pred=True)

In [ ]: