Make AAdict



In [1]:

    
from imdb import IMDb
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import cPickle as pickle
ia = IMDb(accessSystem='http')
from collections import defaultdict 
import io
from datetime import datetime
import time
import math

Just run the following cell to open AAdict



In [59]:

    
# Run this cell instead of rerunning a portion of this notebook.
# By loading AAdict, you can skip the cells until you see the note "PICK BACK UP HERE"
# NOTE: unpickling can execute arbitrary code, so only load an AAdict.p file that
# was written by this notebook.
# The file is opened in a with-block so the handle is closed once loading finishes.
with open('AAdict.p', 'rb') as aadict_file:
    AAdict = pickle.load(aadict_file)



In [3]:

    
# Load the Academy Awards spreadsheet into a DataFrame (AAcsv)
AAcsv = pd.read_excel("Academy_Awards_2006.xls")

# Derived columns
# 'Year' is turned into text so its first four characters can be sliced off into 'yr';
# 'titleyr' ("Sort Title (yr)") will be helpful when using the ia.search_movie function
AAcsv['Year'] = AAcsv['Year'].values.astype(str)
AAcsv['yr'] = AAcsv['Year'].map(lambda year_text: year_text[:4])
AAcsv['titleyr'] = AAcsv.apply(lambda row: '%s (%s)' % (row['Sort Title'], row['yr']), axis=1)

# 'Winner?' holds "X" for winners; recode it as 1 (winner) / 0 (not a winner)
AAcsv['Winner?'] = (AAcsv['Winner?'] == 'X').astype(int)

# Keep only rows that name a specific film and whose 'yr' is 1981 or later
AAcsv = AAcsv[(AAcsv['Sort Title'] != "[no specific film title]") & (AAcsv["yr"] >= "1981")].copy()

# Every distinct award category that remains is stored in the "awards" list
# (used when making AAdict to indicate which awards a movie was nominated for/won)
awards = list(set(AAcsv['Category']))

# View head of AAcsv
AAcsv.head()









    Out[3]:






  
    
      
      Original Title
      English Title
      Sort Title
      Year
      Country
      Award
      Category
      Winner?
      Nominee(s)
      __ Academy Awards
      Item
      yr
      titleyr
    
  
  
    
      5924
                 Atlantic City
                                       Atlantic City
                 Atlantic City
       1981
       0
       Academy Award
       Best Picture
       0
       Denis Heroux, John Kemeny (Producers)
       54th
       5925
       1981
                 Atlantic City (1981)
    
    
      5925
              Chariots of Fire
                                    Chariots of Fire
              Chariots of Fire
       1981
       0
       Academy Award
       Best Picture
       1
                    David Puttnam (Producer)
       54th
       5926
       1981
              Chariots of Fire (1981)
    
    
      5926
                On Golden Pond
                                      On Golden Pond
                On Golden Pond
       1981
       0
       Academy Award
       Best Picture
       0
                    Bruce Gilbert (Producer)
       54th
       5927
       1981
                On Golden Pond (1981)
    
    
      5927
       Raiders of the Lost Ark
       Indiana Jones and the Raiders of the Lost Ark
       Raiders of the Lost Ark
       1981
       0
       Academy Award
       Best Picture
       0
                   Frank Marshall (Producer)
       54th
       5928
       1981
       Raiders of the Lost Ark (1981)
    
    
      5928
                          Reds
                                                Reds
                          Reds
       1981
       0
       Academy Award
       Best Picture
       0
                    Warren Beatty (Producer)
       54th
       5929
       1981
                          Reds (1981)



In [10]:

    
# Due to the need to call the IMDBpy database many times (once per cast member and once
# per movie in each cast member's filmography), this takes very long to run for our entire dataset.
# This can be used in future work for this analysis.

# Function:  starpower
# Purpose:  To score each cast member of a movie from the movies they had appeared in
#           up to (and including) the given year, and to total those scores
# Parameters:
    # movieobj:  a single IMDBmovie object (its .data['cast'] is read)
    # year:  year of the movie; an int or a string holding an int (e.g. '1981')
# Returns 0 (if the cast cannot be read from movieobj) or a tuple (cast_rating, totalpower):
    # cast_rating:  dict mapping each cast member to their rating_calculator score
    # totalpower:  sum of all the cast members' scores

def starpower(movieobj, year):
    try:
        movie_cast = movieobj.data['cast']
    except Exception:
        return 0
    # the caller stores years as strings; comparisons and arithmetic below need an int
    year = int(year)
    cast_rating = {}
    totalpower = 0
    for actorperson in movie_cast:
        print(actorperson)
        ia.update(actorperson, 'filmography')
        person_keys = actorperson.keys()
        if 'actor' in person_keys:
            filmography = actorperson['actor']  # The filmography of a given actor
        elif 'actress' in person_keys:
            filmography = actorperson['actress']
        else:
            filmography = []
        count = 0         # number of rated movies they have been in thus far
        rating_total = 0  # sum of the ratings of those movies (averaged inside rating_calculator)
        total_votes = 0   # IMDb votes for those movies, a proxy for movie popularity
        for film in filmography:
            ia.update(film, 'vote details')
            if 'rating' not in film.keys():
                continue
            film_year = film.data.get('year')
            # skip movies without a year and movies released after the movie being scored
            if film_year is None or film_year > year:
                continue
            count += 1
            rating_total += film.data['rating']
            # votes are down-weighted by the age of the movie; 1.0 forces true division
            # (with integer division every weight except the same-year one becomes 0)
            total_votes += film.data.get('votes', 0) * (1.0 / (year - film_year + 1))
        final_power = rating_calculator(count, rating_total, total_votes)
        cast_rating[actorperson] = final_power
        totalpower += final_power
        print("%s %s" % (cast_rating, totalpower))
    return (cast_rating, totalpower)

# Function:  rating_calculator
# Purpose:  To combine a person's movie count, ratings and votes into a single score
# Parameters:
    # count:  number of movies counted
    # avg_rating:  SUM of the ratings of those movies (it is divided by count here)
    # total_votes:  (weighted) number of votes for those movies
# Returns 0 if any input is 0, otherwise log(count) + 0.3 * mean rating + total_votes / 1,000,000
def rating_calculator(count, avg_rating, total_votes):
    if count == 0 or avg_rating == 0 or total_votes == 0:
        return 0
    # float() keeps the mean exact even when the rating sum and count are both ints
    mean_rating = float(avg_rating) / count
    return math.log(count) + mean_rating * 0.3 + total_votes * 0.000001



In [10]:

    
# Due to the need to call the IMDBpy database many times (once per director and once
# per movie in each director's filmography), this takes very long to run for our entire dataset.
# This can be used in future work for this analysis.

# Function:  directorpower
# Purpose:  To score each director of a movie from the movies they had directed
#           up to (and including) the given year, and to total those scores
# Parameters:
    # movie_director:  list of IMDB person objects for the movie's directors
    # year:  year of the movie; an int or a string holding an int (e.g. '1981')
# Returns a tuple (cast_rating, totalpower):
    # cast_rating:  dict mapping each director to their rating_calculator score
    # totalpower:  sum of all the directors' scores

def directorpower(movie_director, year):
    year = int(year)
    cast_rating = {}
    totalpower = 0
    for director in (movie_director or []):
        print(director)
        ia.update(director, 'filmography')
        # NOTE(review): assumes IMDbPY lists directing credits under the 'director'
        # filmography key (as acting credits are under 'actor'/'actress') - confirm
        if 'director' in director.keys():
            filmography = director['director']
        else:
            filmography = []
        count = 0         # number of rated movies they have directed thus far
        rating_total = 0  # sum of the ratings of those movies (averaged inside rating_calculator)
        total_votes = 0   # IMDb votes for those movies, a proxy for movie popularity
        for film in filmography:
            ia.update(film, 'vote details')
            if 'rating' not in film.keys():
                continue
            film_year = film.data.get('year')
            # skip movies without a year and movies released after the movie being scored
            if film_year is None or film_year > year:
                continue
            count += 1
            rating_total += film.data['rating']
            # votes are down-weighted by the age of the movie; 1.0 forces true division
            total_votes += film.data.get('votes', 0) * (1.0 / (year - film_year + 1))
        final_power = rating_calculator(count, rating_total, total_votes)
        cast_rating[director] = final_power
        totalpower += final_power
        print("%s %s" % (cast_rating, totalpower))

    return (cast_rating, totalpower)

# Function:  rating_calculator
# NOTE: this cell redefines the rating_calculator from the starpower cell above;
# being the later definition, this is the one used once both cells have run.
# Purpose:  To combine a person's movie count, ratings and votes into a single score
# Parameters:
    # count:  number of movies counted
    # avg_rating:  SUM of the ratings of those movies (it is divided by count here)
    # total_votes:  (weighted) number of votes for those movies
# Returns 0 if any input is 0, otherwise log(count) + 0.3 * mean rating + total_votes / 1,000,000
def rating_calculator(count, avg_rating, total_votes):
    if count == 0 or avg_rating == 0 or total_votes == 0:
        return 0
    # float() keeps the mean exact even when the rating sum and count are both ints
    mean_rating = float(avg_rating) / count
    return math.log(count) + mean_rating * 0.3 + total_votes * 0.000001



In [4]:

    
### Movie Attribute functions ##

# Function:  get_releasedate
# Purpose:  get the USA release date given a movieobj
# Parameters: 
    # movieobj:  a single IMDBmovie object
# Returns: the first USA release date listed for the movie as a datetime.date,
    # or np.nan if there is no USA release date that can be parsed
def get_releasedate(movieobj):
    try:
        ia.update(movieobj, 'release dates')
        release_dates = movieobj.data['release dates']
        if not isinstance(release_dates, (list, tuple)):
            release_dates = [release_dates]
        for entry in release_dates:
            if "USA::" not in entry:
                continue
            # the text after "USA::" is parsed as "<day> <month name> <year>";
            # anything after a further "::" (a trailing note) is ignored
            date_text = entry.split("USA::", 1)[1].split("::")[0]
            day, month, year = date_text.split(" ")[:3]
            return datetime.strptime(year + "-" + month + "-" + day, "%Y-%B-%d").date()
    except Exception:
        # best effort: lookup and parsing problems mean "unknown date"
        # (KeyboardInterrupt is deliberately not caught so a long run can be stopped)
        pass
    return np.nan

# Function:  get_mpaa
# Purpose:  get the mpaa rating given a movieobj
# Parameters: 
    # movieobj:  a single IMDBmovie object
# Returns: the word that follows "Rated " in the movie's mpaa text (e.g. "PG-13"),
    # or np.nan if the movie has no mpaa text in that form
def get_mpaa(movieobj):
    try:
        mpaa_text = str(movieobj.data['mpaa'])
        return mpaa_text.split("Rated ", 1)[1].split(" ")[0]
    except Exception:
        # missing key / unexpected text -> unknown rating
        return np.nan

# Function:  get_genres
# Purpose:  get the list of genres given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  the value stored under 'genres' in movieobj.data,
    # or np.nan if it cannot be read
def get_genres(movieobj):
    try:
        return movieobj.data['genres']
    except Exception:
        # no genre information available for this movie
        return np.nan

# Function:  get_runtime
# Purpose:  get the runtime given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  the first runtime listed for the movie as an int, or 0 if it cannot be read.
    # If the runtime is not a plain number it is split on ':' and the first of its
    # first three fields that is a whole number is used (e.g. "USA:104" gives 104)
def get_runtime(movieobj):
    try:
        runtime = movieobj.data['runtimes'][0]
        try:
            return int(runtime)
        except (TypeError, ValueError):
            pass
        for field in runtime.split(':')[:3]:
            try:
                return int(field)
            except ValueError:
                continue
    except Exception:
        # no runtimes listed, or a value that cannot be split -> fall through to 0
        pass
    return 0

# Function:  get_starpower
# Purpose:  calculate rating of how well-known movie cast is
# Parameters:
    # movieobj: a single IMDBmovie object
    # year: year of the movie; an int or a string holding an int
# Returns:  the total score computed by starpower() for the movie's cast,
    # or 0 if it could not be computed
# NOTE: starpower() looks up every cast member's filmography on IMDb, so this call is slow.
#%run 'Starpower.ipynb'
def get_starpower(movieobj, year):
    try:
        # starpower() reads the cast from the movie object itself, so the whole object
        # is passed along. The result is held in a differently named local: assigning
        # to a local called "starpower" would hide the function and make the call fail.
        result = starpower(movieobj, int(year))
    except Exception:
        return 0
    # starpower() returns (per-person scores, total) on success and 0 when there is no cast
    if isinstance(result, tuple):
        return result[1]
    return result

# Function:  get_director
# Purpose:  get the list of directors given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  a list with the personID of each director listed for the movie,
    # or an empty list if the directors cannot be read
def get_director(movieobj):
    try:
        return [person.personID for person in movieobj.data['director']]
    except Exception:
        # no director information available for this movie
        return []

# Function:  get_keywords
# Purpose:  get the list of keywords given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  the value stored under 'keywords' in movieobj.data after asking IMDb
    # for the movie's keywords, or np.nan if it cannot be retrieved
def get_keywords(movieobj):
    try:
        ia.update(movieobj, 'keywords')
        return movieobj.data['keywords']
    except Exception:
        # best effort: lookup problems mean "no keywords"
        # (KeyboardInterrupt is deliberately not caught so a long run can be stopped)
        return np.nan



In [5]:

    
# Function:  find_movie
# Purpose: to sort through possible IMDB movie objects and find just one
# Parameters:  
    #title: title of movie
    #year:  year of movie (an int or a string holding an int)
    #mlist: list of possible IMDB movie objects
# Returns the full IMDB movie object (fetched with ia.get_movie) of the chosen candidate,
    # or None if no candidate's year is within two years of the given year
def find_movie(title, year, mlist):
    try:
        target_year = int(year)
    except (TypeError, ValueError):
        # without a usable year no candidate can be matched
        return None

    # keep the movies that came out in the same year, or up to two years off
    year_list = []
    for movie in mlist:
        try:
            if movie.data['year'] - target_year in range(-2, 3):
                year_list.append(movie)
        except Exception:
            # candidate without a usable year: it cannot be matched
            continue

    # if the years do not match, there is no match
    if not year_list:
        return None
    if len(year_list) == 1:
        return ia.get_movie(year_list[0].movieID)

    # Several candidates: compare letters. The letters of each title are sorted and the
    # spaces dropped; a candidate scores one point per position that agrees with the
    # wanted title, and only candidates with the same number of letters can score.
    wanted_letters = "".join(sorted(title)).replace(" ", "")
    counts = []
    for candidate in year_list:
        candidate_letters = "".join(sorted(candidate['title'])).replace(" ", "")
        score = 0
        if len(candidate_letters) == len(wanted_letters):
            # if the title cannot be converted to a string it is not the correct title
            try:
                candidate_letters = str(candidate_letters)
            except UnicodeError:
                candidate_letters = None
            if candidate_letters is not None:
                score = sum(1 for wanted, found in zip(wanted_letters, candidate_letters)
                            if wanted == found)
        counts.append(score)

    # highest score wins; on a tie the earliest candidate in the list is kept
    best = counts.index(max(counts))
    return ia.get_movie(year_list[best].movieID)



In [6]:

    
# Function:  find_movieobj
# Purpose:  To convert a tuple of movie's information into one IMDB movie object
# Parameters:  
    # movie_tuple:  a tuple in the format (English Title, Sort Title, Sort Title + yr, yr)
# Returns: IMDB movie object, or None if none of the searches produced a match
def find_movieobj(movie_tuple):
    title = movie_tuple[0]           # English Title (1st choice)
    if isinstance(title, int):       # a title made only of digits may have been read as an int
        title = str(title)
    year = movie_tuple[3]            # movie year

    # Attempt 1: English Title matched against the search results for the English Title
    movieobj = find_movie(title, year, ia.search_movie(title))
    if movieobj is not None:
        return movieobj

    alt_title = movie_tuple[1]       # Sort Title (alternate choice)
    if isinstance(alt_title, int):
        alt_title = str(alt_title)
    alt_results = ia.search_movie(alt_title)

    # Attempt 2: English Title matched against the search results for the Sort Title
    movieobj = find_movie(title, year, alt_results)
    if movieobj is not None:
        return movieobj

    # Attempt 3: Sort Title matched against the search results for "Sort Title (yr)"
    movieobj = find_movie(alt_title, year, ia.search_movie(movie_tuple[2]))
    if movieobj is not None:
        return movieobj

    # Attempt 4: Sort Title matched against the search results for the Sort Title
    return find_movie(alt_title, year, alt_results)



In [7]:

    
# Function:  make_moviedict
# Purpose:  To convert movieobj and movie_tuple into a dictionary within AAdict
# Parameters:  
    # movieobj:  a single IMDBmovie object
    # movie_tuple:  a tuple in the format (English Title, Sort Title, Sort Title + yr, yr)
    # rewrite: if True, will rewrite an existing key in AAdict, if exists
# Returns False (if there is no movieobj, or the movie is already in AAdict and rewrite
    # is False) or True if the movie's dictionary was stored in AAdict
# Raises IndexError if AAcsv has no row whose 'English Title' equals the tuple's title;
    # in that case AAdict is left unchanged
def _first_nominee(rows, award):
    # first 'Nominee(s)' value among the rows of the given award category, or False if none
    nominees = list(rows[rows['Category'] == award]['Nominee(s)'])
    if nominees:
        return nominees[0]
    return False

def make_moviedict(movieobj, movie_tuple, rewrite=False):
    if movieobj is None:
        return False
    ## Get movie id ##
    movid = movieobj.movieID
    # An existing entry is only replaced when rewrite is True
    if not rewrite and movid in AAdict:
        return False

    # Select the movie's rows once (matched on English Title) instead of per key
    title = movie_tuple[0]
    movie_rows = AAcsv[AAcsv['English Title'] == title]
    winner_rows = movie_rows[movie_rows['Winner?'] == 1]

    # The entry is built locally and only stored in AAdict once it is complete,
    # so a failure part-way through does not leave a half-filled entry behind
    entry = {}
    # "title": title of movie
    entry['title'] = title
    # "nominations": list of Oscar nominations
    entry['nominations'] = list(movie_rows['Category'])
    # "won": list of Oscars won
    entry['won'] = list(winner_rows['Category'])
    # "year": 'yr' of the movie's first row in AAcsv
    entry['year'] = list(movie_rows['yr'])[0]
    # "country": country of movie
    entry['country'] = list(movie_rows['Country'])[0]
    # "releasedate": USA movie release date
    entry['releasedate'] = get_releasedate(movieobj)
    # "mpaa": mpaa rating for the movie (i.e. R, PG-13, PG, G)
    entry['mpaa'] = get_mpaa(movieobj)
    # "genres": list of genres
    entry['genres'] = get_genres(movieobj)
    # "runtime": runtime
    entry['runtime'] = get_runtime(movieobj)
    # "starpower": rating of the movie cast
    entry['starpower'] = get_starpower(movieobj, entry['year'])
    # "director": list of directors
    entry['director'] = get_director(movieobj)
    # "keywords": list of keywords
    entry['keywords'] = get_keywords(movieobj)
    # make each award an individual key
    for award in awards:
        # "Nominated award_name": first nominee listed for the movie in that category,
        # or False if the movie was not nominated for it
        entry["Nominated %s" % award] = _first_nominee(movie_rows, award)
        # "Won award_name": nominee of the winning row in that category,
        # or False if the movie did not win it
        entry["Won %s" % award] = _first_nominee(winner_rows, award)

    AAdict[movid] = entry
    return True

Note: You can skip the next several cells and instead run the pickle.load cell near the top of this notebook to get the complete movie dictionary (AAdict). I ended up building it in two pieces because it took a long time to run on the full file and would sometimes time out.



In [8]:

    
# Set up the three objects used while building the Academy Awards Dictionary ("AAdict")
# AAdict is a dict of dicts keyed by IMDB movie IDs; each movie id dict has keys
# containing information about the movie & Academy Award information

# Unique movies in the Academy Awards DF ("AAcsv"): one tuple per distinct combination of
# (English Title, Sort Title, titleyr, yr), stored in "AAuniquemovies"
AAuniquemovies = list(set(zip(*(AAcsv[column] for column in ('English Title', 'Sort Title', 'titleyr', 'yr')))))

# AAdict starts out empty
AAdict = dict()

# Movies that were not added to AAdict are collected here
# (e.g. those for which no IMDB movie object was found, i.e. movieobj = None)
AAmissingmovies = []



In [9]:

    
# loop through all movies and save in AAdict
for movie_tuple in AAuniquemovies:
    ## STEP 1:  Get movieobj of movie using find_movieobj
    movieobj = find_movieobj(movie_tuple)
    ## STEP 2:  Append to AAdict using make_moviedict
    added = make_moviedict(movieobj, movie_tuple)
    # make_moviedict returns False when nothing was added; remember those movies
    if added is False:
        AAmissingmovies.append(movie_tuple)

Check

Check to see what movies are missing from AAdict compared to the unique movies from AAcsv



In [11]:

    
# CHECK
#print "number of movies in AAdict:", len(AAdict.keys())
#print "number of movies in AAcsv:", len(AAuniquemovies)
#print "number of movies missing from AAdict:", len(AAmissingmovies)

#print "movies missing from AAdict:"
#for missingmovie in AAmissingmovies:
#    print (missingmovie[2], AAuniquemovies.index(missingmovie))
#x = (u'Adam', u'Adam', u'Adam (1992)', '1992')

Manually search remaining movies



In [12]:

    
# Add movies that are missing from AAdict by fetching them from IMDb by id

# Movies missing altogether from AAdict
# Each pair is (IMDB movie id, index of the movie's tuple in AAuniquemovies).
# NOTE(review): AAuniquemovies is built with list(set(...)), so these positions are only
# valid for the ordering produced in the run where they were looked up - re-check the
# indices (e.g. with the CHECK cell above) before relying on them in a new session.
missingids = [('0102997', 48),('0083293', 481),('0092999', 580),('0091021', 675),('0130860', 1301), ('0101270',303), ('0101272', 1172)]
# Titles of the movies concerned (presumably listed in the same order as the ids above; verify):
#missingids = list(Strings (1991), Violet (1981), 
    #Eyes on the Prize: America's Civil Rights Years/Bridge to Freedom 1965 (1987),
    #Exit (1986), Mermaid (1997)), Adam(1992) - misclassified, Addams Family (1991)
    
for missingid in missingids:
    ## STEP 1:  Get movieobj of movie directly from its IMDB id
    movieobj = ia.get_movie(missingid[0])
    ## STEP 2:  Append to AAdict using make_moviedict
    # rewrite=True replaces any entry already stored under this movie id
    added = make_moviedict(movieobj, AAuniquemovies[missingid[1]], rewrite=True)



In [41]:

    
# Hand checked every repeat, the following are okay and don't need manipulation:
# ('Triplets of Belleville (2003)', '0286244')
# ('WarGames (1983)', '0086567')
# ('Pelle the Conqueror (1988)','0093713')

# These movies already existed in AAdict but were missing nominations/winning information
# due to them being under different names (ex:  "Goodfellas" versus "Good fellas")
# NOTE: this cell appends to the 'nominations'/'won' lists, so run it only once per AAdict

# ('Remains of the Day (1993)', '0107943')
AAdict['0107943']["Nominated Best Costume Design"] = u'Jenny Beavan, John Bright'
AAdict['0107943']['nominations'].append(u'Best Costume Design')

# ('Cyrano De Bergerac (1990)', '0099334')
AAdict['0099334']["Nominated Best Actor"] = u'Gerard Depardieu'
AAdict['0099334']["Nominated Best Art Direction"] = u'Ezio Frigerio (Art Direction); Jacques Rouxel (Set Decoration)'
AAdict['0099334']["Nominated Best Costume Design"] = u'Franca Squarciapino'
AAdict['0099334']["Won Best Costume Design"] = u'Franca Squarciapino'
AAdict['0099334']["Nominated Best Makeup"] = u'Michèle Burke, Jean-Pierre Eychenne'
# one list entry per "Nominated ..." key set above
AAdict['0099334']['nominations'].extend([u'Best Actor', u'Best Art Direction', u'Best Costume Design', u'Best Makeup'])
AAdict['0099334']['won'].extend([u'Best Costume Design'])

# ('Greystoke: the Legend of Tarzan, Lord of the Apes (1984)', '0087365')
AAdict['0087365']["Nominated Best Writing, Adapted Screenplay"] = u'P.H. Vazak, Michael Austin'
AAdict['0087365']['Nominated Best Makeup'] = u'Rick Baker, Paul Engelen'
AAdict['0087365']['nominations'].extend([u'Best Writing, Adapted Screenplay', u'Best Makeup'])

# ('Enemies: A Love Story (1989)','0097276')
# two nominees in the same category: the value is a tuple and the category is listed twice
AAdict['0097276']['Nominated Best Supporting Actress'] = u'Lena Olin', u'Anjelica Huston'
AAdict['0097276']['nominations'].extend([u'Best Supporting Actress', u'Best Supporting Actress'])

# ('Goodfellas (1990)','0099685')
AAdict['0099685']['Nominated Best Picture'] = u'Irwin Winkler (Producer)'
AAdict['0099685']['Nominated Best Supporting Actor'] = u'Joe Pesci'
AAdict['0099685']['Won Best Supporting Actor'] = u'Joe Pesci'
AAdict['0099685']['Nominated Best Supporting Actress'] = u'Lorraine Bracco'
AAdict['0099685']['Nominated Best Writing, Adapted Screenplay'] = u'Nicholas Pileggi, Martin Scorsese'
AAdict['0099685']['nominations'].extend([u'Best Picture', u'Best Supporting Actor', u'Best Supporting Actress', u'Best Writing, Adapted Screenplay'])
AAdict['0099685']['won'].extend([u'Best Supporting Actor'])



In [39]:

    
# To Save new AAdict, run this cell
#filename = 'AAdict.p'
#pickle.dump(AAdict, io.open(filename,'wb'))

Convert to AAdf

PICK BACK UP HERE

convert AAdict to AAdf pandas dataframe



In [56]:

    
# convert AAdict to pandas: one row per IMDB movie id, one column per key of the movie dicts
AAdf = pd.DataFrame.from_dict(AAdict).transpose()
AAdf['movieid'] = AAdf.index


# hand-code genres for one movie that was missing genre info
# .at writes the list into AAdf itself; assigning through AAdf.loc[...].genres can end up
# changing a temporary copy of the row instead. The membership check keeps .at from
# adding a new, otherwise empty row when this movie id is not in the loaded AAdict.
if '5152218' in AAdf.index:
    AAdf.at['5152218', 'genres'] = ["Horror", "Romance"]

# Create new columns
# winner: 1 if the movie won at least one award, otherwise 0
AAdf['winner'] = AAdf['won'].apply(lambda x: len(x)!=0) * 1
# convert years to ints
AAdf['year'] = AAdf['year'].apply(lambda x: int(x))

AAdf.head()
#AAdf[AAdf['Nominated Best Actor']==1].head()









    Out[56]:






  
    
      
      Nominated Best Actor
      Nominated Best Actress
      Nominated Best Animated Feature Film
      Nominated Best Art Direction
      Nominated Best Cinematography
      Nominated Best Costume Design
      Nominated Best Director
      Nominated Best Documentary, Feature
      Nominated Best Documentary, Short Subject
      Nominated Best Film Editing
      ...
      keywords
      mpaa
      nominations
      releasedate
      runtime
      title
      won
      year
      movieid
      winner
    
  
  
    
      0035423
                False
                False
       False
                                                   False
       False
       False
             False
                                                   False
       False
       False
      ...
       [time-travel, brooklyn-bridge, bridge, time-tr...
       PG-13
                                      [Best Music, Song]
       2001-12-25
       118
                               Kate & Leopold
       []
       2001
       0035423
       0
    
    
      0080388
       Burt Lancaster
       Susan Sarandon
       False
                                                   False
       False
       False
       Louis Malle
                                                   False
       False
       False
      ...
       [drugs, gangster, camera-shot-of-feet, female-...
         NaN
       [Best Picture, Best Actor, Best Actress, Best ...
       1981-04-03
       104
                                Atlantic City
       []
       1981
       0080388
       0
    
    
      0080855
                False
                False
       False
       Tambi Larsen (Art Direction); Jim Berkey (Set ...
       False
       False
             False
                                                   False
       False
       False
      ...
       [immigrant, sheriff, 1890s, johnson-county-war...
         NaN
                                    [Best Art Direction]
       1980-11-18
       149
                                Heaven's Gate
       []
       1981
       0080855
       0
    
    
      0081974
          Paul Newman
                False
       False
                                                   False
       False
       False
             False
                                                   False
       False
       False
      ...
       [murder, newspaper, mafia, reporter, slander, ...
         NaN
       [Best Actor, Best Supporting Actress, Best Wri...
       1981-11-19
       116
                            Absence of Malice
       []
       1981
       0081974
       0
    
    
      0081988
                False
                False
       False
                                                   False
       False
       False
             False
       Suzanne Bauman, Paul Neshamkin, Jim Burroughs ...
       False
       False
      ...
                                                     NaN
         NaN
                             [Best Documentary, Feature]
              NaN
        60
       Against Wind and Tide: A Cuban Odyssey
       []
       1981
       0081988
       0
    
  

5 rows × 74 columns



In [60]:

    
# Foreign film nominees were blank in the source data, so a nominated movie does not hold
# a nominee name in this column. Recode the column as a plain True/False flag instead:
# False stays False (not nominated); the integer 0 and every value that is not equal to 0
# become True (nominated). 0 and False compare equal, so they are told apart by their text.
# Done in one assignment on AAdf itself rather than through AAdf[col].loc[...] = ...,
# which may modify a temporary copy of the column.
AAdf['Nominated Best Foreign Language Film'] = AAdf['Nominated Best Foreign Language Film'].apply(
    lambda x: bool(x != 0) or str(x) == "0")

Import and Split Data

First we split our data into a validation set (movies in 2006) and a training set (movies 1981-2005). We will use k-fold cross validation to train our model. In this preliminary analysis, we train our model to predict Oscar winners given the movie was nominated.



In [62]:

    
# Functions

# Map a release date, already split on '-' into its parts, to the quarter of the year.
# monthint is the list of parts (year, month, day); a one-element list (a date that
# could not be split, such as a missing value) gives 0, otherwise the result is 1-4.
def get_quarter(monthint):
    if len(monthint) == 1:
        return 0
    month = int(monthint[1])
    if month <= 3:
        return 1
    if month <= 6:
        return 2
    if month <= 9:
        return 3
    return 4
# Map a release date, already split on '-' into its parts, to its month number.
# A one-element list (a date that could not be split) gives 0.
def get_month(monthint):
    return 0 if len(monthint) == 1 else int(monthint[1])

# convert the string of countries into countries
def get_countries(countrylist):
    """Split a '/'-separated country string into a list of country names.

    countrylist -- either a value equal to 0 or a string such as 'UK / France'.

    A value equal to 0 yields [u'USA'].  Otherwise every '/'-separated piece
    is returned with all of its spaces removed.
    """
    if countrylist == 0:
        return [u'USA']
    return [piece.replace(" ", "") for piece in countrylist.split('/')]



In [63]:

    
%%time
# Dealing with Categorical variables & creating other descriptive variables

# mpaa is ordinal, convert to ordinal 'mpaaint'
# Start from a copy of the raw rating column, then overwrite the known codes.
AAdf['mpaaint'] = AAdf['mpaa']
# Each write goes through a single AAdf.loc[row_mask, column] call; the chained
# form AAdf['mpaaint'].loc[mask] = value may modify a temporary object instead
# of AAdf (pandas "SettingWithCopy" problem).
for rating, rank in (('R', 3), ('PG-13', 2), ('PG', 1)):
    AAdf.loc[AAdf['mpaaint'] == rating, 'mpaaint'] = rank
# a missing rating is coded as 0
AAdf.loc[pd.isnull(AAdf['mpaaint']), 'mpaaint'] = 0
# NOTE(review): any rating other than R / PG-13 / PG is left unchanged here --
# confirm whether further codes need an ordinal value.

# number of nominations per movie = length of its 'nominations' entry
AAdf.loc[:,'numnominations'] = AAdf['nominations'].apply(len)

# split the string form of every release date on '-' once and reuse the pieces
release_parts = AAdf['releasedate'].apply(lambda x: str(x).split('-'))

# release quarter, derived from the month piece by get_quarter
AAdf.loc[:,'quarter'] = release_parts.apply(get_quarter)

# release month, derived from the month piece by get_month
AAdf.loc[:,'month'] = release_parts.apply(get_month)

# turn each 'country' value into a list of country names via get_countries
AAdf.loc[:,'countrylist'] = AAdf['country'].apply(get_countries)

# Collect the set of all genres and count how many movies mention each
# country and each keyword.
countries_dict = {}
uniquegenres = set()
keywords_dict = {}
for _,movie in AAdf.iterrows():
    if type(movie['countrylist']) == list:
        for country in movie['countrylist']:
            # dict.get avoids the linear `in d.keys()` scan of Python 2
            countries_dict[country] = countries_dict.get(country, 0) + 1
    uniquegenres.update(movie.genres)
    if type(movie.keywords) == list:
        for keyword in movie.keywords:
            keywords_dict[keyword] = keywords_dict.get(keyword, 0) + 1

# Shorten the countries and keywords dictionaries to the most common entries
# to reduce dimensionality.  Entries whose count is at or below these limits
# are dropped.
RARE_COUNTRY_MAX_COUNT = 10
RARE_KEYWORD_MAX_COUNT = 200

numother = 0
# Iterate over a snapshot of the keys: deleting from a dict while iterating
# over its live key view raises RuntimeError on Python 3.
for country in list(countries_dict.keys()):
    if countries_dict[country] <= RARE_COUNTRY_MAX_COUNT:
        numother = numother + countries_dict[country]
        del countries_dict[country]
# the dropped countries are pooled into a single bucket
countries_dict['OtherCountry/Unknown'] = numother
for keyword in list(keywords_dict.keys()):
    if keywords_dict[keyword] <= RARE_KEYWORD_MAX_COUNT:
        del keywords_dict[keyword]
        
# create dummy variables for countries, genres, and keywords
# Every dummy column starts at 0 and is switched to 1 row by row below.
for country in countries_dict:
    AAdf.loc[:,country] = 0
for genre in uniquegenres:
    AAdf.loc[:,genre] = 0
for keyword in keywords_dict:
    AAdf.loc[:,keyword] = 0

# make sure the catch-all country column exists and starts at 0
AAdf['OtherCountry/Unknown'] = 0

# Build the lookup set once; the retained countries do not change while the
# rows are being filled in, so rebuilding it per country per row is wasted work.
kept_countries = set(countries_dict.keys())

for index,movie in AAdf.iterrows():
    if type(movie.countrylist) == list:
        for country in set(movie.countrylist):
            if country in kept_countries:
                AAdf.loc[index,country] = 1
            else:
                # a country without its own column goes to the catch-all bucket
                AAdf.loc[index,'OtherCountry/Unknown'] = 1
    if type(movie.genres) == list:
        # one set intersection instead of rebuilding set(movie.genres) per genre
        for genre in uniquegenres.intersection(movie.genres):
            AAdf.loc[index,genre] = 1
    if type(movie.keywords) == list:
        # only keywords that have a dummy column (keys of keywords_dict) are flagged
        for keyword in set(movie.keywords):
            if keyword in keywords_dict:
                AAdf.loc[index,keyword] = 1









    



CPU times: user 3min 11s, sys: 1.31 s, total: 3min 12s
Wall time: 3min 14s



In [64]:

    
# Convert the dataframe into a plain dict, keyed by the row index, so that it is easier to pickle.
# Note: we were having trouble loading pickled pandas objects on non-Apple computers and found dicts didn't have this problem
AAdictfinal = AAdf.T.to_dict()

This is the final version that we need! We will rerun this in oscar_process_notebook.



In [65]:

    
# To save a new AAdictfinal, run the following (AAdictfinal is the dict used in oscar_process_notebook)
#filename = 'AAdictfinal.p'
#pickle.dump(AAdictfinal, io.open(filename,'wb'))

# To reload AAdictfinal, run the following:
#AAdictfinal = pickle.load(open('AAdictfinal.p','rb'))

Now we are ready to do some analysis. Click here to go to oscar_process_notebook: https://github.com/oscarpredictor/oscar-predictor/blob/master/oscar_process_notebook.ipynb

	Original Title	English Title	Sort Title	Year	Award	Category	Winner?	Nominee(s)	__ Academy Awards	Item	yr	titleyr
5924	Atlantic City	Atlantic City	Atlantic City	1981	Academy Award	Best Picture	0	Denis Heroux, John Kemeny (Producers)	54th	5925	1981	Atlantic City (1981)
5925	Chariots of Fire	Chariots of Fire	Chariots of Fire	1981	Academy Award	Best Picture	1	David Puttnam (Producer)	54th	5926	1981	Chariots of Fire (1981)
5926	On Golden Pond	On Golden Pond	On Golden Pond	1981	Academy Award	Best Picture	0	Bruce Gilbert (Producer)	54th	5927	1981	On Golden Pond (1981)
5927	Raiders of the Lost Ark	Indiana Jones and the Raiders of the Lost Ark	Raiders of the Lost Ark	1981	Academy Award	Best Picture	0	Frank Marshall (Producer)	54th	5928	1981	Raiders of the Lost Ark (1981)
5928	Reds	Reds	Reds	1981	Academy Award	Best Picture	0	Warren Beatty (Producer)	54th	5929	1981	Reds (1981)

	Nominated Best Actor	Nominated Best Actress	Nominated Best Animated Feature Film	Nominated Best Art Direction	Nominated Best Cinematography	Nominated Best Costume Design	Nominated Best Director	Nominated Best Documentary, Feature	Nominated Best Documentary, Short Subject	Nominated Best Film Editing	...	keywords	mpaa	nominations	releasedate	runtime	title	won	year	movieid
0035423	False	False	False	False	False	False	False	False	False	False	...	[time-travel, brooklyn-bridge, bridge, time-tr...	PG-13	[Best Music, Song]	2001-12-25	118	Kate & Leopold	[]	2001	0035423
0080388	Burt Lancaster	Susan Sarandon	False	False	False	False	Louis Malle	False	False	False	...	[drugs, gangster, camera-shot-of-feet, female-...	NaN	[Best Picture, Best Actor, Best Actress, Best ...	1981-04-03	104	Atlantic City	[]	1981	0080388
0080855	False	False	False	Tambi Larsen (Art Direction); Jim Berkey (Set ...	False	False	False	False	False	False	...	[immigrant, sheriff, 1890s, johnson-county-war...	NaN	[Best Art Direction]	1980-11-18	149	Heaven's Gate	[]	1981	0080855
0081974	Paul Newman	False	False	False	False	False	False	False	False	False	...	[murder, newspaper, mafia, reporter, slander, ...	NaN	[Best Actor, Best Supporting Actress, Best Wri...	1981-11-19	116	Absence of Malice	[]	1981	0081974
0081988	False	False	False	False	False	False	False	Suzanne Bauman, Paul Neshamkin, Jim Burroughs ...	False	False	...	NaN	NaN	[Best Documentary, Feature]	NaN	60	Against Wind and Tide: A Cuban Odyssey	[]	1981	0081988