Make AAdict


In [1]:
from imdb import IMDb
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import cPickle as pickle
ia = IMDb(accessSystem='http')
from collections import defaultdict 
import io
from datetime import datetime
import time
import math

Just run the following cell to open AAdict


In [59]:
# Run this cell instead of rerunning a portion of this notebook.
# By loading AAdict, you can skip the cells until you see the note "PICK BACK UP HERE"
# NOTE: unpickling can execute arbitrary code, so only load an AAdict.p that you created yourself.
# The with-block closes the file handle as soon as the dictionary has been read.
with open('AAdict.p', 'rb') as aadict_file:
    AAdict = pickle.load(aadict_file)

In [3]:
# Load the Academy Awards spreadsheet into a dataframe (AAcsv)
AAcsv = pd.read_excel("Academy_Awards_2006.xls")

# Derived columns
# 'Year' is stored as text so that its first four characters can be sliced off
AAcsv['Year'] = AAcsv['Year'].values.astype(str)
# "yr": first four characters of Year
AAcsv['yr'] = [year_text[:4] for year_text in AAcsv['Year']]
# "titleyr": "Sort Title (yr)"; handy as a query string for ia.search_movie
AAcsv['titleyr'] = ['%s (%s)' % (sort_title, year_text[:4])
                    for sort_title, year_text in zip(AAcsv['Sort Title'], AAcsv['Year'])]
# 'Winner?' is marked with "X" in the spreadsheet; recode it as 1 (winner) / 0 (not a winner)
AAcsv['Winner?'] = (AAcsv['Winner?'] == 'X') * 1

# Keep only rows that name a specific film and whose yr is 1981 or later
names_a_film = AAcsv['Sort Title'] != "[no specific film title]"
AAcsv = AAcsv[names_a_film].copy()
from_1981_on = AAcsv["yr"] >= "1981"
AAcsv = AAcsv[from_1981_on].copy()

# "awards": every distinct award category that remains
# Used when building AAdict to flag which awards each movie was nominated for / won
awards = list(set(AAcsv['Category']))

# View head of AAcsv
AAcsv.head()


Out[3]:
Original Title English Title Sort Title Year Country Award Category Winner? Nominee(s) __ Academy Awards Item yr titleyr
5924 Atlantic City Atlantic City Atlantic City 1981 0 Academy Award Best Picture 0 Denis Heroux, John Kemeny (Producers) 54th 5925 1981 Atlantic City (1981)
5925 Chariots of Fire Chariots of Fire Chariots of Fire 1981 0 Academy Award Best Picture 1 David Puttnam (Producer) 54th 5926 1981 Chariots of Fire (1981)
5926 On Golden Pond On Golden Pond On Golden Pond 1981 0 Academy Award Best Picture 0 Bruce Gilbert (Producer) 54th 5927 1981 On Golden Pond (1981)
5927 Raiders of the Lost Ark Indiana Jones and the Raiders of the Lost Ark Raiders of the Lost Ark 1981 0 Academy Award Best Picture 0 Frank Marshall (Producer) 54th 5928 1981 Raiders of the Lost Ark (1981)
5928 Reds Reds Reds 1981 0 Academy Award Best Picture 0 Warren Beatty (Producer) 54th 5929 1981 Reds (1981)

In [10]:
# Because this needs to call the IMDbPY database many times, it takes a very long time to run on our entire dataset.
# It can be used in future work on this analysis.

# Function:  starpower
# Purpose:  To score how well-known a movie's cast was as of a given year
# Parameters:
    # movieobj:  a single IMDBmovie object (its data['cast'] list is read)
    # year:  cutoff year, as an int or anything int() accepts; only films from that year or earlier count
# Returns 0 if the movie has no cast data, otherwise the tuple (cast_rating, totalpower), where
    # cast_rating maps each cast member to the score from rating_calculator and totalpower is the sum of those scores

def starpower(movieobj, year):
    try:
        movie_cast = movieobj.data['cast']
    except Exception:
        # no cast information on this movie object
        return 0
    # make_moviedict stores the year as a string; the arithmetic below needs a number
    year = int(year)
    cast_rating = {}
    totalpower = 0
    for actorperson in movie_cast:
        print(actorperson)
        ia.update(actorperson, 'filmography')
        if 'actor' in actorperson.keys():
            temp_movie_list = actorperson['actor']  # The filmography of a given actor
        elif 'actress' in actorperson.keys():
            temp_movie_list = actorperson['actress']
        else:
            temp_movie_list = []
        count = 0        # number of rated movies they have been in up to `year`
        avg_rating = 0   # running SUM of those movies' ratings (rating_calculator divides by count)
        total_votes = 0  # age-discounted IMDb votes for those movies, a proxy for movie popularity
        for j in temp_movie_list:
            ia.update(j, 'vote details')
            if 'rating' in j.keys():
                if j.data['year'] <= year:
                    count += 1
                    avg_rating += j.data['rating']
                    # 1.0 forces true division: under Python 2, 1/(n) is 0 for every film older than `year`,
                    # which silently removed all but same-year films from the vote total
                    total_votes += j.data['votes'] * (1.0 / (year - j.data['year'] + 1))
        final_power = rating_calculator(count, avg_rating, total_votes)
        cast_rating[actorperson] = final_power
        totalpower += final_power
        print("%s %s" % (cast_rating, totalpower))
    return (cast_rating, totalpower)

# Function:  rating_calculator
# Purpose:  combine one person's film count, summed ratings and weighted votes into a single score
# Parameters:
    # count:  number of films counted
    # avg_rating:  SUM of the ratings of those films (it is divided by count here)
    # total_votes:  weighted vote total for those films
# Returns:  0 if any input equals 0, otherwise log(count) + 0.3 * mean rating + total_votes / 1,000,000
def rating_calculator(count, avg_rating, total_votes):
    if count == 0 or avg_rating == 0 or total_votes == 0:
        return 0
    mean_rating = avg_rating / count
    return math.log(count) + mean_rating * 0.3 + total_votes * 0.000001

In [10]:
# Because this needs to call the IMDbPY database many times, it takes a very long time to run on our entire dataset.
# It can be used in future work on this analysis.

# Function:  directorpower
# Purpose:  To score how well-known a movie's director(s) were as of a given year
# Parameters:
    # movie_director:  an iterable of IMDb person objects (the movie's directors)
    # year:  cutoff year, as an int or anything int() accepts; only films from that year or earlier count
# Returns the tuple (cast_rating, totalpower), where cast_rating maps each person to the score from
    # rating_calculator and totalpower is the sum of those scores

def directorpower(movie_director, year):
    # The loop used to iterate over an undefined name (movie_cast) and raised NameError on every call;
    # it now walks the people that were actually passed in.
    year = int(year)
    cast_rating = {}
    totalpower = 0
    for actorperson in movie_director:
        print(actorperson)
        ia.update(actorperson, 'filmography')
        # NOTE(review): assumes IMDbPY lists directing credits under the 'director' filmography key -- confirm.
        # The acting credits are kept as a fallback because they are what this code read before.
        if 'director' in actorperson.keys():
            temp_movie_list = actorperson['director']
        elif 'actor' in actorperson.keys():
            temp_movie_list = actorperson['actor']
        elif 'actress' in actorperson.keys():
            temp_movie_list = actorperson['actress']
        else:
            temp_movie_list = []
        count = 0        # number of rated movies credited up to `year`
        avg_rating = 0   # running SUM of those movies' ratings (rating_calculator divides by count)
        total_votes = 0  # age-discounted IMDb votes for those movies, a proxy for movie popularity
        for j in temp_movie_list:
            ia.update(j, 'vote details')
            if 'rating' in j.keys():
                if j.data['year'] <= year:
                    count += 1
                    avg_rating += j.data['rating']
                    # 1.0 forces true division: under Python 2, 1/(n) is 0 for every film older than `year`
                    total_votes += j.data['votes'] * (1.0 / (year - j.data['year'] + 1))
        final_power = rating_calculator(count, avg_rating, total_votes)
        cast_rating[actorperson] = final_power
        totalpower += final_power
        print("%s %s" % (cast_rating, totalpower))

    return (cast_rating, totalpower)

# Function:  rating_calculator
# Purpose:  turn a person's film count, summed ratings and weighted votes into one score
# NOTE: this repeats the rating_calculator definition from the starpower cell above; whichever cell runs last wins
# Parameters:
    # count:  number of films counted
    # avg_rating:  SUM of the ratings of those films (divided by count below)
    # total_votes:  weighted vote total for those films
# Returns:  0 when any of the three inputs equals 0, otherwise the combined score
def rating_calculator(count, avg_rating, total_votes):
    has_history = (count != 0) and (avg_rating != 0) and (total_votes != 0)
    if not has_history:
        return 0
    return math.log(count) + (avg_rating / count) * 0.3 + total_votes * 0.000001

In [4]:
### Movie Attribute functions ##

# Function:  get_releasedate
# Purpose:  get the USA release date given a movieobj
# Parameters:
    # movieobj:  a single IMDBmovie object
# Returns: the date from the first "USA::" release-date entry as a datetime.date,
    # or np.nan if there is no such entry or it cannot be parsed
def get_releasedate(movieobj):
    try:
        ia.update(movieobj, 'release dates')
        for entry in movieobj.data['release dates']:
            if 'USA::' in entry:
                # the text after the marker is read as "<day> <month name> <year>"; anything after that is ignored
                day, month, year = entry.split('USA::', 1)[1].split(' ')[:3]
                return datetime.strptime(year + "-" + month + "-" + day, "%Y-%B-%d").date()
    except Exception:
        # best effort: a failed lookup or an unparseable date means "unknown".
        # (A bare except here would also swallow the kernel interrupt during the long build loop.)
        pass
    return np.nan

# Function:  get_mpaa
# Purpose:  get the mpaa rating given a movieobj
# Parameters:
    # movieobj:  a single IMDBmovie object
# Returns: the word that follows "Rated " in the movie's mpaa text (i.e. R, PG-13, PG, G),
    # or np.nan if the movie has no mpaa text or the text does not contain "Rated "
def get_mpaa(movieobj):
    try:
        raw = movieobj.data['mpaa']
        try:
            text = str(raw)
        except UnicodeError:
            # Python 2: str() fails on unicode text with non-ASCII characters, which used to discard
            # an otherwise valid rating; parse the unicode text directly instead
            text = raw
        return text.split("Rated ", 1)[1].split(" ")[0]
    except Exception:
        return np.nan

# Function:  get_genres
# Purpose:  get the list of genres given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  the value stored under data['genres'] (a list of genres), or np.nan if it is not available
def get_genres(movieobj):
    try:
        return movieobj.data['genres']
    except Exception:
        # Exception rather than a bare except, so a kernel interrupt is not swallowed
        return np.nan

# Function:  get_runtime
# Purpose:  get the runtime given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  the first listed runtime as an int; if that entry is not a plain number, the first of its
    # first three ':'-separated fields that is a number (e.g. "USA:104" -> 104); 0 if nothing can be read
def get_runtime(movieobj):
    try:
        runtime = movieobj.data['runtimes'][0]
    except Exception:
        return 0
    try:
        return int(runtime)
    except Exception:
        pass
    try:
        fields = runtime.split(':')
    except Exception:
        return 0
    # only the first three fields are considered
    for field in fields[:3]:
        try:
            return int(field)
        except Exception:
            continue
    return 0

# Function:  get_starpower
# Purpose:  calculate rating of how well-known movie cast is
# Parameters:
    # movieobj: a single IMDBmovie object
    # year: cutoff year passed on to starpower (an int or a numeric string)
    # compute: if False (the default) no IMDb lookups are made and 0 is returned; if True the score
        # is computed with starpower(), which the notes above describe as very slow on the full dataset
# Returns:  a number: 0 when compute is False or the score cannot be computed, otherwise the movie's total star power
# NOTE: before this fix the result was assigned to a local variable that was also named starpower, so the
# call raised UnboundLocalError, the bare except swallowed it, and the function always returned 0.
# The default compute=False keeps that always-0 result for existing callers without hiding the reason.
#%run 'Starpower.ipynb'
def get_starpower(movieobj, year, compute=False):
    if not compute:
        return 0
    try:
        # starpower() expects the movie object itself (it reads data['cast']) and a numeric year
        result = starpower(movieobj, int(year))
    except Exception:
        return 0
    # starpower() returns 0 when there is no cast, otherwise (cast_rating, totalpower)
    if isinstance(result, tuple):
        return result[1]
    return result

# Function:  get_director
# Purpose:  get the list of directors given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  a list with the personID of every entry in data['director']; an empty list if that
    # information is missing or any entry cannot be read
def get_director(movieobj):
    try:
        return [person.personID for person in movieobj.data['director']]
    except Exception:
        # Exception rather than a bare except, so a kernel interrupt is not swallowed
        return []

# Function:  get_keywords
# Purpose:  get the list of keywords given in a movieobj
# Parameters:
    # movieobj: a single IMDBmovie object
# Returns:  the value stored under data['keywords'] after asking IMDb for the 'keywords' info set,
    # or np.nan if the lookup fails or the movie has no keywords
def get_keywords(movieobj):
    try:
        ia.update(movieobj, 'keywords')
        return movieobj.data['keywords']
    except Exception:
        # Exception rather than a bare except, so a kernel interrupt is not swallowed
        return np.nan

In [5]:
# Function:  find_movie
# Purpose: to sort through possible IMDB movie objects and find just one
# Parameters:
    #title: title of movie
    #year:  year of movie (an int or a numeric string)
    #mlist: list of possible IMDB movie objects
# Returns the movie object fetched with ia.get_movie for the chosen candidate,
    # or None if no candidate was released within two years of `year`
def find_movie(title, year,  mlist):
    try:
        target = int(year)
    except Exception:
        # a year that is not a number cannot match any candidate
        return None
    # exact year first, then one and two years off in either direction
    accepted_years = [target, target + 1, target - 1, target + 2, target - 2]

    # find movies that came out in (or close to) the same year
    year_list = []
    for movie in mlist:
        try:
            if movie.data['year'] in accepted_years:
                year_list.append(movie)
        except Exception:
            # candidate without usable year information: skip it
            continue

    # if the years do not match, there is no match
    if not year_list:
        return None
    if len(year_list) == 1:
        return ia.get_movie(year_list[0].movieID)

    # Several candidates: compare the letters of each candidate title with the letters of `title`.
    # Both titles are reduced to their sorted characters with the spaces removed, and a candidate scores
    # one point per position at which the two sorted strings agree (only when they have the same length).
    sorted_title = "".join(sorted(title)).replace(" ", "")
    counts = [0] * len(year_list)
    for j, candidate in enumerate(year_list):
        sorted_mtitle = "".join(sorted(candidate['title'])).replace(" ", "")
        if len(sorted_mtitle) != len(sorted_title):
            continue
        # if the title cannot be converted to a string
        # it is not the correct title
        try:
            sorted_mtitle = str(sorted_mtitle)
        except Exception:
            continue
        counts[j] = sum(1 for ours, theirs in zip(sorted_title, sorted_mtitle) if ours == theirs)

    # the first candidate with the highest score wins (so the first candidate wins when nothing scores)
    best = counts.index(max(counts))
    return ia.get_movie(year_list[best].movieID)

In [6]:
# Function:  find_movieobj
# Purpose:  To convert a tuple of movie's information into one IMDB movie object
# Parameters:
    # movie_tuple:  a tuple in the format (English Title, Sort Title, Sort Title + (yr), yr)
# Returns: IMDB movie object, or None if none of the searches below yields a match
def find_movieobj(movie_tuple):
    title = movie_tuple[0]             # English Title (1st choice)
    if isinstance(title, int):         # a title made only of digits may arrive as an int; searches need a string
        title = str(title)
    year = movie_tuple[3]              # movie year

    # Attempt 1: English Title, matched against a search for the English Title
    movieobj = find_movie(title, year, ia.search_movie(title))
    if movieobj is not None:
        return movieobj

    alt_title = movie_tuple[1]         # Sort Title (alternate choice)
    if isinstance(alt_title, int):
        alt_title = str(alt_title)
    alt_results = ia.search_movie(alt_title)

    # Attempt 2: English Title, matched against a search for the Sort Title
    movieobj = find_movie(title, year, alt_results)
    if movieobj is not None:
        return movieobj

    # Attempt 3: Sort Title, matched against a search for "title (yr)".
    # The result of this attempt used to be thrown away, so a match found here was never returned.
    movieobj = find_movie(alt_title, year, ia.search_movie(movie_tuple[2]))
    if movieobj is not None:
        return movieobj

    # Attempt 4: Sort Title, matched against the search for the Sort Title.
    # (The former repeat of attempt 2 at this point is dropped: same arguments, same None.)
    return find_movie(alt_title, year, alt_results)

In [7]:
# Function:  make_moviedict
# Purpose:  To build the AAdict entry for one movie from its IMDB movie object and its rows in AAcsv
# Parameters:
    # movieobj:  a single IMDBmovie object (or None if no match was found)
    # movie_tuple:  a tuple in the format (English Title, Sort Title, Sort Title + (yr), yr)
    # rewrite: if True, will rewrite an existing key in AAdict, if exists
# Returns False if movieobj is None, or if the movie id is already in AAdict and rewrite is False;
    # returns True once the movie's dictionary has been stored in AAdict
def make_moviedict(movieobj, movie_tuple, rewrite=False):
    if movieobj is None:
        return False

    ## Get movie id ##
    movid = movieobj.movieID
    # Leave an existing entry alone unless the caller asked for it to be rewritten
    if not rewrite and movid in AAdict:
        return False

    title = movie_tuple[0]
    # All award rows for this movie (matched on English Title), and the subset it won
    movie_rows = AAcsv[AAcsv['English Title'] == title]
    won_rows = movie_rows[movie_rows['Winner?'] == 1]

    ## Populate dictionary, main key is movie id ##
    entry = {}
    # "title": title of movie
    entry['title'] = title
    # "nominations": list of Oscar nominations
    entry['nominations'] = list(movie_rows['Category'])
    # "won": list of Oscars won
    entry['won'] = list(won_rows['Category'])
    # "year": award year (the "yr" column), kept as a string
    entry['year'] = list(movie_rows['yr'])[0]
    # "country": country of movie
    entry['country'] = list(movie_rows['Country'])[0]
    # "releasedate": USA movie release date
    entry['releasedate'] = get_releasedate(movieobj)
    # "mpaa": mpaa rating for the movie (i.e. R, PG-13, PG, G)
    entry['mpaa'] = get_mpaa(movieobj)
    # "genres": list of genres
    entry['genres'] = get_genres(movieobj)
    # "runtime": runtime as an int
    entry['runtime'] = get_runtime(movieobj)
    # "starpower": rating of how well-known the cast is
    entry['starpower'] = get_starpower(movieobj, entry['year'])
    # "director": list of directors
    entry['director'] = get_director(movieobj)
    # "keywords": list of keywords
    entry['keywords'] = get_keywords(movieobj)

    # Make each award an individual key: the value is False when the movie was not nominated / did not
    # win, and otherwise the "Nominee(s)" text of the first matching row.
    for award in awards:
        # "Nominated award_name": False or the first nominee listed for this movie in that category
        if award in entry['nominations']:
            nominated_rows = movie_rows[movie_rows['Category'] == award]
            entry["Nominated %s" % award] = list(nominated_rows['Nominee(s)'])[0]
        else:
            entry["Nominated %s" % award] = False
        # "Won award_name": False or the nominee on the WINNING row for that category.
        # (This used to take the first nominee in the category whether or not that row was the winner,
        # so a movie with two nominees in one category could report the losing nominee as the winner.)
        if award in entry['won']:
            winning_rows = won_rows[won_rows['Category'] == award]
            entry["Won %s" % award] = list(winning_rows['Nominee(s)'])[0]
        else:
            entry["Won %s" % award] = False

    AAdict[movid] = entry
    return True

Note: You can skip the next several cells and just load the pickled AAdict (the pickle.load cell near the top of this notebook) to get the complete movie dictionary. I ended up building it in 2 pieces because it took a long time to run on the full file and would sometimes time out.


In [8]:
# Set-up for the Academy Awards Dictionary ("AAdict"), a dict of dicts
# AAdict is keyed by IMDB movie IDs
# Each movie id maps to a dict holding information about the movie & its Academy Award information

# One tuple per distinct movie in the Academy Awards DF ("AAcsv"):
# (English Title, Sort Title, titleyr, yr)
# NOTE: the hand-entered positions in the "missingids" cell further down index into this list,
# so they are only valid for the ordering this list(set(...)) produced when they were written down.
tuple_columns = ['English Title', 'Sort Title', 'titleyr', 'yr']
AAuniquemovies = list(set(zip(*[AAcsv[column] for column in tuple_columns])))

# AAdict starts out empty
AAdict = dict()

# Movies for which make_moviedict did not add an entry are collected here
AAmissingmovies = []

In [9]:
# loop through all movies and save in AAdict
# (the loop used to reference "Aauniquemovies", a misspelling of AAuniquemovies that raised NameError)
for movie_tuple in AAuniquemovies:
    ## STEP 1:  Get movieobj of movie using find_movieobj
    movieobj = find_movieobj(movie_tuple)
    ## STEP 2:  Append to AAdict using make_moviedict
    added = make_moviedict(movieobj, movie_tuple)
    # make_moviedict returns False both when no IMDB match was found and when the matched
    # movie id is already in AAdict, so this list holds repeats as well as true misses
    if added is False:
        AAmissingmovies.append(movie_tuple)

Check

Check to see what movies are missing from AAdict compared to the unique movies from AAcsv


In [11]:
# CHECK
#print "number of movies in AAdict:", len(AAdict.keys())
#print "number of unique movies in AAcsv:", len(AAuniquemovies)
#print "number of movies missing from AAdict:", len(AAmissingmovies)

#print "movies missing from AAdict:"
#for missingmovie in AAmissingmovies:
#    print (missingmovie[2], AAuniquemovies.index(missingmovie))
#x = (u'Adam', u'Adam', u'Adam (1992)', '1992')

Manually search remaining movies


In [12]:
# Hand-add the movies that are still absent from AAdict

# Each pair is (IMDb movie id, position of the movie's tuple in AAuniquemovies)
# Titles as noted when the ids were looked up:
    # Strings (1991), Violet (1981),
    # Eyes on the Prize: America's Civil Rights Years/Bridge to Freedom 1965 (1987),
    # Exit (1986), Mermaid (1997), Adam (1992) - misclassified, Addams Family (1991)
missingids = [
    ('0102997', 48),
    ('0083293', 481),
    ('0092999', 580),
    ('0091021', 675),
    ('0130860', 1301),
    ('0101270', 303),
    ('0101272', 1172),
]

for imdb_id, tuple_position in missingids:
    ## STEP 1:  fetch the movie object straight from its IMDb id
    movieobj = ia.get_movie(imdb_id)
    ## STEP 2:  store it in AAdict with make_moviedict, overwriting any existing entry
    added = make_moviedict(movieobj, AAuniquemovies[tuple_position], rewrite=True)

In [41]:
# Hand checked every repeat, the following are okay and don't need manipulation:
# ('Triplets of Belleville (2003)', '0286244')
# ('WarGames (1983)', '0086567')
# ('Pelle the Conqueror (1988)','0093713')

# These movies already existed in AAdict but were missing nominations/winning information
# due to them being under different names (ex:  "Goodfellas" versus "Good fellas")
# NOTE: the append/extend calls below add to the lists every time this cell runs, so run it only once
# per freshly built (or freshly loaded, not yet corrected) AAdict.

# ('Remains of the Day (1993)', '0107943')
AAdict['0107943']["Nominated Best Costume Design"] = u'Jenny Beavan, John Bright'
AAdict['0107943']['nominations'].append(u'Best Costume Design')

# ('Cyrano De Bergerac (1990)', '0099334')
AAdict['0099334']["Nominated Best Actor"] = u'Gerard Depardieu'
AAdict['0099334']["Nominated Best Art Direction"] = u'Ezio Frigerio (Art Direction); Jacques Rouxel (Set Decoration)'
AAdict['0099334']["Nominated Best Costume Design"] = u'Franca Squarciapino'
AAdict['0099334']["Won Best Costume Design"] = u'Franca Squarciapino'
AAdict['0099334']["Nominated Best Makeup"] = u'Michèle Burke, Jean-Pierre Eychenne'
# One entry per "Nominated ..." key set above. The list used to name Best Costume Design twice and
# leave out Best Actor.
AAdict['0099334']['nominations'].extend([u'Best Actor', u'Best Art Direction', u'Best Costume Design', u'Best Makeup'])
AAdict['0099334']['won'].extend([u'Best Costume Design'])

# ('Greystoke: the Legend of Tarzan, Lord of the Apes (1984)', '0087365')
AAdict['0087365']["Nominated Best Writing, Adapted Screenplay"] = u'P.H. Vazak, Michael Austin'
AAdict['0087365']['Nominated Best Makeup'] = u'Rick Baker, Paul Engelen'
AAdict['0087365']['nominations'].extend([u'Best Writing, Adapted Screenplay', u'Best Makeup'])

# ('Enemies: A Love Story (1989)','0097276')
# NOTE(review): this value is a tuple of two names, whereas every other "Nominated ..." value set in
# this cell is a single string -- confirm that downstream code handles a tuple here
AAdict['0097276']['Nominated Best Supporting Actress'] = u'Lena Olin', u'Anjelica Huston'
AAdict['0097276']['nominations'].extend([u'Best Supporting Actress', u'Best Supporting Actress'])

# ('Goodfellas (1990)','0099685')
AAdict['0099685']['Nominated Best Picture'] = u'Irwin Winkler (Producer)'
AAdict['0099685']['Nominated Best Supporting Actor'] = u'Joe Pesci'
AAdict['0099685']['Won Best Supporting Actor'] = u'Joe Pesci'
AAdict['0099685']['Nominated Best Supporting Actress'] = u'Lorraine Bracco'
AAdict['0099685']['Nominated Best Writing, Adapted Screenplay'] = u'Nicholas Pileggi, Martin Scorsese'
AAdict['0099685']['nominations'].extend([u'Best Picture', u'Best Supporting Actor', u'Best Supporting Actress', u'Best Writing, Adapted Screenplay'])
AAdict['0099685']['won'].extend([u'Best Supporting Actor'])

In [39]:
# To Save new AAdict, run this cell
#filename = 'AAdict.p'
#pickle.dump(AAdict, io.open(filename,'wb'))

Convert to AAdf

PICK BACK UP HERE

convert AAdict to AAdf pandas dataframe


In [56]:
# convert AAdict to pandas: one row per movie id, one column per key of the per-movie dicts
AAdf = pd.DataFrame.from_dict(AAdict).transpose()
AAdf['movieid'] = AAdf.index


# hand-code genres for one movie that was missing genre info
# .at writes the list into AAdf itself. The previous form, AAdf.loc[row, :].genres = [...], assigned to
# the row Series returned by .loc, which leaves AAdf unchanged whenever pandas hands back a copy.
AAdf.at['5152218', 'genres'] = ["Horror", "Romance"]

# Create new columns
# winner: 1 if the movie won at least one award, else 0
AAdf['winner'] = AAdf['won'].apply(lambda x: len(x)!=0) * 1
# convert years to ints
AAdf['year'] = AAdf['year'].apply(lambda x: int(x))

AAdf.head()
#AAdf[AAdf['Nominated Best Actor']==1].head()


Out[56]:
Nominated Best Actor Nominated Best Actress Nominated Best Animated Feature Film Nominated Best Art Direction Nominated Best Cinematography Nominated Best Costume Design Nominated Best Director Nominated Best Documentary, Feature Nominated Best Documentary, Short Subject Nominated Best Film Editing ... keywords mpaa nominations releasedate runtime title won year movieid winner
0035423 False False False False False False False False False False ... [time-travel, brooklyn-bridge, bridge, time-tr... PG-13 [Best Music, Song] 2001-12-25 118 Kate & Leopold [] 2001 0035423 0
0080388 Burt Lancaster Susan Sarandon False False False False Louis Malle False False False ... [drugs, gangster, camera-shot-of-feet, female-... NaN [Best Picture, Best Actor, Best Actress, Best ... 1981-04-03 104 Atlantic City [] 1981 0080388 0
0080855 False False False Tambi Larsen (Art Direction); Jim Berkey (Set ... False False False False False False ... [immigrant, sheriff, 1890s, johnson-county-war... NaN [Best Art Direction] 1980-11-18 149 Heaven's Gate [] 1981 0080855 0
0081974 Paul Newman False False False False False False False False False ... [murder, newspaper, mafia, reporter, slander, ... NaN [Best Actor, Best Supporting Actress, Best Wri... 1981-11-19 116 Absence of Malice [] 1981 0081974 0
0081988 False False False False False False False Suzanne Bauman, Paul Neshamkin, Jim Burroughs ... False False ... NaN NaN [Best Documentary, Feature] NaN 60 Against Wind and Tide: A Cuban Odyssey [] 1981 0081988 0

5 rows × 74 columns


In [60]:
# Foreign film nominees were blank and therefore coded as 0, which is hard to tell apart from the
# False used for "not nominated". Recode the column as a plain flag: True = nominated, False = not.
# Step 1: every value that does not compare equal to 0 (i.e. an actual nominee entry) becomes 0,
#         so all nominated rows now hold 0 while not-nominated rows still hold False.
#         Assigning through AAdf.loc[mask, column] writes into AAdf itself; the previous chained form
#         AAdf[column].loc[mask] = 0 wrote to an intermediate Series.
# Step 2: str(0) + "fix" is "0fix" but str(False) + "fix" is "Falsefix", which separates the two.
foreign_col = 'Nominated Best Foreign Language Film'
AAdf.loc[AAdf[foreign_col] != 0, foreign_col] = 0
AAdf[foreign_col] = AAdf[foreign_col].apply(lambda x: str(x) + "fix" == "0fix")

Import and Split Data

First we split our data into a validation set (movies in 2006) and a training set (movies 1981-2005). We will use k-fold cross validation to train our model. In this preliminary analysis, we train our model to predict Oscar winners given the movie was nominated.


In [62]:
# Functions

# Map a split release date to its quarter of the year
# monthint is the list produced by str(releasedate).split('-'): [year, month, day] for a real
# date, or a single-element list when the release date is missing
# Returns 0 for a missing date, otherwise 1-4
def get_quarter(monthint):
    if len(monthint) == 1:
        return 0
    month = int(monthint[1])
    if month <= 3:
        return 1
    if month <= 6:
        return 2
    if month <= 9:
        return 3
    return 4
# Pull the month number out of a split release date
# monthint is the list produced by str(releasedate).split('-'); a single-element list means the
# release date is missing
# Returns 0 for a missing date, otherwise the month as an int
def get_month(monthint):
    return 0 if len(monthint) == 1 else int(monthint[1])

# Turn the spreadsheet's country value into a list of country names
# A value equal to 0 yields [u'USA']; any other value is split on '/' and every space is removed
# from each piece (so "West Germany" becomes "WestGermany")
def get_countries(countrylist):
    if countrylist == 0:
        return [u'USA']
    return [piece.replace(" ", "") for piece in countrylist.split('/')]

In [63]:
%%time
# Dealing with Categorical variables & creating other descriptive variables

# countries that appear in this many movies or fewer are pooled into 'OtherCountry/Unknown'
RARE_COUNTRY_MAX_COUNT = 10
# keywords that appear in this many movies or fewer are dropped
RARE_KEYWORD_MAX_COUNT = 200

# mpaa is ordinal, convert to ordinal 'mpaaint' (R=3, PG-13=2, PG=1, missing=0)
# NOTE(review): any other rating value (e.g. 'G') is left as its original string -- confirm this is intended
# Assign through AAdf.loc[mask, column] so the write lands in AAdf itself; the previous chained form
# AAdf['mpaaint'].loc[mask] = value wrote to an intermediate Series.
AAdf['mpaaint'] = AAdf['mpaa']
for rating, code in (('R', 3), ('PG-13', 2), ('PG', 1)):
    AAdf.loc[AAdf['mpaaint'] == rating, 'mpaaint'] = code
AAdf.loc[pd.isnull(AAdf['mpaaint']), 'mpaaint'] = 0

# count number of nominations
AAdf.loc[:,'numnominations'] = AAdf['nominations'].apply(lambda x: len(x))

# convert release dates to quarters
AAdf.loc[:,'quarter'] = AAdf['releasedate'].apply( lambda x: get_quarter(str(x).split('-')) )

# convert release dates into months
AAdf.loc[:,'month'] = AAdf['releasedate'].apply( lambda x: get_month(str(x).split('-')) )

# convert the country string into a list of countries
AAdf.loc[:,'countrylist'] = AAdf['country'].apply( lambda x: get_countries(x) )

# get unique list of genres, as well as create count dictionary of all countries and keywords
countries_dict = {}
uniquegenres = set()
keywords_dict = {}
for _, movie in AAdf.iterrows():
    if isinstance(movie['countrylist'], list):
        for country in movie['countrylist']:
            countries_dict[country] = countries_dict.get(country, 0) + 1
    # genres is NaN when IMDb had none; skip those rows instead of failing on a non-iterable
    if isinstance(movie['genres'], list):
        uniquegenres.update(movie['genres'])
    if isinstance(movie['keywords'], list):
        for keyword in movie['keywords']:
            keywords_dict[keyword] = keywords_dict.get(keyword, 0) + 1

# shorten countries and keywords dictionary to only most common entries to reduce dimensionality
# (iterate over a copy of the keys: entries are removed from the dicts inside the loops)
numother = 0
for country in list(countries_dict.keys()):
    if countries_dict[country] <= RARE_COUNTRY_MAX_COUNT:
        numother = numother + countries_dict.pop(country)
countries_dict['OtherCountry/Unknown'] = numother
for keyword in list(keywords_dict.keys()):
    if keywords_dict[keyword] <= RARE_KEYWORD_MAX_COUNT:
        del keywords_dict[keyword]

# create dummy variables for countries, genres, and keywords
for country in countries_dict:
    AAdf.loc[:,country] = 0
for genre in uniquegenres:
    AAdf.loc[:,genre] = 0
for keyword in keywords_dict:
    AAdf.loc[:,keyword] = 0

AAdf['OtherCountry/Unknown'] = 0
common_countries = set(countries_dict.keys())
for index,movie in AAdf.iterrows():
    if isinstance(movie['countrylist'], list):
        for country in set(movie['countrylist']):
            if country in common_countries:
                AAdf.loc[index,country] = 1
            else:
                # any country that was pooled above counts towards the catch-all column
                AAdf.loc[index,'OtherCountry/Unknown'] = 1
    if isinstance(movie['genres'], list):
        for genre in uniquegenres.intersection(movie['genres']):
            AAdf.loc[index,genre] = 1
    if isinstance(movie['keywords'], list):
        # build the movie's keyword set once rather than once per candidate keyword
        movie_keywords = set(movie['keywords'])
        for keyword in keywords_dict:
            if keyword in movie_keywords:
                AAdf.loc[index,keyword] = 1


CPU times: user 3min 11s, sys: 1.31 s, total: 3min 12s
Wall time: 3min 14s

In [64]:
# Turn the dataframe back into a dict of dicts (movie id -> {column: value}) so it pickles easily
# Note:  we were having trouble loading pickled pandas objects on non-Apple computers and found dicts didn't have this problem
AAdictfinal = AAdf.T.to_dict()
This is the final version that we need! We will rerun this in oscar_process_notebook.

In [65]:
# To save the new AAdictfinal, run the following (AAdictfinal is the dict used in oscar_process_notebook)
#filename = 'AAdictfinal.p'
#pickle.dump(AAdictfinal, io.open(filename,'wb'))

# To reload AAdictfinal, run the following:
#AAdictfinal = pickle.load(open('AAdictfinal.p','rb'))

Now we are ready to do some analysis. Click here to go to oscar_process_notebook: https://github.com/oscarpredictor/oscar-predictor/blob/master/oscar_process_notebook.ipynb