In [1]:
from imdb import IMDb
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import cPickle as pickle
ia = IMDb(accessSystem='http')
from collections import defaultdict
import io
from datetime import datetime
import time
import math
Just run the following cell to load the saved AAdict instead of rebuilding it.
In [59]:
# Run this cell instead of rerunning the portion of this notebook that builds AAdict.
# After loading AAdict you can skip the cells that populate it.
# NOTE(review): the original comment pointed to a "PICK BACK UP HERE" note, but no such
# note exists in this notebook; presumably the place to resume is the cell that starts
# with "# convert AAdict to pandas" -- confirm.
# WARNING: unpickling can execute arbitrary code; only load an AAdict.p you created yourself.
with open('AAdict.p', 'rb') as aadict_file:
    AAdict = pickle.load(aadict_file)
In [3]:
# Load the Academy Awards spreadsheet into a DataFrame (AAcsv)
AAcsv = pd.read_excel("Academy_Awards_2006.xls")

# Treat "Year" as text so that its first four characters can be sliced off
AAcsv['Year'] = AAcsv['Year'].values.astype(str)
# "yr": the first four characters of "Year"
AAcsv['yr'] = AAcsv['Year'].str[:4]
# "titleyr": Sort Title followed by the four-character year in parentheses;
# this is one of the search strings later handed to ia.search_movie
AAcsv['titleyr'] = ['%s (%s)' % (sort_title, year_text[:4])
                    for sort_title, year_text in zip(AAcsv['Sort Title'], AAcsv['Year'])]

# Recode 'Winner?' from "X" / anything else to 1 / 0
AAcsv['Winner?'] = (AAcsv['Winner?'] == 'X').astype(int)

# Keep only rows that name a specific film and whose yr is "1981" or later
# (yr is text, so this is a string comparison)
has_film_title = AAcsv['Sort Title'] != "[no specific film title]"
from_1981_on = AAcsv['yr'] >= "1981"
AAcsv = AAcsv[has_film_title & from_1981_on].copy()

# "awards": every distinct award category left in AAcsv.
# make_moviedict loops over this list to record which awards each movie was nominated for / won
awards = list(set(AAcsv['Category']))

# View head of AAcsv
AAcsv.head()
Out[3]:
In [10]:
# Because this calls IMDbPY once per cast member and once more per film in each
# cast member's filmography, it takes very long to run for our entire dataset.
# This can be used in future work for this analysis.
# Function: starpower
# Purpose: score how well known a movie's cast was as of a given year
# Parameters:
# movieobj: a single IMDBmovie object (its data['cast'] list is used)
# year: the reference year, as an int or a numeric string (AAdict stores years as strings)
# Returns: 0 if the movie has no cast information, otherwise a tuple
#          (cast_rating, totalpower) where cast_rating maps each cast member to
#          his/her score and totalpower is the sum of those scores
def starpower(movieobj, year):
    try:
        movie_cast = movieobj.data['cast']
    except Exception:
        # No cast information (or no movie object at all): nothing to score.
        return 0
    # The year may arrive as text; the arithmetic below needs an int.
    year = int(year)
    cast_rating = {}
    totalpower = 0
    for actorperson in movie_cast:
        print(actorperson)
        ia.update(actorperson, 'filmography')
        if 'actor' in actorperson.keys():
            filmography = actorperson['actor']  # The filmography of a given actor
        elif 'actress' in actorperson.keys():
            filmography = actorperson['actress']
        else:
            filmography = []
        count = 0         # number of rated movies they have been in up to `year`
        rating_total = 0  # running sum of the IMDb ratings of those movies
        total_votes = 0   # age-discounted IMDb votes, a proxy for movie popularity
        for film in filmography:
            ia.update(film, 'vote details')
            if 'rating' not in film.keys():
                continue
            film_year = film.data.get('year')
            if film_year is None or film_year > year:
                # Unknown release year, or released after the reference year.
                continue
            count += 1
            rating_total += film.data['rating']
            # 1.0 forces true division: with Python 2 integer division the weight
            # was 0 for every film released before `year`.
            total_votes += film.data.get('votes', 0) * (1.0 / (year - film_year + 1))
        final_power = rating_calculator(count, rating_total, total_votes)
        cast_rating[actorperson] = final_power
        totalpower += final_power
    print("%s %s" % (cast_rating, totalpower))
    return (cast_rating, totalpower)
# Function: rating_calculator
# Purpose: combine a person's film count, ratings and votes into a single score
# Parameters:
# count: number of films counted
# avg_rating: SUM of the ratings of those films (it is divided by count here)
# total_votes: (weighted) number of IMDb votes for those films
# Returns: 0 if any input is zero (or count is negative), otherwise
#          log(count) + 0.3 * (avg_rating / count) + total_votes / 1,000,000
def rating_calculator(count, avg_rating, total_votes):
    if count <= 0 or avg_rating == 0 or total_votes == 0:
        return 0
    # float() keeps the mean rating exact under Python 2 integer division
    mean_rating = float(avg_rating) / count
    return math.log(count) + mean_rating * 0.3 + total_votes * 0.000001
In [10]:
# Because this calls IMDbPY once per director and once more per film in each
# director's filmography, it takes very long to run for our entire dataset.
# This can be used in future work for this analysis.
# Function: directorpower
# Purpose: score how well known a movie's director(s) were as of a given year
# Parameters:
# movie_director: a list of IMDbPY person objects (the movie's directors)
# year: the reference year, as an int or a numeric string
# Returns: a tuple (cast_rating, totalpower) where cast_rating maps each director
#          to his/her score and totalpower is the sum of those scores
def directorpower(movie_director, year):
    # The original body iterated over an undefined name (movie_cast) and never
    # used movie_director, so every call raised NameError.
    year = int(year)
    cast_rating = {}
    totalpower = 0
    for directorperson in movie_director:
        print(directorperson)
        ia.update(directorperson, 'filmography')
        # NOTE(review): assumes IMDbPY lists directing credits under the
        # 'director' filmography key -- confirm against the IMDbPY version in use.
        if 'director' in directorperson.keys():
            filmography = directorperson['director']
        else:
            filmography = []
        count = 0         # number of rated movies they directed up to `year`
        rating_total = 0  # running sum of the IMDb ratings of those movies
        total_votes = 0   # age-discounted IMDb votes, a proxy for movie popularity
        for film in filmography:
            ia.update(film, 'vote details')
            if 'rating' not in film.keys():
                continue
            film_year = film.data.get('year')
            if film_year is None or film_year > year:
                continue
            count += 1
            rating_total += film.data['rating']
            # 1.0 forces true division (Python 2 integer division gave weight 0
            # to every film released before `year`).
            total_votes += film.data.get('votes', 0) * (1.0 / (year - film_year + 1))
        final_power = rating_calculator(count, rating_total, total_votes)
        cast_rating[directorperson] = final_power
        totalpower += final_power
    print("%s %s" % (cast_rating, totalpower))
    return (cast_rating, totalpower)
# Function: rating_calculator
# Purpose: combine a person's film count, ratings and votes into a single score
# NOTE: this notebook also defines rating_calculator in the starpower cell; the
# definition is repeated here so that this cell can be run on its own.
# Parameters:
# count: number of films counted
# avg_rating: SUM of the ratings of those films (it is divided by count here)
# total_votes: (weighted) number of IMDb votes for those films
# Returns: 0 if any input is zero (or count is negative), otherwise
#          log(count) + 0.3 * (avg_rating / count) + total_votes / 1,000,000
def rating_calculator(count, avg_rating, total_votes):
    if count <= 0 or avg_rating == 0 or total_votes == 0:
        return 0
    # float() keeps the mean rating exact under Python 2 integer division
    mean_rating = float(avg_rating) / count
    return math.log(count) + mean_rating * 0.3 + total_votes * 0.000001
In [4]:
### Movie Attribute functions ##
# Function: get_releasedate
# Purpose: get the USA release date given a movieobj
# Parameters:
# movieobj: a single IMDBmovie object
# Returns: the date of the first 'USA::<day> <month name> <year>' entry in the
#          movie's release dates as a datetime.date, or np.nan if there is no
#          such entry or it cannot be parsed
def get_releasedate(movieobj):
    try:
        ia.update(movieobj, 'release dates')
    except Exception:
        # Best effort: if the refresh fails, still use whatever is already loaded.
        pass
    try:
        for entry in movieobj.data['release dates']:
            if 'USA::' not in entry:
                continue
            # e.g. 'USA::15 December 1999 (premiere)' -> '15', 'December', '1999'
            day, month, year = entry.split('USA::', 1)[1].split(' ')[:3]
            year = year.split(':')[0]  # drop a trailing '::(note)' if one is attached
            return datetime.strptime(year + "-" + month + "-" + day, "%Y-%B-%d").date()
    except Exception:
        # Missing data or an unexpected date layout: report "unknown".
        pass
    return np.nan
# Function: get_mpaa
# Purpose: get the mpaa rating given a movieobj
# Parameters:
# movieobj: a single IMDBmovie object
# Returns: the word that follows "Rated " in the movie's 'mpaa' text
#          (e.g. "PG-13"), or np.nan if it is unavailable
def get_mpaa(movieobj):
    try:
        return str(movieobj.data['mpaa']).split("Rated ", 1)[1].split(" ")[0]
    except Exception:
        # No 'mpaa' entry, or its text does not contain "Rated ".
        return np.nan
# Function: get_genres
# Purpose: get the list of genres given in a movieobj
# Parameters:
# movieobj: a single IMDBmovie object
# Returns: the movie's 'genres' value, or np.nan if it is unavailable
def get_genres(movieobj):
    try:
        return movieobj.data['genres']
    except Exception:
        # No genre information (or no movie object).
        return np.nan
# Function: get_runtime
# Purpose: get the runtime given in a movieobj
# Parameters:
# movieobj: a single IMDBmovie object
# Returns: the first listed runtime as an int, or 0 if it is unavailable or not
#          numeric. The entry may be a bare number ("120") or carry a
#          colon-separated prefix/suffix ("USA:95"); the whole entry and then
#          its first three colon-separated pieces are tried in turn.
def get_runtime(movieobj):
    try:
        raw_runtime = movieobj.data['runtimes'][0]
    except Exception:
        # No runtime information (or no movie object).
        return 0
    candidates = [raw_runtime]
    if hasattr(raw_runtime, 'split'):
        candidates.extend(raw_runtime.split(':')[:3])
    for candidate in candidates:
        try:
            return int(candidate)
        except (TypeError, ValueError):
            continue
    return 0
# Function: get_starpower
# Purpose: calculate rating of how well-known movie cast is
# Parameters:
# movieobj: a single IMDBmovie object
# year: reference year (int or numeric string)
# Returns: the cast's total score as a number, or 0 if it cannot be computed
# NOTE: previously this always returned 0, because the local variable named
# `starpower` hid the starpower() function and the cast list (not the movie) was
# passed to it. Now that the call works it is SLOW: starpower() queries IMDb for
# every cast member and every film in their filmographies.
#%run 'Starpower.ipynb'
def get_starpower(movieobj, year):
    try:
        result = starpower(movieobj, int(year))
    except Exception:
        # Best effort: any lookup failure counts as "no star power".
        return 0
    # starpower() returns (per-person scores, total) on success and 0 otherwise.
    if isinstance(result, tuple):
        return result[1]
    return result
# Function: get_director
# Purpose: get the list of directors given in a movieobj
# Parameters:
# movieobj: a single IMDBmovie object
# Returns: a list with the personID of every director, or [] if unavailable
def get_director(movieobj):
    try:
        return [person.personID for person in movieobj.data['director']]
    except Exception:
        # No director information (or an entry without a personID).
        return []
# Function: get_keywords
# Purpose: get the list of keywords given in a movieobj
# Parameters:
# movieobj: a single IMDBmovie object
# Returns: the movie's 'keywords' value, or np.nan if it is unavailable
def get_keywords(movieobj):
    try:
        ia.update(movieobj, 'keywords')
    except Exception:
        # Best effort: if the refresh fails, still use whatever is already loaded.
        pass
    try:
        return movieobj.data['keywords']
    except Exception:
        return np.nan
In [5]:
# Function: find_movie
# Purpose: to sort through possible IMDB movie objects and find just one
# Parameters:
#title: title of movie
#year: year of movie (int or numeric string)
#mlist: list of possible IMDB movie objects
# Returns movieobj (freshly fetched with ia.get_movie), or None if no candidate
#         was released within two years of `year`
def find_movie(title, year, mlist):
    try:
        target_year = int(year)
    except (TypeError, ValueError):
        # Without a usable year no candidate can qualify.
        return None
    # Accept candidates released in the target year or up to two years either side.
    nearby_years = tuple(range(target_year - 2, target_year + 3))
    year_list = []
    for movie in mlist:
        try:
            movie_year = movie.data['year']
        except (AttributeError, KeyError, TypeError):
            continue  # candidate without a year cannot be matched
        if movie_year in nearby_years:
            year_list.append(movie)

    # if the years do not match, there is no match
    if not year_list:
        return None
    if len(year_list) == 1:
        return ia.get_movie(year_list[0].movieID)

    # Several candidates: score each by comparing its letters, sorted and with
    # spaces removed, position by position against the wanted title. Only
    # candidates whose letter count equals the title's are scored; the rest
    # keep a score of 0.
    sorted_title = "".join(sorted(title)).replace(" ", "")
    counts = []
    for candidate in year_list:
        score = 0
        sorted_mtitle = "".join(sorted(candidate['title'])).replace(" ", "")
        if len(sorted_mtitle) == len(sorted_title):
            try:
                # Under Python 2 a title that cannot be converted to a plain
                # string (non-ASCII) is not treated as a match.
                sorted_mtitle = str(sorted_mtitle)
            except UnicodeError:
                sorted_mtitle = None
            if sorted_mtitle is not None:
                score = sum(1 for wanted, found in zip(sorted_title, sorted_mtitle)
                            if wanted == found)
        counts.append(score)
    # Highest score wins; ties go to the earliest candidate.
    best = counts.index(max(counts))
    return ia.get_movie(year_list[best].movieID)
In [6]:
# Function: find_movieobj
# Purpose: To convert a tuple of movie's information into one IMDB movie object
# Parameters:
# movie_tuple: a tuple in the format (English Title, Sort Title, Sort Title + yr, yr)
# Returns: IMDB movie object, or None if every lookup strategy fails
def find_movieobj(movie_tuple):
    def _as_text(value):
        # Titles that are purely numeric come through as ints.
        return str(value) if isinstance(value, int) else value

    english_title = _as_text(movie_tuple[0])
    sort_title = _as_text(movie_tuple[1])
    title_with_year = movie_tuple[2]
    year = movie_tuple[3]

    # Strategy 1: search IMDb for the English title, match on the English title.
    movieobj = find_movie(english_title, year, ia.search_movie(english_title))
    if movieobj is not None:
        return movieobj

    # Strategy 2: search for the Sort Title, match on the English title.
    # The search result is kept because strategy 4 reuses it.
    sort_title_results = ia.search_movie(sort_title)
    movieobj = find_movie(english_title, year, sort_title_results)
    if movieobj is not None:
        return movieobj

    # Strategy 3: search for "title (yr)", match on the Sort Title.
    # (The result of this lookup used to be thrown away.)
    movieobj = find_movie(sort_title, year, ia.search_movie(title_with_year))
    if movieobj is not None:
        return movieobj

    # Strategy 4: search for the Sort Title, match on the Sort Title.
    return find_movie(sort_title, year, sort_title_results)
In [7]:
# Function: make_moviedict
# Purpose: To convert movieobj and movie_tuple into a dictionary within AAdict
# Parameters:
# movieobj: a single IMDBmovie object
# movie_tuple: a tuple in the format (English Title, Sort Title, Sort Title + yr, yr)
# rewrite: if True, will rewrite an existing key in AAdict, if exists
# Returns False if there is no movieobj or the movie is already in AAdict (and
#         rewrite is False); True once the movie's dictionary is stored in AAdict
def make_moviedict(movieobj, movie_tuple, rewrite=False):
    if movieobj is None:
        return False
    ## Get movie id ##
    movid = movieobj.movieID
    # Leave an existing entry alone unless rewrite was requested
    if not rewrite and movid in AAdict:
        return False

    title = movie_tuple[0]
    # All Academy Award rows for this movie, and the winning subset.
    # NOTE(review): rows are matched on English Title only, so two different
    # films that share a title would share nominations -- confirm whether the
    # year should also be matched.
    movie_rows = AAcsv[AAcsv['English Title'] == title]
    won_rows = movie_rows[movie_rows['Winner?'] == 1]

    ## Build the movie's dictionary; it is stored under the movie id at the end ##
    entry = {}
    # "title": title of movie
    entry['title'] = title
    # "nominations": list of Oscar nominations
    entry['nominations'] = list(movie_rows['Category'])
    # "won": list of Oscars won
    entry['won'] = list(won_rows['Category'])
    # "year": first four characters of the award year (text)
    entry['year'] = list(movie_rows['yr'])[0]
    # "country": country of movie
    entry['country'] = list(movie_rows['Country'])[0]
    # "releasedate": USA movie release date
    entry['releasedate'] = get_releasedate(movieobj)
    # "mpaa": mpaa rating for the movie (i.e. R, PG-13, PG, G)
    entry['mpaa'] = get_mpaa(movieobj)
    # "genres": list of genres
    entry['genres'] = get_genres(movieobj)
    # "runtime": first listed runtime
    entry['runtime'] = get_runtime(movieobj)
    # "starpower": score of how well known the cast is
    entry['starpower'] = get_starpower(movieobj, entry['year'])
    # "director": list of director ids
    entry['director'] = get_director(movieobj)
    # "keywords": list of keywords
    entry['keywords'] = get_keywords(movieobj)

    # One "Nominated <award>" and one "Won <award>" key per award: False when the
    # movie was not nominated / did not win, otherwise the first matching nominee.
    for award in awards:
        nominated = movie_rows[movie_rows['Category'] == award]
        if len(nominated) > 0:
            entry["Nominated %s" % award] = list(nominated['Nominee(s)'])[0]
        else:
            entry["Nominated %s" % award] = False
        # Take the nominee from a WINNING row; the first nominated row is not
        # necessarily the winner when a movie has several nominations for one award.
        winners = won_rows[won_rows['Category'] == award]
        if len(winners) > 0:
            entry["Won %s" % award] = list(winners['Nominee(s)'])[0]
        else:
            entry["Won %s" % award] = False

    # Store only the finished entry so a failure above leaves no partial record.
    AAdict[movid] = entry
    return True
Note: You can skip the next several cells and just run the pickle.load cell (near the top of this notebook) to get the complete movie dictionary (AAdict). I ended up building it in two pieces because it took a long time to run on the full file and would sometimes time out.
In [8]:
# Set up for building the Academy Awards dictionary ("AAdict"), a dict of dicts:
#   - AAdict is keyed by IMDB movie id
#   - each movie id maps to a dict holding information about the movie and its
#     Academy Award information

# "AAuniquemovies": one tuple per distinct movie in the Academy Awards DF ("AAcsv"),
# in the form (English Title, Sort Title, titleyr, yr).
# NOTE: the order of this list comes from a set; a later cell indexes into it by position.
AAuniquemovies = list(set(zip(*[AAcsv[column] for column in
                                ('English Title', 'Sort Title', 'titleyr', 'yr')])))

# Start from an empty AAdict (this replaces any AAdict loaded earlier)
AAdict = {}

# Movies that could not be added to AAdict (e.g. no IMDB movie object was found)
AAmissingmovies = []
In [9]:
# Loop through all unique movies and save each one in AAdict.
# (The loop previously referred to the misspelled name "Aauniquemovies", which raised NameError.)
for movie_tuple in AAuniquemovies:
    ## STEP 1: Get movieobj of movie using find_movieobj
    movieobj = find_movieobj(movie_tuple)
    ## STEP 2: Append to AAdict using make_moviedict
    added = make_moviedict(movieobj, movie_tuple)
    # make_moviedict returns False when no movie was found or it was already in AAdict
    if added is False:
        AAmissingmovies.append(movie_tuple)
In [11]:
# CHECK
#print "number of movies in AAdict:", len(AAdict.keys())
#print "number of unique movies in AAcsv:", len(AAuniquemovies)
#print "number of movies missing from AAdict:", len(AAmissingmovies)
#print "movies missing from AAdict:"
#for missingmovie in AAmissingmovies:
# print (missingmovie[2], AAuniquemovies.index(missingmovie))
#x = (u'Adam', u'Adam', u'Adam (1992)', '1992')
In [12]:
# Add the movies that the automatic lookup left out of AAdict altogether.
# Each pair is (IMDB movie id, position of the movie's tuple in AAuniquemovies).
# NOTE: the positions are only valid for the AAuniquemovies ordering they were
# read from; AAuniquemovies is built from a set.
missingids = [('0102997', 48),('0083293', 481),('0092999', 580),('0091021', 675),('0130860', 1301), ('0101270',303), ('0101272', 1172)]
#missingids = list(Strings (1991), Violet (1981),
#Eyes on the Prize: America's Civil Rights Years/Bridge to Freedom 1965 (1987),
#Exit (1986), Mermaid (1997)), Adam(1992) - misclassified, Addams Family (1991)
for imdb_id, tuple_position in missingids:
    ## STEP 1: Fetch the movieobj directly by its IMDB id
    movieobj = ia.get_movie(imdb_id)
    ## STEP 2: Store it in AAdict with make_moviedict, overwriting any existing entry
    added = make_moviedict(movieobj, AAuniquemovies[tuple_position], rewrite=True)
In [41]:
# Hand checked every repeat, the following are okay and don't need manipulation:
# ('Triplets of Belleville (2003)', '0286244')
# ('WarGames (1983)', '0086567')
# ('Pelle the Conqueror (1988)','0093713')

# These movies already existed in AAdict but were missing nomination/winning information
# due to them being under different names (ex: "Goodfellas" versus "Good fellas")

# Helper: append each award in `wanted` to `existing` only while `existing` holds
# fewer copies of it than `wanted` does. Running this cell a second time therefore
# does not add the same nominations again.
def _add_missing_awards(existing, wanted):
    for award in wanted:
        if existing.count(award) < wanted.count(award):
            existing.append(award)

# ('Remains of the Day (1993)', '0107943')
AAdict['0107943']["Nominated Best Costume Design"] = u'Jenny Beavan, John Bright'
_add_missing_awards(AAdict['0107943']['nominations'], [u'Best Costume Design'])

# ('Cyrano De Bergerac (1990)', '0099334')
AAdict['0099334']["Nominated Best Actor"] = u'Gerard Depardieu'
AAdict['0099334']["Nominated Best Art Direction"] = u'Ezio Frigerio (Art Direction); Jacques Rouxel (Set Decoration)'
AAdict['0099334']["Nominated Best Costume Design"] = u'Franca Squarciapino'
AAdict['0099334']["Won Best Costume Design"] = u'Franca Squarciapino'
AAdict['0099334']["Nominated Best Makeup"] = u'Michèle Burke, Jean-Pierre Eychenne'
# One entry per "Nominated ..." key set above (the list used to repeat
# 'Best Costume Design' and leave out 'Best Actor').
_add_missing_awards(AAdict['0099334']['nominations'],
                    [u'Best Actor', u'Best Art Direction', u'Best Costume Design', u'Best Makeup'])
_add_missing_awards(AAdict['0099334']['won'], [u'Best Costume Design'])

# ('Greystoke: the Legend of Tarzan, Lord of the Apes (1984)', '0087365')
AAdict['0087365']["Nominated Best Writing, Adapted Screenplay"] = u'P.H. Vazak, Michael Austin'
AAdict['0087365']['Nominated Best Makeup'] = u'Rick Baker, Paul Engelen'
_add_missing_awards(AAdict['0087365']['nominations'],
                    [u'Best Writing, Adapted Screenplay', u'Best Makeup'])

# ('Enemies: A Love Story (1989)','0097276')
# Two nominations in the same category: the value is a tuple of both nominees
AAdict['0097276']['Nominated Best Supporting Actress'] = (u'Lena Olin', u'Anjelica Huston')
_add_missing_awards(AAdict['0097276']['nominations'],
                    [u'Best Supporting Actress', u'Best Supporting Actress'])

# ('Goodfellas (1990)','0099685')
AAdict['0099685']['Nominated Best Picture'] = u'Irwin Winkler (Producer)'
AAdict['0099685']['Nominated Best Supporting Actor'] = u'Joe Pesci'
AAdict['0099685']['Won Best Supporting Actor'] = u'Joe Pesci'
AAdict['0099685']['Nominated Best Supporting Actress'] = u'Lorraine Bracco'
AAdict['0099685']['Nominated Best Writing, Adapted Screenplay'] = u'Nicholas Pileggi, Martin Scorsese'
_add_missing_awards(AAdict['0099685']['nominations'],
                    [u'Best Picture', u'Best Supporting Actor', u'Best Supporting Actress',
                     u'Best Writing, Adapted Screenplay'])
_add_missing_awards(AAdict['0099685']['won'], [u'Best Supporting Actor'])
In [39]:
# To Save new AAdict, run this cell
#filename = 'AAdict.p'
#pickle.dump(AAdict, io.open(filename,'wb'))
In [56]:
# convert AAdict to pandas: one row per movie id, one column per AAdict key
AAdf = pd.DataFrame.from_dict(AAdict).transpose()
AAdf['movieid'] = AAdf.index

# hand-code genres for one movie that was missing genre info.
# The value is written straight into AAdf with .at; the previous form
# (AAdf.loc[id, :].genres = ...) assigned to a temporary row object, which only
# reaches AAdf when pandas happens to hand back a view.
# The membership test keeps .at from silently creating a new row if the id is absent.
if '5152218' in AAdf.index:
    AAdf.at['5152218', 'genres'] = ["Horror", "Romance"]

# Create new columns
# winner: 1 if the movie won at least one award, else 0
AAdf['winner'] = AAdf['won'].apply(lambda x: len(x) != 0) * 1
# convert years to ints
AAdf['year'] = AAdf['year'].apply(lambda x: int(x))
AAdf.head()
#AAdf[AAdf['Nominated Best Actor']==1].head()
Out[56]:
In [60]:
# Foreign-language film nominees were blank in the source data, so the stored
# "Nominated Best Foreign Language Film" value cannot be used as a nominee name.
# Turn the column into a plain nominated / not-nominated boolean instead:
#   - any value that is not equal to 0 (a name, NaN, ...)  -> True
#   - the integer 0 (a blank nominee coded as 0)           -> True
#   - the boolean False stored for "not nominated"         -> False
# str(x) == "0" is what tells the integer 0 apart from False (False == 0 in Python).
# This is a single assignment giving the same result as the former two-step
# version, which relied on chained indexing (AAdf[col].loc[mask] = 0) writing
# through to AAdf.
AAdf['Nominated Best Foreign Language Film'] = AAdf['Nominated Best Foreign Language Film'].apply(
    lambda x: bool(x != 0) or str(x) == "0")
In [62]:
# Functions
# Map a release date, already split on '-' into [year, month, day] strings, to its
# quarter of the year (1-4). A one-element list (a value that had no '-' in it,
# such as a missing date) maps to 0.
def get_quarter(monthint):
    if len(monthint) == 1:
        return 0
    month = int(monthint[1])
    # (quarter, last month of that quarter) for the first three quarters
    for quarter, last_month in ((1, 3), (2, 6), (3, 9)):
        if month <= last_month:
            return quarter
    return 4
# Map a release date, already split on '-' into [year, month, day] strings, to its
# month number. A one-element list (a value that had no '-' in it) maps to 0.
def get_month(monthint):
    return 0 if len(monthint) == 1 else int(monthint[1])
# Turn a '/'-separated string of countries into a list of country names with all
# spaces removed. A value equal to 0 is treated as [u'USA'].
def get_countries(countrylist):
    if countrylist == 0:
        return [u'USA']
    return [name.replace(" ", "") for name in countrylist.split('/')]
In [63]:
%%time
# Dealing with categorical variables & creating other descriptive variables

# mpaa is ordinal, convert to ordinal 'mpaaint': missing -> 0, PG -> 1, PG-13 -> 2, R -> 3.
# Done in one pass instead of chained-indexing assignments (AAdf[col].loc[mask] = n),
# which only take effect when pandas returns a view.
# NOTE(review): any other rating string (e.g. 'G') has no ordinal here and is
# carried over unchanged, exactly as before -- confirm whether that is intended.
mpaa_ordinal = {'R': 3, 'PG-13': 2, 'PG': 1}
AAdf['mpaaint'] = AAdf['mpaa'].apply(
    lambda rating: mpaa_ordinal.get(rating, 0 if pd.isnull(rating) else rating))

# count number of nominations
AAdf.loc[:, 'numnominations'] = AAdf['nominations'].apply(len)
# convert release dates to quarters
AAdf.loc[:, 'quarter'] = AAdf['releasedate'].apply(lambda x: get_quarter(str(x).split('-')))
# convert release dates into months
AAdf.loc[:, 'month'] = AAdf['releasedate'].apply(lambda x: get_month(str(x).split('-')))
# convert the country string into a list of countries
AAdf.loc[:, 'countrylist'] = AAdf['country'].apply(get_countries)

# get unique list of genres, as well as create count dictionary of all countries and keywords
countries_dict = {}
uniquegenres = set()
keywords_dict = {}
for movie_countries in AAdf['countrylist']:
    if isinstance(movie_countries, list):
        for country in movie_countries:
            countries_dict[country] = countries_dict.get(country, 0) + 1
for movie_genres in AAdf['genres']:
    # movies without genre information hold NaN here, which cannot be iterated
    if isinstance(movie_genres, list):
        uniquegenres.update(movie_genres)
for movie_keywords in AAdf['keywords']:
    if isinstance(movie_keywords, list):
        for keyword in movie_keywords:
            keywords_dict[keyword] = keywords_dict.get(keyword, 0) + 1

# shorten countries and keywords dictionary to only most common ones to reduce dimensionality
rare_country_max = 10    # countries seen this many times or fewer are pooled into "Other"
rare_keyword_max = 200   # keywords seen this many times or fewer are dropped
numother = 0
# iterate over a snapshot of the keys: entries are deleted inside the loop
for country in list(countries_dict):
    if countries_dict[country] <= rare_country_max:
        numother = numother + countries_dict[country]
        del countries_dict[country]
countries_dict['OtherCountry/Unknown'] = numother
for keyword in list(keywords_dict):
    if keywords_dict[keyword] <= rare_keyword_max:
        del keywords_dict[keyword]

# create dummy variables for countries, genres, and keywords: start every column at 0 ...
for dummy in list(countries_dict) + list(uniquegenres) + list(keywords_dict):
    AAdf.loc[:, dummy] = 0

# ... then set 1 on every row whose list in `list_column` satisfies `is_hit`
def _flag_rows(list_column, dummy, is_hit):
    hits = AAdf[list_column].apply(
        lambda values: isinstance(values, list) and bool(is_hit(values))).astype(bool)
    AAdf.loc[hits, dummy] = 1

kept_countries = set(countries_dict)
for country in countries_dict:
    _flag_rows('countrylist', country, lambda values: country in values)
# a movie with at least one country that was not kept is flagged as Other/Unknown
_flag_rows('countrylist', 'OtherCountry/Unknown',
           lambda values: any(name not in kept_countries for name in values))
for genre in uniquegenres:
    _flag_rows('genres', genre, lambda values: genre in values)
for keyword in keywords_dict:
    _flag_rows('keywords', keyword, lambda values: keyword in values)
In [64]:
# Turn the dataframe back into a dict of dicts (one inner dict per row of AAdf) for easier pickling.
# Note: we were having trouble pickle-loading pandas objects on non-Apple computers
# and found that plain dicts didn't have this problem.
AAdictfinal = AAdf.T.to_dict()
In [65]:
# To save a new AAdictfinal, run the following (AAdictfinal is the dict used in oscar_process_notebook)
#filename = 'AAdictfinal.p'
#pickle.dump(AAdictfinal, io.open(filename,'wb'))
# To reload AAdictfinal, run the following:
#AAdictfinal = pickle.load(open('AAdictfinal.p','rb'))
Now we are ready to do some analysis. Click here to go to oscar_process_notebook: https://github.com/oscarpredictor/oscar-predictor/blob/master/oscar_process_notebook.ipynb