In [188]:
import re
import numpy as np
import pandas as pd
import scipy.stats as stats
# Render every float in displayed frames/series with exactly three decimals.
pd.set_option('display.float_format', lambda value: '{0:.3f}'.format(value))
In [153]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
# Apply seaborn's default styling (with matplotlib color shorthands remapped),
# then switch the active color cycle to the "muted" palette.
sb.set(color_codes=True)
sb.set_palette(palette="muted")
# Fixed seed for NumPy's global RNG: the sum of the code points of "regression".
np.random.seed(sum(ord(ch) for ch in "regression"))
In [182]:
# Load the two input tables; both paths are relative to the notebook's directory.
awards = pd.read_csv(filepath_or_buffer='../data/nominations.csv')
oscars = pd.read_csv(filepath_or_buffer='../data/analysis.csv')
In [175]:
# Bar chart of how many rows of `oscars` fall in each `release_month` value.
sb.countplot(data=oscars, x="release_month")
Out[175]:
This can be more or less confirmed by calculating the Pearson correlation coefficient, which measures the linear dependence between two variables:
In [212]:
def print_pearsonr(data, dependent, independent):
    """Print the Pearson correlation of one column against several others.

    For every column name in ``independent``, computes
    ``scipy.stats.pearsonr(data[dependent], data[field])`` and prints one line
    of the form ``"<field> | coeff: <r> | p-value: <p>"``.

    Parameters
    ----------
    data : mapping of column name -> 1-D numeric sequence (e.g. a DataFrame)
        Table holding both the dependent and the independent columns.
    dependent : str
        Name of the column every other column is correlated against.
    independent : iterable of str
        Names of the columns to correlate with ``dependent``.

    Returns
    -------
    None
        Results are only printed.
    """
    for field in independent:
        coeff = stats.pearsonr(data[dependent], data[field])
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("{0} | coeff: {1} | p-value: {2}".format(field, coeff[0], coeff[1]))
# Correlate the 'Oscar' column with each of the four quarterly release columns.
print_pearsonr(
    oscars,
    dependent='Oscar',
    independent=['q1_release', 'q2_release', 'q3_release', 'q4_release'],
)
Q1 and Q4 have a higher coefficient than Q2 and Q3, so that points in the right direction...
This won't really help us determine who will win the actual Oscar, but at least we know that if we want a shot, we need to be releasing in late Q4 and early Q1.
In [179]:
# Derive a two-character decade label from the third character of the year's
# string form followed by "0" (e.g. a year printed as "1995" becomes "90").
# NOTE(review): only the tens digit is kept, so years a century apart share a
# label (e.g. 1929 and 2020 both give "20") -- confirm the dataset's year range.
oscars['decade'] = oscars['year'].map(lambda year: "{0}0".format(str(year)[2]))
In [186]:
# Build a separate frame restricted to rows where both `budget` and
# `box_office` are present, then add two derived columns:
#   profit = box_office - budget
#   margin = profit / box_office   (a fraction of box office, not a percentage)
# `.copy()` makes `profit` an independent frame, so the column assignments
# below do not write into a slice of `oscars` (avoids SettingWithCopyWarning).
profit = oscars.dropna(subset=['budget', 'box_office']).copy()
profit['profit'] = profit['box_office'] - profit['budget']
# NOTE(review): a row with box_office == 0 would yield inf/NaN here -- confirm
# whether such rows can occur in the data.
profit['margin'] = profit['profit'] / profit['box_office']
In [187]:
# Mean profit margin per award category: all rows vs. Oscar winners only.
avg_margin_for_all = profit.groupby(['category'])['margin'].mean()
# Align the winners' series to the same categories (and order) as the "all"
# series; a category without any winner in `profit` becomes NaN instead of
# making the two series different lengths.
avg_margin_for_win = (
    profit[profit['Oscar'] == 1]
    .groupby(['category'])['margin']
    .mean()
    .reindex(avg_margin_for_all.index)
)

# Take the labels from the grouped index so each tick names the bar drawn at
# that position. groupby sorts its keys, whereas `unique()` returns them in
# order of appearance, so using `unique()` for labels can mislabel the bars.
categories = avg_margin_for_all.index
index = np.arange(len(categories))
bar_width = 0.45

fig, ax = plt.subplots()
# Offset the two series around each tick so they sit side by side rather than
# one hiding the other. `margin` is a fraction, so scale by 100 to match the
# percent axis label.
rects1 = ax.bar(index - bar_width / 2, avg_margin_for_win * 100, bar_width,
                color='r', label='Won', align='center')
rects2 = ax.bar(index + bar_width / 2, avg_margin_for_all * 100, bar_width,
                color='b', label='All', align='center')
ax.set_xlabel('Award Category')
# Pin the tick positions before labelling them; otherwise the labels are
# applied to whatever ticks the locator happened to choose.
ax.set_xticks(index)
ax.set_xticklabels(categories, rotation='vertical')
ax.set_ylabel('Profit Margin (%)')
ax.set_title('Average Profit Margin by Award Category')
ax.legend()
plt.show()
In [82]:
# Rows of `profit` that won the Oscar (Oscar == 1) despite a negative profit,
# shown with the identifying and financial columns only.
fields = ['year', 'film', 'category', 'name', 'budget', 'box_office', 'profit', 'margin']
profit.loc[(profit['Oscar'] == 1) & (profit['profit'] < 0), fields]
Out[82]:
In [249]:
# Keep only the category plus the four award columns, then preview five rows.
winning_awards = oscars.loc[:, ['category', 'Oscar', 'BAFTA', 'Golden Globe', 'Guild']]
winning_awards.head(5)
Out[249]:
In [244]:
# For Oscar winners in the four acting categories, count how each of the other
# three award columns (BAFTA, Golden Globe, Guild) is distributed, one panel
# per award, then print the Pearson correlation of each with 'Oscar'.
acting_categories = ['Actor', 'Actress', 'Supporting Actor', 'Supporting Actress']
y = winning_awards[(winning_awards['Oscar'] == 1)&(winning_awards['category'].isin(acting_categories))]
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True)
# plt.title() would title only the current axes (a single panel);
# suptitle places the heading over the whole three-panel figure.
fig.suptitle('Count Plot of Wins by Award')
sb.countplot(x="BAFTA", data=y, ax=ax1)
sb.countplot(x="Golden Globe", data=y, ax=ax2)
sb.countplot(x="Guild", data=y, ax=ax3)
# Parenthesised single argument: same output under Python 2 and Python 3.
print("Pearson correlation for acting categories\n")
# The correlation uses every row in these categories (not just the winners in `y`).
print_pearsonr(oscars[oscars['category'].isin(acting_categories)], 'Oscar', ['BAFTA', 'Golden Globe', 'Guild'])
It looks like the Golden Globes and Screen Actors Guild awards are better indicators of Oscar success than the BAFTAs. Let's take a look at the same analysis, but for Best Picture. The "Guild" award we use is the Screen Actors Guild Award for Outstanding Performance by a Cast in a Motion Picture.
In [247]:
# Same analysis as for the acting categories, restricted to the 'Picture'
# category: count plots of the other three award columns among Oscar winners,
# followed by the Pearson correlation of each with 'Oscar'.
y = winning_awards[(winning_awards['Oscar'] == 1)&(winning_awards['category'] == 'Picture')]
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True)
# plt.title() would title only the current axes (a single panel);
# suptitle places the heading over the whole three-panel figure.
fig.suptitle('Count Plot of Wins by Award')
sb.countplot(x="BAFTA", data=y, ax=ax1)
sb.countplot(x="Golden Globe", data=y, ax=ax2)
sb.countplot(x="Guild", data=y, ax=ax3)
# Header corrected: this cell reports Best Picture, not the acting categories.
# Parenthesised single argument: same output under Python 2 and Python 3.
print("Pearson correlation for Best Picture\n")
# The correlation uses every 'Picture' row (not just the winners in `y`).
print_pearsonr(oscars[oscars['category'] == 'Picture'], 'Oscar', ['BAFTA', 'Golden Globe', 'Guild'])
Seems like the BAFTAs hold a bit more weight than the SAG awards, but the Golden Globes are still the best way to forecast an Oscar win.