Exploring the movies dataset

Loading the dataset


In [1]:
import os

import pandas as pd
import sklearn as skl

import holcrawl.shared

In [2]:
dataset_dir = holcrawl.shared._get_dataset_dir_path()

In [3]:
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')

In [4]:
df = pd.read_csv(dataset_path)

In [5]:
df.year.value_counts()


Out[5]:
2009.0    163
2011.0    152
2012.0    148
2014.0    129
2013.0    122
2010.0    105
2015.0     67
2008.0     14
Name: year, dtype: int64

In [6]:
list(df.columns)


Out[6]:
['avg_screens',
 'budget',
 'budget_currency',
 'closing_date',
 'critic_review_count',
 'duration',
 'gross_income',
 'imdb_user_reviews',
 'max_screens',
 'mc_avg_user_score',
 'mc_metascore',
 'mc_mixed_rating_frequency',
 'mc_movie_name',
 'mc_negative_rating_frequency',
 'mc_positive_rating_frequency',
 'mc_pro_critic_reviews',
 'mc_user_reviews',
 'metascore',
 'name',
 'num_weekends',
 'opening_weekend_date',
 'opening_weekend_income',
 'opening_weekend_income_currency',
 'opening_weekend_screens',
 'rating',
 'rating_count',
 'release_day',
 'release_month',
 'release_year',
 'screens_by_weekend',
 'total_screens',
 'user_review_count',
 'year',
 'avg_rating_per_demo.aged_18-29',
 'avg_rating_per_demo.aged_30-44',
 'avg_rating_per_demo.aged_45+',
 'avg_rating_per_demo.aged_under_18',
 'avg_rating_per_demo.females',
 'avg_rating_per_demo.females_aged_18-29',
 'avg_rating_per_demo.females_aged_30-44',
 'avg_rating_per_demo.females_aged_45+',
 'avg_rating_per_demo.females_under_18',
 'avg_rating_per_demo.imdb_staff',
 'avg_rating_per_demo.imdb_users',
 'avg_rating_per_demo.males',
 'avg_rating_per_demo.males_aged_18-29',
 'avg_rating_per_demo.males_aged_30-44',
 'avg_rating_per_demo.males_aged_45+',
 'avg_rating_per_demo.males_under_18',
 'avg_rating_per_demo.non-us_users',
 'avg_rating_per_demo.top_1000_voters',
 'avg_rating_per_demo.us_users',
 'votes_per_demo.aged_18-29',
 'votes_per_demo.aged_30-44',
 'votes_per_demo.aged_45+',
 'votes_per_demo.aged_under_18',
 'votes_per_demo.females',
 'votes_per_demo.females_aged_18-29',
 'votes_per_demo.females_aged_30-44',
 'votes_per_demo.females_aged_45+',
 'votes_per_demo.females_under_18',
 'votes_per_demo.imdb_staff',
 'votes_per_demo.imdb_users',
 'votes_per_demo.males',
 'votes_per_demo.males_aged_18-29',
 'votes_per_demo.males_aged_30-44',
 'votes_per_demo.males_aged_45+',
 'votes_per_demo.males_under_18',
 'votes_per_demo.non-us_users',
 'votes_per_demo.top_1000_voters',
 'votes_per_demo.us_users',
 'rating_freq.1',
 'rating_freq.10',
 'rating_freq.2',
 'rating_freq.3',
 'rating_freq.4',
 'rating_freq.5',
 'rating_freq.6',
 'rating_freq.7',
 'rating_freq.8',
 'rating_freq.9',
 'genres.action',
 'genres.adventure',
 'genres.animation',
 'genres.biography',
 'genres.comedy',
 'genres.crime',
 'genres.documentary',
 'genres.drama',
 'genres.family',
 'genres.fantasy',
 'genres.history',
 'genres.horror',
 'genres.music',
 'genres.musical',
 'genres.mystery',
 'genres.news',
 'genres.romance',
 'genres.sci-fi',
 'genres.sport',
 'genres.thriller',
 'genres.war',
 'genres.western',
 'num_mc_critic',
 'avg_mc_critic',
 'num_mc_critic_by_opening',
 'avg_mc_critic_by_opening',
 'num_mc_user',
 'avg_mc_user',
 'num_mc_user_by_opening',
 'avg_mc_user_by_opening',
 'num_imdb_user',
 'avg_imdb_user',
 'num_imdb_user_by_opening',
 'avg_imdb_user_by_opening',
 'opening_month',
 'opening_day',
 'opening_day_of_year']

Feature Generation


In [8]:
df['norm_gross'] = df['gross_income'] / df['budget']

In [9]:
df['profit'] = df['gross_income'] - df['budget']

In [10]:
df['ROI'] = (df['gross_income'] - df['budget']) / df['budget']

In [11]:
df['name_length'] = df['name'].map(lambda name: len(name))

In [12]:
len(df)


Out[12]:
900

In [13]:
df.isnull().sum()


Out[13]:
avg_screens                          0
budget                             149
budget_currency                    149
closing_date                         0
critic_review_count                  0
duration                             0
gross_income                         0
imdb_user_reviews                    0
max_screens                          0
mc_avg_user_score                    0
mc_metascore                         0
mc_mixed_rating_frequency            0
mc_movie_name                        0
mc_negative_rating_frequency         0
mc_positive_rating_frequency         0
mc_pro_critic_reviews                0
mc_user_reviews                      0
metascore                            3
name                                 0
num_weekends                         0
opening_weekend_date                 0
opening_weekend_income               0
opening_weekend_income_currency      0
opening_weekend_screens              0
rating                               0
rating_count                         0
release_day                        191
release_month                      191
release_year                       191
screens_by_weekend                   0
                                  ... 
genres.horror                        0
genres.music                         0
genres.musical                       0
genres.mystery                       0
genres.news                          0
genres.romance                       0
genres.sci-fi                        0
genres.sport                         0
genres.thriller                      0
genres.war                           0
genres.western                       0
num_mc_critic                        0
avg_mc_critic                      235
num_mc_critic_by_opening             0
avg_mc_critic_by_opening           266
num_mc_user                          0
avg_mc_user                         16
num_mc_user_by_opening               0
avg_mc_user_by_opening             441
num_imdb_user                        0
avg_imdb_user                        0
num_imdb_user_by_opening             0
avg_imdb_user_by_opening            45
opening_month                        0
opening_day                          0
opening_day_of_year                  0
norm_gross                         149
profit                             149
ROI                                149
name_length                          0
dtype: int64

In [14]:
len(df[df['avg_mc_critic_by_opening'].notnull()])


Out[14]:
634

In [15]:
# df['opening_weekend_date']

In [16]:
BASE_FEAT_TO_KEEP = [
    'duration', 'budget', 'opening_month', 'opening_day', 'opening_day_of_year', 'year',
    'avg_mc_critic_by_opening', 'num_mc_critic_by_opening', 'name_length', 
    'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_weekend_screens'# 'avg_mc_user_by_opening'
]

In [17]:
FEAT_TO_KEEP = BASE_FEAT_TO_KEEP + [col for col in df.columns if 'genres' in col]

In [18]:
features = df.drop([col for col in df.columns if col not in BASE_FEAT_TO_KEEP], axis=1)

In [19]:
dataset = df.drop([col for col in df.columns if col not in FEAT_TO_KEEP], axis=1)

In [20]:
# letter_dummies = pd.get_dummies(df['starting_letter'], drop_first=True, prefix='fl')

In [21]:
# dataset = dataset.assign(**{col: letter_dummies[col] for col in letter_dummies.columns})

In [22]:
dataset = dataset.dropna(axis=0)

In [23]:
len(dataset)


Out[23]:
518

In [24]:
import seaborn as sns
%matplotlib inline

In [25]:
dataset.year.unique()


Out[25]:
array([ 2014.,  2012.,  2013.,  2011.,  2015.,  2010.])

In [26]:
dataset.year.value_counts()


Out[26]:
2011.0    119
2012.0    117
2014.0    106
2013.0     94
2015.0     63
2010.0     19
Name: year, dtype: int64

In [27]:
sns.distplot(dataset.year)


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fa96c50>

In [28]:
dataset.columns


Out[28]:
Index(['budget', 'duration', 'opening_weekend_screens', 'year',
       'genres.action', 'genres.adventure', 'genres.animation',
       'genres.biography', 'genres.comedy', 'genres.crime',
       'genres.documentary', 'genres.drama', 'genres.family', 'genres.fantasy',
       'genres.history', 'genres.horror', 'genres.music', 'genres.musical',
       'genres.mystery', 'genres.news', 'genres.romance', 'genres.sci-fi',
       'genres.sport', 'genres.thriller', 'genres.war', 'genres.western',
       'num_mc_critic_by_opening', 'avg_mc_critic_by_opening',
       'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_month',
       'opening_day', 'opening_day_of_year', 'name_length'],
      dtype='object')

In [29]:
# pd.options.display.max_columns = 999
# dataset

Exploration


In [30]:
import numpy as np

In [31]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
def draw_pearson_correlation(feature_df):
    pearson_df = pd.DataFrame(
        data=np.corrcoef(feature_df.T),
        index=feature_df.columns,
        columns=feature_df.columns
    )
    
    lower_left_mask = []
    for i in range(len(feature_df.columns)):
        lower_left_mask.append([i<j+1 for j in range(len(feature_df.columns))])
    lower_left_mask = np.array(lower_left_mask)
    
    plt.figure(figsize=(13,10))
    with sns.axes_style("white"):
        heatmap = sns.heatmap(
            pearson_df, 
            annot=True, 
            fmt=".2f", 
            linewidths=.5,
#             mask=lower_left_mask
        )
        heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), fontsize=14);
        heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=14)

In [33]:
features_for_pearson = features

In [34]:
features_for_pearson['ROI'] = df.ix[features_for_pearson.index]['ROI']

In [35]:
features_for_pearson['gross_income'] = df.ix[features_for_pearson.index]['gross_income']

In [36]:
features_for_pearson = features_for_pearson.dropna(axis=0)

In [37]:
features_for_pearson.columns


Out[37]:
Index(['budget', 'duration', 'opening_weekend_screens', 'year',
       'num_mc_critic_by_opening', 'avg_mc_critic_by_opening',
       'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_month',
       'opening_day', 'opening_day_of_year', 'name_length', 'ROI',
       'gross_income'],
      dtype='object')

In [38]:
draw_pearson_correlation(features_for_pearson)



In [39]:
pd.options.display.max_rows = 155
df[df['genres.action'] == 1][['budget', 'gross_income', 'name']]


Out[39]:
budget gross_income name
1 22000000.0 12232937.0 12 Rounds
4 200000000.0 166112167.0 2012
5 42000000.0 138447667.0 21 Jump Street
6 50000000.0 191616238.0 22 Jump Street
8 61000000.0 75573300.0 2 Guns
9 110000000.0 106369117.0 300: Rise of an Empire
10 28000000.0 37053924.0 30 Minutes or Less
11 28000000.0 30688364.0 3 Days to Kill
13 175000000.0 38297305.0 47 Ronin
15 30000000.0 31743332.0 9
17 NaN 5750.0 A Dark Truth
18 92000000.0 67344392.0 A Good Day to Die Hard
33 69000000.0 37516013.0 Abraham Lincoln: Vampire Hunter
35 12000000.0 70011073.0 Act of Valor
41 130000000.0 60522097.0 After Earth
42 2000000.0 40179.0 Aftershock
43 35000000.0 25863915.0 Alex Cross
49 9000000.0 6262942.0 All Is Lost
62 130000000.0 180191634.0 Ant-Man
67 27000000.0 15988876.0 Armored
71 65000000.0 19548064.0 Astro Boy
75 237000000.0 760505847.0 Avatar
76 250000000.0 458991599.0 Avengers: Age of Ultron
83 70000000.0 83552429.0 Battle: Los Angeles
85 209000000.0 65173160.0 Battleship
98 165000000.0 222487711.0 Big Hero 6
100 32000000.0 37911876.0 Big Mommas: Like Father, Like Son
114 28000000.0 20285518.0 Brick Mansions
120 55000000.0 9483821.0 Bullet to the Head
125 140000000.0 176636816.0 Captain America: The First Avenger
126 170000000.0 259746958.0 Captain America: The Winter Soldier
132 85000000.0 43575716.0 Cats & Dogs: The Revenge of Kitty Galore
135 49000000.0 31569268.0 Chappie
143 125000000.0 163192114.0 Clash of the Titans
150 40000000.0 36665854.0 Colombiana
153 90000000.0 21270904.0 Conan the Barbarian
157 25000000.0 66489425.0 Contraband
160 37000000.0 44867349.0 Cop Out
166 163000000.0 100215116.0 Cowboys & Aliens
167 20000000.0 13630226.0 Crank: High Voltage
176 25000000.0 25615792.0 Dance Flick
182 170000000.0 208543795.0 Dawn of the Planet of the Apes
183 20000000.0 29975979.0 Daybreakers
184 30000000.0 10880926.0 Dead Man Down
197 30000000.0 115646235.0 District 9
212 20000000.0 1183354.0 Dylan Dog: Dead of Night
216 NaN 500154.0 Echelon Conspiracy
217 115000000.0 93050117.0 Elysium
220 110000000.0 61656849.0 Ender's Game
226 50000000.0 25121291.0 Escape Plan
231 140000000.0 65007045.0 Exodus: Gods and Kings
237 85000000.0 155022220.0 Fast & Furious
238 160000000.0 238673370.0 Fast & Furious 6
239 125000000.0 209805005.0 Fast Five
241 NaN 23036320.0 Fighting
258 52000000.0 23324666.0 From Paris with Love
263 190000000.0 350034110.0 Furious 7
265 150000000.0 119420252.0 G-Force
266 50000000.0 20488579.0 Gamer
267 60000000.0 45996718.0 Gangster Squad
272 18000000.0 10494494.0 Getaway
273 57000000.0 51774002.0 Ghost Rider: Spirit of Vengeance
275 175000000.0 150167630.0 G.I. Joe: The Rise of Cobra
280 160000000.0 200661309.0 Godzilla
283 200000000.0 116593191.0 Green Lantern
284 100000000.0 35024475.0 Green Zone
289 170000000.0 333130696.0 Guardians of the Galaxy
292 30000000.0 40247512.0 Hanna
299 23000000.0 18934858.0 Haywire
304 42000000.0 45290318.0 Here Comes the Boom
307 2000000.0 13746550.0 Hit and Run
308 30000000.0 10134754.0 Hoodwinked Too! Hood vs. Evil
312 35000000.0 34507079.0 Hot Pursuit
317 145000000.0 176997107.0 How to Train Your Dragon 2
321 60000000.0 55092830.0 I Am Number Four
323 65000000.0 19059018.0 I, Frankenstein
332 90000000.0 196573705.0 Ice Age: Dawn of the Dinosaurs
... ... ... ...
558 130000000.0 33592415.0 R.I.P.D.
561 200000000.0 105219735.0 Robin Hood
562 100000000.0 58607007.0 RoboCop
564 50000000.0 26442251.0 Run All Night
566 35000000.0 10499968.0 Sabotage
567 30000000.0 17120019.0 Safe
568 85000000.0 126149655.0 Safe House
571 110000000.0 118311368.0 Salt
573 110000000.0 155181732.0 San Andreas
579 60000000.0 31494270.0 Scott Pilgrim vs. the World
581 40000000.0 24823283.0 Season of the Witch
585 95000000.0 17176900.0 Seventh Son
589 90000000.0 209019489.0 Sherlock Holmes
601 10000000.0 21371425.0 Skyline
604 15000000.0 42919096.0 Snitch
620 65000000.0 110822419.0 Spy
622 150000000.0 257704099.0 Star Trek
625 35000000.0 183125.0 Stolen
627 25000000.0 10324441.0 Straw Dogs
628 18000000.0 8742261.0 Street Fighter: The Legend of Chun-Li
629 82000000.0 36381716.0 Sucker Punch
631 80000000.0 38542418.0 Surrogates
634 45000000.0 139852971.0 Taken 2
635 20000000.0 57744720.0 Takers
642 155000000.0 89732035.0 Terminator Genisys
643 200000000.0 125320003.0 Terminator Salvation
651 230000000.0 262030663.0 The Amazing Spider-Man
652 200000000.0 202853933.0 The Amazing Spider-Man 2
655 220000000.0 623279547.0 The Avengers
664 80000000.0 94822707.0 The Book of Eli
666 8000000.0 10269307.0 The Boondock Saints II: All Saints Day
667 125000000.0 113165635.0 The Bourne Legacy
677 20000000.0 3749061.0 The Cold Light of Day
688 250000000.0 448130642.0 The Dark Knight Rises
689 30000000.0 21426805.0 The Darkest Hour
696 25000000.0 19478384.0 The Eagle
697 55000000.0 101530738.0 The Equalizer
698 100000000.0 85017401.0 The Expendables 2
699 90000000.0 39292022.0 The Expendables 3
714 120000000.0 98780042.0 The Green Hornet
715 25000000.0 51533608.0 The Grey
716 40000000.0 10640645.0 The Gunman
720 43000000.0 159578352.0 The Heat
724 40000000.0 26616999.0 The Host
726 78000000.0 407999255.0 The Hunger Games
727 130000000.0 424645577.0 The Hunger Games: Catching Fire
733 50000000.0 25450527.0 The International
737 40000000.0 176591618.0 The Karate Kid
745 45000000.0 12026670.0 The Last Stand
755 15000000.0 15608545.0 The Man with the Iron Fists
758 34000000.0 102413606.0 The Maze Runner
759 40000000.0 29113588.0 The Mechanic
763 60000000.0 31165421.0 The Mortal Instruments: City of Bones
771 100000000.0 119219978.0 The Other Guys
783 9000000.0 71519230.0 The Purge: Anarchy
802 150000000.0 63143812.0 The Sorcerer's Apprentice
804 28000000.0 24268828.0 The Spy Next Door
808 75000000.0 20315324.0 The Three Musketeers
810 100000000.0 67631157.0 The Tourist
829 65000000.0 54758461.0 This Means War
830 150000000.0 181015141.0 Thor
831 190000000.0 93417865.0 Tomorrowland
835 75000000.0 78009155.0 Tower Heist
839 195000000.0 352358779.0 Transformers: Dark of the Moon
840 200000000.0 402076689.0 Transformers: Revenge of the Fallen
842 170000000.0 172051787.0 Tron: Legacy
847 35000000.0 45802315.0 Underworld: Rise of the Lycans
850 30000000.0 61094903.0 Unknown
851 100000000.0 81557479.0 Unstoppable
855 30000000.0 7791146.0 Vampire Academy
864 130000000.0 107503316.0 Watchmen
877 35000000.0 10268846.0 Whiteout
883 190000000.0 202351611.0 World War Z
885 150000000.0 83640426.0 Wrath of the Titans
887 200000000.0 233914986.0 X-Men: Days of Future Past
888 160000000.0 146405371.0 X-Men: First Class
889 150000000.0 179883016.0 X-Men Origins: Wolverine

216 rows × 3 columns


In [40]:
list(df.columns)


Out[40]:
['avg_screens',
 'budget',
 'budget_currency',
 'closing_date',
 'critic_review_count',
 'duration',
 'gross_income',
 'imdb_user_reviews',
 'max_screens',
 'mc_avg_user_score',
 'mc_metascore',
 'mc_mixed_rating_frequency',
 'mc_movie_name',
 'mc_negative_rating_frequency',
 'mc_positive_rating_frequency',
 'mc_pro_critic_reviews',
 'mc_user_reviews',
 'metascore',
 'name',
 'num_weekends',
 'opening_weekend_date',
 'opening_weekend_income',
 'opening_weekend_income_currency',
 'opening_weekend_screens',
 'rating',
 'rating_count',
 'release_day',
 'release_month',
 'release_year',
 'screens_by_weekend',
 'total_screens',
 'user_review_count',
 'year',
 'avg_rating_per_demo.aged_18-29',
 'avg_rating_per_demo.aged_30-44',
 'avg_rating_per_demo.aged_45+',
 'avg_rating_per_demo.aged_under_18',
 'avg_rating_per_demo.females',
 'avg_rating_per_demo.females_aged_18-29',
 'avg_rating_per_demo.females_aged_30-44',
 'avg_rating_per_demo.females_aged_45+',
 'avg_rating_per_demo.females_under_18',
 'avg_rating_per_demo.imdb_staff',
 'avg_rating_per_demo.imdb_users',
 'avg_rating_per_demo.males',
 'avg_rating_per_demo.males_aged_18-29',
 'avg_rating_per_demo.males_aged_30-44',
 'avg_rating_per_demo.males_aged_45+',
 'avg_rating_per_demo.males_under_18',
 'avg_rating_per_demo.non-us_users',
 'avg_rating_per_demo.top_1000_voters',
 'avg_rating_per_demo.us_users',
 'votes_per_demo.aged_18-29',
 'votes_per_demo.aged_30-44',
 'votes_per_demo.aged_45+',
 'votes_per_demo.aged_under_18',
 'votes_per_demo.females',
 'votes_per_demo.females_aged_18-29',
 'votes_per_demo.females_aged_30-44',
 'votes_per_demo.females_aged_45+',
 'votes_per_demo.females_under_18',
 'votes_per_demo.imdb_staff',
 'votes_per_demo.imdb_users',
 'votes_per_demo.males',
 'votes_per_demo.males_aged_18-29',
 'votes_per_demo.males_aged_30-44',
 'votes_per_demo.males_aged_45+',
 'votes_per_demo.males_under_18',
 'votes_per_demo.non-us_users',
 'votes_per_demo.top_1000_voters',
 'votes_per_demo.us_users',
 'rating_freq.1',
 'rating_freq.10',
 'rating_freq.2',
 'rating_freq.3',
 'rating_freq.4',
 'rating_freq.5',
 'rating_freq.6',
 'rating_freq.7',
 'rating_freq.8',
 'rating_freq.9',
 'genres.action',
 'genres.adventure',
 'genres.animation',
 'genres.biography',
 'genres.comedy',
 'genres.crime',
 'genres.documentary',
 'genres.drama',
 'genres.family',
 'genres.fantasy',
 'genres.history',
 'genres.horror',
 'genres.music',
 'genres.musical',
 'genres.mystery',
 'genres.news',
 'genres.romance',
 'genres.sci-fi',
 'genres.sport',
 'genres.thriller',
 'genres.war',
 'genres.western',
 'num_mc_critic',
 'avg_mc_critic',
 'num_mc_critic_by_opening',
 'avg_mc_critic_by_opening',
 'num_mc_user',
 'avg_mc_user',
 'num_mc_user_by_opening',
 'avg_mc_user_by_opening',
 'num_imdb_user',
 'avg_imdb_user',
 'num_imdb_user_by_opening',
 'avg_imdb_user_by_opening',
 'opening_month',
 'opening_day',
 'opening_day_of_year',
 'norm_gross',
 'profit',
 'ROI',
 'name_length']

In [41]:
GENRES_COLS = [col for col in df.columns if 'genres' in col]

In [42]:
def _average_col_by_budget(df, colnames, norm_factor=1, labels=None, figsize=None):
    coldf = pd.DataFrame(
        data=np.array([
            [
                (np.mean(df[df[genre] == 1][colname]) / norm_factor) for genre in GENRES_COLS
            ] for colname in colnames
        ]).transpose(), 
        index=[col[col.rfind('.')+1:] for col in GENRES_COLS], 
        columns=(labels or colnames)
    )
    ax = coldf.plot(kind='bar', figsize=(figsize or (15, 10)), legend=True, fontsize=13)
    plt.xticks(rotation=45, ha='right')
    plt.title("Average {} by genre".format(', '.join(labels or colnames)), fontsize=16)
    plt.legend(prop={'size':14})

In [43]:
_average_col_by_budget(df, ['ROI'])



In [44]:
len(df[df['genres.documentary'] == 1])


Out[44]:
37

In [45]:
_average_col_by_budget(df, ['budget', 'gross_income'], norm_factor=1000000, labels=['Budget', 'Gross Income'])



In [46]:
df['mc_avg_user_score_scaled'] = df['mc_avg_user_score'] * 10
df['rating_scaled'] = df['rating'] * 10

In [47]:
_average_col_by_budget(df, ['metascore', 'mc_avg_user_score_scaled','rating_scaled'], figsize=(10,8),
                      labels=['Metascore', 'Metacritic user rating', 'IMDB user rating'])
plt.ylim(0, 90);



In [48]:
_average_col_by_budget(df, ['duration'], labels=['Duration'])



In [49]:
_average_col_by_budget(df, ['opening_month'], labels=['Month of release'])



In [50]:
_average_col_by_budget(df, ['opening_day'], labels=['Release day of month'])


IMDB Rating vs Number of rating


In [51]:
np.corrcoef([df.rating, df.rating_count])[0,1]


Out[51]:
0.48975375623733669

In [52]:
plt.figure(figsize=(9,8))
sns.regplot(x=df.rating, y=df.rating_count)
plt.ylim(-50000, plt.ylim()[1])
# plt.scatter(df.rating , df.rating_count)
plt.xlabel('IMDB Rating')
plt.ylabel('Number of IMDB ratings')
plt.title('IMDB rating vs # of IMDB ratings');


Rating by opening vs gross income


In [53]:
np.corrcoef([dataset.avg_imdb_user_by_opening, df.ix[dataset.index].gross_income])[0,1]


Out[53]:
0.20553268564903468

In [54]:
plt.figure(figsize=(9,8))
sns.regplot(x=dataset.avg_imdb_user_by_opening, y=df.ix[dataset.index].gross_income / 1000000)
# plt.ylim(-50000, plt.ylim()[1])
# plt.scatter(df.rating , df.rating_count)
plt.xlabel('IMDB rating by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
# plt.title('IMDB rating by opening weekend');


# ratings by opening vs gross income


In [55]:
np.corrcoef([dataset.num_imdb_user_by_opening, df.ix[dataset.index].gross_income])[0,1]


Out[55]:
0.62904497701915196

In [56]:
plt.figure(figsize=(9,8))
# sns.regplot(x=dataset.avg_imdb_user_by_opening, y=df.ix[dataset.index].gross_income / 1000000)
sns.regplot(x=dataset.num_imdb_user_by_opening, y=df.ix[dataset.index].gross_income / 1000000, color='seagreen')
# plt.ylim(-50000, plt.ylim()[1])
# plt.scatter(df.rating , df.rating_count)
plt.xlabel('# of IMDB ratings by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
# plt.title('IMDB rating by opening weekend');


avg critic ratings by opening vs gross income


In [57]:
np.corrcoef([dataset.avg_mc_critic_by_opening, df.ix[dataset.index].gross_income])[0,1]


Out[57]:
0.27594777935823789

In [58]:
plt.figure(figsize=(9,8))
sns.regplot(x=dataset.avg_mc_critic_by_opening, y=df.ix[dataset.index].gross_income / 1000000, color='seagreen')
plt.xlabel('Average critic rating by opening weekend')
plt.ylabel('Gross Income (in millions of $)');


# of critic ratings by opening vs gross income


In [59]:
np.corrcoef([dataset.num_mc_critic_by_opening, df.ix[dataset.index].gross_income])[0,1]


Out[59]:
0.48208842445095279

In [60]:
plt.figure(figsize=(9,8))
sns.regplot(x=dataset.num_mc_critic_by_opening, y=df.ix[dataset.index].gross_income / 1000000, color='seagreen')
plt.xlabel('# of critic ratings by opening weekend')
plt.ylabel('Gross Income (in millions of $)');


Budget vs Gross Income


In [61]:
sub_df = df[~(df.budget.isnull() | df.gross_income.isnull())]

In [62]:
len(sub_df)


Out[62]:
751

In [63]:
np.corrcoef([sub_df.budget, sub_df.gross_income])[0,1]


Out[63]:
0.70213665758145349

In [64]:
plt.figure(figsize=(11,9))
sns.regplot(x=sub_df.budget / 1000000, y=sub_df.gross_income / 1000000, color='red')


Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x11058cda0>

In [65]:
sub_df = df[~(df.budget.isnull() | df.ROI.isnull())]
plt.figure(figsize=(11,9))
sns.regplot(x=sub_df.budget / 1000000, y=sub_df.ROI, color='red')
np.corrcoef([sub_df.budget, sub_df.ROI])[0,1]


Out[65]:
-0.18565967992825205

rating and metascoe vs gross


In [69]:
sub_df = df.metascore.dropna()

In [71]:
np.corrcoef([df.ix[sub_df.index].metascore, df.ix[sub_df.index].gross_income])[0,1]


Out[71]:
0.20201486909133604

In [67]:
np.corrcoef([df.rating, df.gross_income])[0,1]


Out[67]:
0.26855259879389376

In [66]:
plt.figure(figsize=(13,9))
a = sns.regplot(x=df['rating']*10, y=np.log(df['gross_income']), color='yellow')
b = sns.regplot(x=df['metascore'], y=np.log(df['gross_income']), color='Green')
plt.xlabel('IMDB rating and Metascore')
plt.ylabel('Gross Income [log]')
plt.title('IMDB rating and Metascore vs Gross Income [log]');
plt.legend([a, b], ['IMDB', 'Metascore'],
           loc= 'upper center', fontsize= 'small');


/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/matplotlib/legend.py:634: UserWarning: Legend does not support <matplotlib.axes._subplots.AxesSubplot object at 0x10ff4e320> instances.
A proxy artist may be used instead.
See: http://matplotlib.org/users/legend_guide.html#using-proxy-artist
  "#using-proxy-artist".format(orig_handle)