Exploring the movies dataset

Loading the dataset



In [1]:

    
import os

import pandas as pd
import sklearn as skl

import holcrawl.shared



In [2]:

    
dataset_dir = holcrawl.shared._get_dataset_dir_path()



In [3]:

    
dataset_path = os.path.join(dataset_dir, 'movies_dataset.csv')



In [4]:

    
df = pd.read_csv(dataset_path)



In [5]:

    
df.year.value_counts()









    Out[5]:





2009.0    163
2011.0    152
2012.0    148
2014.0    129
2013.0    122
2010.0    105
2015.0     67
2008.0     14
Name: year, dtype: int64



In [6]:

    
list(df.columns)









    Out[6]:





['avg_screens',
 'budget',
 'budget_currency',
 'closing_date',
 'critic_review_count',
 'duration',
 'gross_income',
 'imdb_user_reviews',
 'max_screens',
 'mc_avg_user_score',
 'mc_metascore',
 'mc_mixed_rating_frequency',
 'mc_movie_name',
 'mc_negative_rating_frequency',
 'mc_positive_rating_frequency',
 'mc_pro_critic_reviews',
 'mc_user_reviews',
 'metascore',
 'name',
 'num_weekends',
 'opening_weekend_date',
 'opening_weekend_income',
 'opening_weekend_income_currency',
 'opening_weekend_screens',
 'rating',
 'rating_count',
 'release_day',
 'release_month',
 'release_year',
 'screens_by_weekend',
 'total_screens',
 'user_review_count',
 'year',
 'avg_rating_per_demo.aged_18-29',
 'avg_rating_per_demo.aged_30-44',
 'avg_rating_per_demo.aged_45+',
 'avg_rating_per_demo.aged_under_18',
 'avg_rating_per_demo.females',
 'avg_rating_per_demo.females_aged_18-29',
 'avg_rating_per_demo.females_aged_30-44',
 'avg_rating_per_demo.females_aged_45+',
 'avg_rating_per_demo.females_under_18',
 'avg_rating_per_demo.imdb_staff',
 'avg_rating_per_demo.imdb_users',
 'avg_rating_per_demo.males',
 'avg_rating_per_demo.males_aged_18-29',
 'avg_rating_per_demo.males_aged_30-44',
 'avg_rating_per_demo.males_aged_45+',
 'avg_rating_per_demo.males_under_18',
 'avg_rating_per_demo.non-us_users',
 'avg_rating_per_demo.top_1000_voters',
 'avg_rating_per_demo.us_users',
 'votes_per_demo.aged_18-29',
 'votes_per_demo.aged_30-44',
 'votes_per_demo.aged_45+',
 'votes_per_demo.aged_under_18',
 'votes_per_demo.females',
 'votes_per_demo.females_aged_18-29',
 'votes_per_demo.females_aged_30-44',
 'votes_per_demo.females_aged_45+',
 'votes_per_demo.females_under_18',
 'votes_per_demo.imdb_staff',
 'votes_per_demo.imdb_users',
 'votes_per_demo.males',
 'votes_per_demo.males_aged_18-29',
 'votes_per_demo.males_aged_30-44',
 'votes_per_demo.males_aged_45+',
 'votes_per_demo.males_under_18',
 'votes_per_demo.non-us_users',
 'votes_per_demo.top_1000_voters',
 'votes_per_demo.us_users',
 'rating_freq.1',
 'rating_freq.10',
 'rating_freq.2',
 'rating_freq.3',
 'rating_freq.4',
 'rating_freq.5',
 'rating_freq.6',
 'rating_freq.7',
 'rating_freq.8',
 'rating_freq.9',
 'genres.action',
 'genres.adventure',
 'genres.animation',
 'genres.biography',
 'genres.comedy',
 'genres.crime',
 'genres.documentary',
 'genres.drama',
 'genres.family',
 'genres.fantasy',
 'genres.history',
 'genres.horror',
 'genres.music',
 'genres.musical',
 'genres.mystery',
 'genres.news',
 'genres.romance',
 'genres.sci-fi',
 'genres.sport',
 'genres.thriller',
 'genres.war',
 'genres.western',
 'num_mc_critic',
 'avg_mc_critic',
 'num_mc_critic_by_opening',
 'avg_mc_critic_by_opening',
 'num_mc_user',
 'avg_mc_user',
 'num_mc_user_by_opening',
 'avg_mc_user_by_opening',
 'num_imdb_user',
 'avg_imdb_user',
 'num_imdb_user_by_opening',
 'avg_imdb_user_by_opening',
 'opening_month',
 'opening_day',
 'opening_day_of_year']

Feature Generation



In [8]:

    
df['norm_gross'] = df['gross_income'] / df['budget']



In [9]:

    
df['profit'] = df['gross_income'] - df['budget']



In [10]:

    
df['ROI'] = (df['gross_income'] - df['budget']) / df['budget']



In [11]:

    
df['name_length'] = df['name'].map(lambda name: len(name))



In [12]:

    
len(df)









    Out[12]:





900



In [13]:

    
df.isnull().sum()









    Out[13]:





avg_screens                          0
budget                             149
budget_currency                    149
closing_date                         0
critic_review_count                  0
duration                             0
gross_income                         0
imdb_user_reviews                    0
max_screens                          0
mc_avg_user_score                    0
mc_metascore                         0
mc_mixed_rating_frequency            0
mc_movie_name                        0
mc_negative_rating_frequency         0
mc_positive_rating_frequency         0
mc_pro_critic_reviews                0
mc_user_reviews                      0
metascore                            3
name                                 0
num_weekends                         0
opening_weekend_date                 0
opening_weekend_income               0
opening_weekend_income_currency      0
opening_weekend_screens              0
rating                               0
rating_count                         0
release_day                        191
release_month                      191
release_year                       191
screens_by_weekend                   0
                                  ... 
genres.horror                        0
genres.music                         0
genres.musical                       0
genres.mystery                       0
genres.news                          0
genres.romance                       0
genres.sci-fi                        0
genres.sport                         0
genres.thriller                      0
genres.war                           0
genres.western                       0
num_mc_critic                        0
avg_mc_critic                      235
num_mc_critic_by_opening             0
avg_mc_critic_by_opening           266
num_mc_user                          0
avg_mc_user                         16
num_mc_user_by_opening               0
avg_mc_user_by_opening             441
num_imdb_user                        0
avg_imdb_user                        0
num_imdb_user_by_opening             0
avg_imdb_user_by_opening            45
opening_month                        0
opening_day                          0
opening_day_of_year                  0
norm_gross                         149
profit                             149
ROI                                149
name_length                          0
dtype: int64



In [14]:

    
len(df[df['avg_mc_critic_by_opening'].notnull()])









    Out[14]:





634



In [15]:

    
# df['opening_weekend_date']



In [16]:

    
BASE_FEAT_TO_KEEP = [
    'duration', 'budget', 'opening_month', 'opening_day', 'opening_day_of_year', 'year',
    'avg_mc_critic_by_opening', 'num_mc_critic_by_opening', 'name_length', 
    'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_weekend_screens'# 'avg_mc_user_by_opening'
]



In [17]:

    
FEAT_TO_KEEP = BASE_FEAT_TO_KEEP + [col for col in df.columns if 'genres' in col]



In [18]:

    
features = df.drop([col for col in df.columns if col not in BASE_FEAT_TO_KEEP], axis=1)



In [19]:

    
dataset = df.drop([col for col in df.columns if col not in FEAT_TO_KEEP], axis=1)



In [20]:

    
# letter_dummies = pd.get_dummies(df['starting_letter'], drop_first=True, prefix='fl')



In [21]:

    
# dataset = dataset.assign(**{col: letter_dummies[col] for col in letter_dummies.columns})



In [22]:

    
dataset = dataset.dropna(axis=0)



In [23]:

    
len(dataset)









    Out[23]:





518



In [24]:

    
import seaborn as sns
%matplotlib inline



In [25]:

    
dataset.year.unique()









    Out[25]:





array([ 2014.,  2012.,  2013.,  2011.,  2015.,  2010.])



In [26]:

    
dataset.year.value_counts()









    Out[26]:





2011.0    119
2012.0    117
2014.0    106
2013.0     94
2015.0     63
2010.0     19
Name: year, dtype: int64



In [27]:

    
sns.distplot(dataset.year)









    Out[27]:





<matplotlib.axes._subplots.AxesSubplot at 0x10fa96c50>



In [28]:

    
dataset.columns









    Out[28]:





Index(['budget', 'duration', 'opening_weekend_screens', 'year',
       'genres.action', 'genres.adventure', 'genres.animation',
       'genres.biography', 'genres.comedy', 'genres.crime',
       'genres.documentary', 'genres.drama', 'genres.family', 'genres.fantasy',
       'genres.history', 'genres.horror', 'genres.music', 'genres.musical',
       'genres.mystery', 'genres.news', 'genres.romance', 'genres.sci-fi',
       'genres.sport', 'genres.thriller', 'genres.war', 'genres.western',
       'num_mc_critic_by_opening', 'avg_mc_critic_by_opening',
       'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_month',
       'opening_day', 'opening_day_of_year', 'name_length'],
      dtype='object')



In [29]:

    
# pd.options.display.max_columns = 999
# dataset

Exploration



In [30]:

    
import numpy as np



In [31]:

    
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



In [32]:

    
def draw_pearson_correlation(feature_df):
    pearson_df = pd.DataFrame(
        data=np.corrcoef(feature_df.T),
        index=feature_df.columns,
        columns=feature_df.columns
    )
    
    lower_left_mask = []
    for i in range(len(feature_df.columns)):
        lower_left_mask.append([i<j+1 for j in range(len(feature_df.columns))])
    lower_left_mask = np.array(lower_left_mask)
    
    plt.figure(figsize=(13,10))
    with sns.axes_style("white"):
        heatmap = sns.heatmap(
            pearson_df, 
            annot=True, 
            fmt=".2f", 
            linewidths=.5,
#             mask=lower_left_mask
        )
        heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), fontsize=14);
        heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=14)



In [33]:

    
features_for_pearson = features



In [34]:

    
features_for_pearson['ROI'] = df.ix[features_for_pearson.index]['ROI']



In [35]:

    
features_for_pearson['gross_income'] = df.ix[features_for_pearson.index]['gross_income']



In [36]:

    
features_for_pearson = features_for_pearson.dropna(axis=0)



In [37]:

    
features_for_pearson.columns









    Out[37]:





Index(['budget', 'duration', 'opening_weekend_screens', 'year',
       'num_mc_critic_by_opening', 'avg_mc_critic_by_opening',
       'num_imdb_user_by_opening', 'avg_imdb_user_by_opening', 'opening_month',
       'opening_day', 'opening_day_of_year', 'name_length', 'ROI',
       'gross_income'],
      dtype='object')



In [38]:

    
draw_pearson_correlation(features_for_pearson)



In [39]:

    
pd.options.display.max_rows = 155
df[df['genres.action'] == 1][['budget', 'gross_income', 'name']]









    Out[39]:






  
    
      
      budget
      gross_income
      name
    
  
  
    
      1
      22000000.0
      12232937.0
      12 Rounds
    
    
      4
      200000000.0
      166112167.0
      2012
    
    
      5
      42000000.0
      138447667.0
      21 Jump Street
    
    
      6
      50000000.0
      191616238.0
      22 Jump Street
    
    
      8
      61000000.0
      75573300.0
      2 Guns
    
    
      9
      110000000.0
      106369117.0
      300: Rise of an Empire
    
    
      10
      28000000.0
      37053924.0
      30 Minutes or Less
    
    
      11
      28000000.0
      30688364.0
      3 Days to Kill
    
    
      13
      175000000.0
      38297305.0
      47 Ronin
    
    
      15
      30000000.0
      31743332.0
      9
    
    
      17
      NaN
      5750.0
      A Dark Truth
    
    
      18
      92000000.0
      67344392.0
      A Good Day to Die Hard
    
    
      33
      69000000.0
      37516013.0
      Abraham Lincoln: Vampire Hunter
    
    
      35
      12000000.0
      70011073.0
      Act of Valor
    
    
      41
      130000000.0
      60522097.0
      After Earth
    
    
      42
      2000000.0
      40179.0
      Aftershock
    
    
      43
      35000000.0
      25863915.0
      Alex Cross
    
    
      49
      9000000.0
      6262942.0
      All Is Lost
    
    
      62
      130000000.0
      180191634.0
      Ant-Man
    
    
      67
      27000000.0
      15988876.0
      Armored
    
    
      71
      65000000.0
      19548064.0
      Astro Boy
    
    
      75
      237000000.0
      760505847.0
      Avatar
    
    
      76
      250000000.0
      458991599.0
      Avengers: Age of Ultron
    
    
      83
      70000000.0
      83552429.0
      Battle: Los Angeles
    
    
      85
      209000000.0
      65173160.0
      Battleship
    
    
      98
      165000000.0
      222487711.0
      Big Hero 6
    
    
      100
      32000000.0
      37911876.0
      Big Mommas: Like Father, Like Son
    
    
      114
      28000000.0
      20285518.0
      Brick Mansions
    
    
      120
      55000000.0
      9483821.0
      Bullet to the Head
    
    
      125
      140000000.0
      176636816.0
      Captain America: The First Avenger
    
    
      126
      170000000.0
      259746958.0
      Captain America: The Winter Soldier
    
    
      132
      85000000.0
      43575716.0
      Cats & Dogs: The Revenge of Kitty Galore
    
    
      135
      49000000.0
      31569268.0
      Chappie
    
    
      143
      125000000.0
      163192114.0
      Clash of the Titans
    
    
      150
      40000000.0
      36665854.0
      Colombiana
    
    
      153
      90000000.0
      21270904.0
      Conan the Barbarian
    
    
      157
      25000000.0
      66489425.0
      Contraband
    
    
      160
      37000000.0
      44867349.0
      Cop Out
    
    
      166
      163000000.0
      100215116.0
      Cowboys & Aliens
    
    
      167
      20000000.0
      13630226.0
      Crank: High Voltage
    
    
      176
      25000000.0
      25615792.0
      Dance Flick
    
    
      182
      170000000.0
      208543795.0
      Dawn of the Planet of the Apes
    
    
      183
      20000000.0
      29975979.0
      Daybreakers
    
    
      184
      30000000.0
      10880926.0
      Dead Man Down
    
    
      197
      30000000.0
      115646235.0
      District 9
    
    
      212
      20000000.0
      1183354.0
      Dylan Dog: Dead of Night
    
    
      216
      NaN
      500154.0
      Echelon Conspiracy
    
    
      217
      115000000.0
      93050117.0
      Elysium
    
    
      220
      110000000.0
      61656849.0
      Ender's Game
    
    
      226
      50000000.0
      25121291.0
      Escape Plan
    
    
      231
      140000000.0
      65007045.0
      Exodus: Gods and Kings
    
    
      237
      85000000.0
      155022220.0
      Fast & Furious
    
    
      238
      160000000.0
      238673370.0
      Fast & Furious 6
    
    
      239
      125000000.0
      209805005.0
      Fast Five
    
    
      241
      NaN
      23036320.0
      Fighting
    
    
      258
      52000000.0
      23324666.0
      From Paris with Love
    
    
      263
      190000000.0
      350034110.0
      Furious 7
    
    
      265
      150000000.0
      119420252.0
      G-Force
    
    
      266
      50000000.0
      20488579.0
      Gamer
    
    
      267
      60000000.0
      45996718.0
      Gangster Squad
    
    
      272
      18000000.0
      10494494.0
      Getaway
    
    
      273
      57000000.0
      51774002.0
      Ghost Rider: Spirit of Vengeance
    
    
      275
      175000000.0
      150167630.0
      G.I. Joe: The Rise of Cobra
    
    
      280
      160000000.0
      200661309.0
      Godzilla
    
    
      283
      200000000.0
      116593191.0
      Green Lantern
    
    
      284
      100000000.0
      35024475.0
      Green Zone
    
    
      289
      170000000.0
      333130696.0
      Guardians of the Galaxy
    
    
      292
      30000000.0
      40247512.0
      Hanna
    
    
      299
      23000000.0
      18934858.0
      Haywire
    
    
      304
      42000000.0
      45290318.0
      Here Comes the Boom
    
    
      307
      2000000.0
      13746550.0
      Hit and Run
    
    
      308
      30000000.0
      10134754.0
      Hoodwinked Too! Hood vs. Evil
    
    
      312
      35000000.0
      34507079.0
      Hot Pursuit
    
    
      317
      145000000.0
      176997107.0
      How to Train Your Dragon 2
    
    
      321
      60000000.0
      55092830.0
      I Am Number Four
    
    
      323
      65000000.0
      19059018.0
      I, Frankenstein
    
    
      332
      90000000.0
      196573705.0
      Ice Age: Dawn of the Dinosaurs
    
    
      ...
      ...
      ...
      ...
    
    
      558
      130000000.0
      33592415.0
      R.I.P.D.
    
    
      561
      200000000.0
      105219735.0
      Robin Hood
    
    
      562
      100000000.0
      58607007.0
      RoboCop
    
    
      564
      50000000.0
      26442251.0
      Run All Night
    
    
      566
      35000000.0
      10499968.0
      Sabotage
    
    
      567
      30000000.0
      17120019.0
      Safe
    
    
      568
      85000000.0
      126149655.0
      Safe House
    
    
      571
      110000000.0
      118311368.0
      Salt
    
    
      573
      110000000.0
      155181732.0
      San Andreas
    
    
      579
      60000000.0
      31494270.0
      Scott Pilgrim vs. the World
    
    
      581
      40000000.0
      24823283.0
      Season of the Witch
    
    
      585
      95000000.0
      17176900.0
      Seventh Son
    
    
      589
      90000000.0
      209019489.0
      Sherlock Holmes
    
    
      601
      10000000.0
      21371425.0
      Skyline
    
    
      604
      15000000.0
      42919096.0
      Snitch
    
    
      620
      65000000.0
      110822419.0
      Spy
    
    
      622
      150000000.0
      257704099.0
      Star Trek
    
    
      625
      35000000.0
      183125.0
      Stolen
    
    
      627
      25000000.0
      10324441.0
      Straw Dogs
    
    
      628
      18000000.0
      8742261.0
      Street Fighter: The Legend of Chun-Li
    
    
      629
      82000000.0
      36381716.0
      Sucker Punch
    
    
      631
      80000000.0
      38542418.0
      Surrogates
    
    
      634
      45000000.0
      139852971.0
      Taken 2
    
    
      635
      20000000.0
      57744720.0
      Takers
    
    
      642
      155000000.0
      89732035.0
      Terminator Genisys
    
    
      643
      200000000.0
      125320003.0
      Terminator Salvation
    
    
      651
      230000000.0
      262030663.0
      The Amazing Spider-Man
    
    
      652
      200000000.0
      202853933.0
      The Amazing Spider-Man 2
    
    
      655
      220000000.0
      623279547.0
      The Avengers
    
    
      664
      80000000.0
      94822707.0
      The Book of Eli
    
    
      666
      8000000.0
      10269307.0
      The Boondock Saints II: All Saints Day
    
    
      667
      125000000.0
      113165635.0
      The Bourne Legacy
    
    
      677
      20000000.0
      3749061.0
      The Cold Light of Day
    
    
      688
      250000000.0
      448130642.0
      The Dark Knight Rises
    
    
      689
      30000000.0
      21426805.0
      The Darkest Hour
    
    
      696
      25000000.0
      19478384.0
      The Eagle
    
    
      697
      55000000.0
      101530738.0
      The Equalizer
    
    
      698
      100000000.0
      85017401.0
      The Expendables 2
    
    
      699
      90000000.0
      39292022.0
      The Expendables 3
    
    
      714
      120000000.0
      98780042.0
      The Green Hornet
    
    
      715
      25000000.0
      51533608.0
      The Grey
    
    
      716
      40000000.0
      10640645.0
      The Gunman
    
    
      720
      43000000.0
      159578352.0
      The Heat
    
    
      724
      40000000.0
      26616999.0
      The Host
    
    
      726
      78000000.0
      407999255.0
      The Hunger Games
    
    
      727
      130000000.0
      424645577.0
      The Hunger Games: Catching Fire
    
    
      733
      50000000.0
      25450527.0
      The International
    
    
      737
      40000000.0
      176591618.0
      The Karate Kid
    
    
      745
      45000000.0
      12026670.0
      The Last Stand
    
    
      755
      15000000.0
      15608545.0
      The Man with the Iron Fists
    
    
      758
      34000000.0
      102413606.0
      The Maze Runner
    
    
      759
      40000000.0
      29113588.0
      The Mechanic
    
    
      763
      60000000.0
      31165421.0
      The Mortal Instruments: City of Bones
    
    
      771
      100000000.0
      119219978.0
      The Other Guys
    
    
      783
      9000000.0
      71519230.0
      The Purge: Anarchy
    
    
      802
      150000000.0
      63143812.0
      The Sorcerer's Apprentice
    
    
      804
      28000000.0
      24268828.0
      The Spy Next Door
    
    
      808
      75000000.0
      20315324.0
      The Three Musketeers
    
    
      810
      100000000.0
      67631157.0
      The Tourist
    
    
      829
      65000000.0
      54758461.0
      This Means War
    
    
      830
      150000000.0
      181015141.0
      Thor
    
    
      831
      190000000.0
      93417865.0
      Tomorrowland
    
    
      835
      75000000.0
      78009155.0
      Tower Heist
    
    
      839
      195000000.0
      352358779.0
      Transformers: Dark of the Moon
    
    
      840
      200000000.0
      402076689.0
      Transformers: Revenge of the Fallen
    
    
      842
      170000000.0
      172051787.0
      Tron: Legacy
    
    
      847
      35000000.0
      45802315.0
      Underworld: Rise of the Lycans
    
    
      850
      30000000.0
      61094903.0
      Unknown
    
    
      851
      100000000.0
      81557479.0
      Unstoppable
    
    
      855
      30000000.0
      7791146.0
      Vampire Academy
    
    
      864
      130000000.0
      107503316.0
      Watchmen
    
    
      877
      35000000.0
      10268846.0
      Whiteout
    
    
      883
      190000000.0
      202351611.0
      World War Z
    
    
      885
      150000000.0
      83640426.0
      Wrath of the Titans
    
    
      887
      200000000.0
      233914986.0
      X-Men: Days of Future Past
    
    
      888
      160000000.0
      146405371.0
      X-Men: First Class
    
    
      889
      150000000.0
      179883016.0
      X-Men Origins: Wolverine
    
  

216 rows × 3 columns



In [40]:

    
list(df.columns)









    Out[40]:





['avg_screens',
 'budget',
 'budget_currency',
 'closing_date',
 'critic_review_count',
 'duration',
 'gross_income',
 'imdb_user_reviews',
 'max_screens',
 'mc_avg_user_score',
 'mc_metascore',
 'mc_mixed_rating_frequency',
 'mc_movie_name',
 'mc_negative_rating_frequency',
 'mc_positive_rating_frequency',
 'mc_pro_critic_reviews',
 'mc_user_reviews',
 'metascore',
 'name',
 'num_weekends',
 'opening_weekend_date',
 'opening_weekend_income',
 'opening_weekend_income_currency',
 'opening_weekend_screens',
 'rating',
 'rating_count',
 'release_day',
 'release_month',
 'release_year',
 'screens_by_weekend',
 'total_screens',
 'user_review_count',
 'year',
 'avg_rating_per_demo.aged_18-29',
 'avg_rating_per_demo.aged_30-44',
 'avg_rating_per_demo.aged_45+',
 'avg_rating_per_demo.aged_under_18',
 'avg_rating_per_demo.females',
 'avg_rating_per_demo.females_aged_18-29',
 'avg_rating_per_demo.females_aged_30-44',
 'avg_rating_per_demo.females_aged_45+',
 'avg_rating_per_demo.females_under_18',
 'avg_rating_per_demo.imdb_staff',
 'avg_rating_per_demo.imdb_users',
 'avg_rating_per_demo.males',
 'avg_rating_per_demo.males_aged_18-29',
 'avg_rating_per_demo.males_aged_30-44',
 'avg_rating_per_demo.males_aged_45+',
 'avg_rating_per_demo.males_under_18',
 'avg_rating_per_demo.non-us_users',
 'avg_rating_per_demo.top_1000_voters',
 'avg_rating_per_demo.us_users',
 'votes_per_demo.aged_18-29',
 'votes_per_demo.aged_30-44',
 'votes_per_demo.aged_45+',
 'votes_per_demo.aged_under_18',
 'votes_per_demo.females',
 'votes_per_demo.females_aged_18-29',
 'votes_per_demo.females_aged_30-44',
 'votes_per_demo.females_aged_45+',
 'votes_per_demo.females_under_18',
 'votes_per_demo.imdb_staff',
 'votes_per_demo.imdb_users',
 'votes_per_demo.males',
 'votes_per_demo.males_aged_18-29',
 'votes_per_demo.males_aged_30-44',
 'votes_per_demo.males_aged_45+',
 'votes_per_demo.males_under_18',
 'votes_per_demo.non-us_users',
 'votes_per_demo.top_1000_voters',
 'votes_per_demo.us_users',
 'rating_freq.1',
 'rating_freq.10',
 'rating_freq.2',
 'rating_freq.3',
 'rating_freq.4',
 'rating_freq.5',
 'rating_freq.6',
 'rating_freq.7',
 'rating_freq.8',
 'rating_freq.9',
 'genres.action',
 'genres.adventure',
 'genres.animation',
 'genres.biography',
 'genres.comedy',
 'genres.crime',
 'genres.documentary',
 'genres.drama',
 'genres.family',
 'genres.fantasy',
 'genres.history',
 'genres.horror',
 'genres.music',
 'genres.musical',
 'genres.mystery',
 'genres.news',
 'genres.romance',
 'genres.sci-fi',
 'genres.sport',
 'genres.thriller',
 'genres.war',
 'genres.western',
 'num_mc_critic',
 'avg_mc_critic',
 'num_mc_critic_by_opening',
 'avg_mc_critic_by_opening',
 'num_mc_user',
 'avg_mc_user',
 'num_mc_user_by_opening',
 'avg_mc_user_by_opening',
 'num_imdb_user',
 'avg_imdb_user',
 'num_imdb_user_by_opening',
 'avg_imdb_user_by_opening',
 'opening_month',
 'opening_day',
 'opening_day_of_year',
 'norm_gross',
 'profit',
 'ROI',
 'name_length']



In [41]:

    
GENRES_COLS = [col for col in df.columns if 'genres' in col]



In [42]:

    
def _average_col_by_budget(df, colnames, norm_factor=1, labels=None, figsize=None):
    coldf = pd.DataFrame(
        data=np.array([
            [
                (np.mean(df[df[genre] == 1][colname]) / norm_factor) for genre in GENRES_COLS
            ] for colname in colnames
        ]).transpose(), 
        index=[col[col.rfind('.')+1:] for col in GENRES_COLS], 
        columns=(labels or colnames)
    )
    ax = coldf.plot(kind='bar', figsize=(figsize or (15, 10)), legend=True, fontsize=13)
    plt.xticks(rotation=45, ha='right')
    plt.title("Average {} by genre".format(', '.join(labels or colnames)), fontsize=16)
    plt.legend(prop={'size':14})



In [43]:

    
_average_col_by_budget(df, ['ROI'])



In [44]:

    
len(df[df['genres.documentary'] == 1])









    Out[44]:





37



In [45]:

    
_average_col_by_budget(df, ['budget', 'gross_income'], norm_factor=1000000, labels=['Budget', 'Gross Income'])



In [46]:

    
df['mc_avg_user_score_scaled'] = df['mc_avg_user_score'] * 10
df['rating_scaled'] = df['rating'] * 10



In [47]:

    
_average_col_by_budget(df, ['metascore', 'mc_avg_user_score_scaled','rating_scaled'], figsize=(10,8),
                      labels=['Metascore', 'Metacritic user rating', 'IMDB user rating'])
plt.ylim(0, 90);



In [48]:

    
_average_col_by_budget(df, ['duration'], labels=['Duration'])



In [49]:

    
_average_col_by_budget(df, ['opening_month'], labels=['Month of release'])



In [50]:

    
_average_col_by_budget(df, ['opening_day'], labels=['Release day of month'])

IMDB Rating vs Number of rating



In [51]:

    
np.corrcoef([df.rating, df.rating_count])[0,1]









    Out[51]:





0.48975375623733669



In [52]:

    
plt.figure(figsize=(9,8))
sns.regplot(x=df.rating, y=df.rating_count)
plt.ylim(-50000, plt.ylim()[1])
# plt.scatter(df.rating , df.rating_count)
plt.xlabel('IMDB Rating')
plt.ylabel('Number of IMDB ratings')
plt.title('IMDB rating vs # of IMDB ratings');

Rating by opening vs gross income



In [53]:

    
np.corrcoef([dataset.avg_imdb_user_by_opening, df.ix[dataset.index].gross_income])[0,1]









    Out[53]:





0.20553268564903468



In [54]:

    
plt.figure(figsize=(9,8))
sns.regplot(x=dataset.avg_imdb_user_by_opening, y=df.ix[dataset.index].gross_income / 1000000)
# plt.ylim(-50000, plt.ylim()[1])
# plt.scatter(df.rating , df.rating_count)
plt.xlabel('IMDB rating by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
# plt.title('IMDB rating by opening weekend');

# ratings by opening vs gross income



In [55]:

    
np.corrcoef([dataset.num_imdb_user_by_opening, df.ix[dataset.index].gross_income])[0,1]









    Out[55]:





0.62904497701915196



In [56]:

    
plt.figure(figsize=(9,8))
# sns.regplot(x=dataset.avg_imdb_user_by_opening, y=df.ix[dataset.index].gross_income / 1000000)
sns.regplot(x=dataset.num_imdb_user_by_opening, y=df.ix[dataset.index].gross_income / 1000000, color='seagreen')
# plt.ylim(-50000, plt.ylim()[1])
# plt.scatter(df.rating , df.rating_count)
plt.xlabel('# of IMDB ratings by opening weekend')
plt.ylabel('Gross Income (in millions of $)');
# plt.title('IMDB rating by opening weekend');

avg critic ratings by opening vs gross income



In [57]:

    
np.corrcoef([dataset.avg_mc_critic_by_opening, df.ix[dataset.index].gross_income])[0,1]









    Out[57]:





0.27594777935823789



In [58]:

    
plt.figure(figsize=(9,8))
sns.regplot(x=dataset.avg_mc_critic_by_opening, y=df.ix[dataset.index].gross_income / 1000000, color='seagreen')
plt.xlabel('Average critic rating by opening weekend')
plt.ylabel('Gross Income (in millions of $)');

# of critic ratings by opening vs gross income



In [59]:

    
np.corrcoef([dataset.num_mc_critic_by_opening, df.ix[dataset.index].gross_income])[0,1]









    Out[59]:





0.48208842445095279



In [60]:

    
plt.figure(figsize=(9,8))
sns.regplot(x=dataset.num_mc_critic_by_opening, y=df.ix[dataset.index].gross_income / 1000000, color='seagreen')
plt.xlabel('# of critic ratings by opening weekend')
plt.ylabel('Gross Income (in millions of $)');

Budget vs Gross Income



In [61]:

    
sub_df = df[~(df.budget.isnull() | df.gross_income.isnull())]



In [62]:

    
len(sub_df)









    Out[62]:





751



In [63]:

    
np.corrcoef([sub_df.budget, sub_df.gross_income])[0,1]









    Out[63]:





0.70213665758145349



In [64]:

    
plt.figure(figsize=(11,9))
sns.regplot(x=sub_df.budget / 1000000, y=sub_df.gross_income / 1000000, color='red')









    Out[64]:





<matplotlib.axes._subplots.AxesSubplot at 0x11058cda0>



In [65]:

    
sub_df = df[~(df.budget.isnull() | df.ROI.isnull())]
plt.figure(figsize=(11,9))
sns.regplot(x=sub_df.budget / 1000000, y=sub_df.ROI, color='red')
np.corrcoef([sub_df.budget, sub_df.ROI])[0,1]









    Out[65]:





-0.18565967992825205

rating and metascoe vs gross



In [69]:

    
sub_df = df.metascore.dropna()



In [71]:

    
np.corrcoef([df.ix[sub_df.index].metascore, df.ix[sub_df.index].gross_income])[0,1]









    Out[71]:





0.20201486909133604



In [67]:

    
np.corrcoef([df.rating, df.gross_income])[0,1]









    Out[67]:





0.26855259879389376



In [66]:

    
plt.figure(figsize=(13,9))
a = sns.regplot(x=df['rating']*10, y=np.log(df['gross_income']), color='yellow')
b = sns.regplot(x=df['metascore'], y=np.log(df['gross_income']), color='Green')
plt.xlabel('IMDB rating and Metascore')
plt.ylabel('Gross Income [log]')
plt.title('IMDB rating and Metascore vs Gross Income [log]');
plt.legend([a, b], ['IMDB', 'Metascore'],
           loc= 'upper center', fontsize= 'small');









    



/Users/shaypalachy/miniconda3/envs/ds/lib/python3.5/site-packages/matplotlib/legend.py:634: UserWarning: Legend does not support <matplotlib.axes._subplots.AxesSubplot object at 0x10ff4e320> instances.
A proxy artist may be used instead.
See: http://matplotlib.org/users/legend_guide.html#using-proxy-artist
  "#using-proxy-artist".format(orig_handle)

	budget	gross_income	name
1	22000000.0	12232937.0	12 Rounds
4	200000000.0	166112167.0	2012
5	42000000.0	138447667.0	21 Jump Street
6	50000000.0	191616238.0	22 Jump Street
8	61000000.0	75573300.0	2 Guns
9	110000000.0	106369117.0	300: Rise of an Empire
10	28000000.0	37053924.0	30 Minutes or Less
11	28000000.0	30688364.0	3 Days to Kill
13	175000000.0	38297305.0	47 Ronin
15	30000000.0	31743332.0	9
17	NaN	5750.0	A Dark Truth
18	92000000.0	67344392.0	A Good Day to Die Hard
33	69000000.0	37516013.0	Abraham Lincoln: Vampire Hunter
35	12000000.0	70011073.0	Act of Valor
41	130000000.0	60522097.0	After Earth
42	2000000.0	40179.0	Aftershock
43	35000000.0	25863915.0	Alex Cross
49	9000000.0	6262942.0	All Is Lost
62	130000000.0	180191634.0	Ant-Man
67	27000000.0	15988876.0	Armored
71	65000000.0	19548064.0	Astro Boy
75	237000000.0	760505847.0	Avatar
76	250000000.0	458991599.0	Avengers: Age of Ultron
83	70000000.0	83552429.0	Battle: Los Angeles
85	209000000.0	65173160.0	Battleship
98	165000000.0	222487711.0	Big Hero 6
100	32000000.0	37911876.0	Big Mommas: Like Father, Like Son
114	28000000.0	20285518.0	Brick Mansions
120	55000000.0	9483821.0	Bullet to the Head
125	140000000.0	176636816.0	Captain America: The First Avenger
126	170000000.0	259746958.0	Captain America: The Winter Soldier
132	85000000.0	43575716.0	Cats & Dogs: The Revenge of Kitty Galore
135	49000000.0	31569268.0	Chappie
143	125000000.0	163192114.0	Clash of the Titans
150	40000000.0	36665854.0	Colombiana
153	90000000.0	21270904.0	Conan the Barbarian
157	25000000.0	66489425.0	Contraband
160	37000000.0	44867349.0	Cop Out
166	163000000.0	100215116.0	Cowboys & Aliens
167	20000000.0	13630226.0	Crank: High Voltage
176	25000000.0	25615792.0	Dance Flick
182	170000000.0	208543795.0	Dawn of the Planet of the Apes
183	20000000.0	29975979.0	Daybreakers
184	30000000.0	10880926.0	Dead Man Down
197	30000000.0	115646235.0	District 9
212	20000000.0	1183354.0	Dylan Dog: Dead of Night
216	NaN	500154.0	Echelon Conspiracy
217	115000000.0	93050117.0	Elysium
220	110000000.0	61656849.0	Ender's Game
226	50000000.0	25121291.0	Escape Plan
231	140000000.0	65007045.0	Exodus: Gods and Kings
237	85000000.0	155022220.0	Fast & Furious
238	160000000.0	238673370.0	Fast & Furious 6
239	125000000.0	209805005.0	Fast Five
241	NaN	23036320.0	Fighting
258	52000000.0	23324666.0	From Paris with Love
263	190000000.0	350034110.0	Furious 7
265	150000000.0	119420252.0	G-Force
266	50000000.0	20488579.0	Gamer
267	60000000.0	45996718.0	Gangster Squad
272	18000000.0	10494494.0	Getaway
273	57000000.0	51774002.0	Ghost Rider: Spirit of Vengeance
275	175000000.0	150167630.0	G.I. Joe: The Rise of Cobra
280	160000000.0	200661309.0	Godzilla
283	200000000.0	116593191.0	Green Lantern
284	100000000.0	35024475.0	Green Zone
289	170000000.0	333130696.0	Guardians of the Galaxy
292	30000000.0	40247512.0	Hanna
299	23000000.0	18934858.0	Haywire
304	42000000.0	45290318.0	Here Comes the Boom
307	2000000.0	13746550.0	Hit and Run
308	30000000.0	10134754.0	Hoodwinked Too! Hood vs. Evil
312	35000000.0	34507079.0	Hot Pursuit
317	145000000.0	176997107.0	How to Train Your Dragon 2
321	60000000.0	55092830.0	I Am Number Four
323	65000000.0	19059018.0	I, Frankenstein
332	90000000.0	196573705.0	Ice Age: Dawn of the Dinosaurs
...	...	...	...
558	130000000.0	33592415.0	R.I.P.D.
561	200000000.0	105219735.0	Robin Hood
562	100000000.0	58607007.0	RoboCop
564	50000000.0	26442251.0	Run All Night
566	35000000.0	10499968.0	Sabotage
567	30000000.0	17120019.0	Safe
568	85000000.0	126149655.0	Safe House
571	110000000.0	118311368.0	Salt
573	110000000.0	155181732.0	San Andreas
579	60000000.0	31494270.0	Scott Pilgrim vs. the World
581	40000000.0	24823283.0	Season of the Witch
585	95000000.0	17176900.0	Seventh Son
589	90000000.0	209019489.0	Sherlock Holmes
601	10000000.0	21371425.0	Skyline
604	15000000.0	42919096.0	Snitch
620	65000000.0	110822419.0	Spy
622	150000000.0	257704099.0	Star Trek
625	35000000.0	183125.0	Stolen
627	25000000.0	10324441.0	Straw Dogs
628	18000000.0	8742261.0	Street Fighter: The Legend of Chun-Li
629	82000000.0	36381716.0	Sucker Punch
631	80000000.0	38542418.0	Surrogates
634	45000000.0	139852971.0	Taken 2
635	20000000.0	57744720.0	Takers
642	155000000.0	89732035.0	Terminator Genisys
643	200000000.0	125320003.0	Terminator Salvation
651	230000000.0	262030663.0	The Amazing Spider-Man
652	200000000.0	202853933.0	The Amazing Spider-Man 2
655	220000000.0	623279547.0	The Avengers
664	80000000.0	94822707.0	The Book of Eli
666	8000000.0	10269307.0	The Boondock Saints II: All Saints Day
667	125000000.0	113165635.0	The Bourne Legacy
677	20000000.0	3749061.0	The Cold Light of Day
688	250000000.0	448130642.0	The Dark Knight Rises
689	30000000.0	21426805.0	The Darkest Hour
696	25000000.0	19478384.0	The Eagle
697	55000000.0	101530738.0	The Equalizer
698	100000000.0	85017401.0	The Expendables 2
699	90000000.0	39292022.0	The Expendables 3
714	120000000.0	98780042.0	The Green Hornet
715	25000000.0	51533608.0	The Grey
716	40000000.0	10640645.0	The Gunman
720	43000000.0	159578352.0	The Heat
724	40000000.0	26616999.0	The Host
726	78000000.0	407999255.0	The Hunger Games
727	130000000.0	424645577.0	The Hunger Games: Catching Fire
733	50000000.0	25450527.0	The International
737	40000000.0	176591618.0	The Karate Kid
745	45000000.0	12026670.0	The Last Stand
755	15000000.0	15608545.0	The Man with the Iron Fists
758	34000000.0	102413606.0	The Maze Runner
759	40000000.0	29113588.0	The Mechanic
763	60000000.0	31165421.0	The Mortal Instruments: City of Bones
771	100000000.0	119219978.0	The Other Guys
783	9000000.0	71519230.0	The Purge: Anarchy
802	150000000.0	63143812.0	The Sorcerer's Apprentice
804	28000000.0	24268828.0	The Spy Next Door
808	75000000.0	20315324.0	The Three Musketeers
810	100000000.0	67631157.0	The Tourist
829	65000000.0	54758461.0	This Means War
830	150000000.0	181015141.0	Thor
831	190000000.0	93417865.0	Tomorrowland
835	75000000.0	78009155.0	Tower Heist
839	195000000.0	352358779.0	Transformers: Dark of the Moon
840	200000000.0	402076689.0	Transformers: Revenge of the Fallen
842	170000000.0	172051787.0	Tron: Legacy
847	35000000.0	45802315.0	Underworld: Rise of the Lycans
850	30000000.0	61094903.0	Unknown
851	100000000.0	81557479.0	Unstoppable
855	30000000.0	7791146.0	Vampire Academy
864	130000000.0	107503316.0	Watchmen
877	35000000.0	10268846.0	Whiteout
883	190000000.0	202351611.0	World War Z
885	150000000.0	83640426.0	Wrath of the Titans
887	200000000.0	233914986.0	X-Men: Days of Future Past
888	160000000.0	146405371.0	X-Men: First Class
889	150000000.0	179883016.0	X-Men Origins: Wolverine