Explore Movie Dataset


In [1]:
import os
import pandas as pd
import settings
import etl

%matplotlib inline

%load_ext watermark
%watermark -d -t -v -m -p pea,pandas


2017-06-29 08:29:53 

CPython 3.6.1
IPython 6.1.0

pea 0.0.7
pandas 0.20.2

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 7
machine    : AMD64
processor  : Intel64 Family 6 Model 42 Stepping 7, GenuineIntel
CPU cores  : 8
interpreter: 64bit

In [2]:
# Build the project's ETL data container and load it. Later cells read the
# movie DataFrame from `data.movie` (presumably populated by `load()` --
# confirm against the `etl` module).
data = etl.Data()
data.load()

Available Columns


In [3]:
# List every column available on the movie DataFrame.
data.movie.columns


Out[3]:
Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

Add Calculations to etl


In [4]:
# Show each column's dtype to see which fields are numeric.
data.movie.dtypes


Out[4]:
color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object

In [5]:
# Net result per movie: gross minus budget (NaN whenever either side is missing).
# NOTE(review): some budgets look like they are in local currency rather than
# USD, which would skew `net` for those rows -- confirm.
data.movie['net'] = data.movie['gross'].sub(data.movie['budget'])

In [6]:
# Rank movies from the largest budget down and show only the money columns.
(
    data.movie
    .sort_values(by='budget', ascending=False)
    .loc[:, ['movie_title', 'title_year', 'budget', 'gross', 'net']]
)


Out[6]:
movie_title title_year budget gross net
2988 The Host 2006.0 1.221550e+10 2201412.0 -1.221330e+10
3859 Lady Vengeance 2005.0 4.200000e+09 211667.0 -4.199788e+09
3005 Fateless 2005.0 2.500000e+09 195888.0 -2.499804e+09
2323 Princess Mononoke 1997.0 2.400000e+09 2298191.0 -2.397702e+09
2334 Steamboy 2004.0 2.127520e+09 410388.0 -2.127110e+09
3423 Akira 1988.0 1.100000e+09 439162.0 -1.099561e+09
4542 Godzilla 2000 1999.0 1.000000e+09 10037390.0 -9.899626e+08
3075 Kabhi Alvida Naa Kehna 2006.0 7.000000e+08 3275443.0 -6.967246e+08
3851 Tango 1998.0 7.000000e+08 1687311.0 -6.983127e+08
3273 Kites 2010.0 6.000000e+08 1602466.0 -5.983975e+08
1338 Red Cliff 2008.0 5.536320e+08 626809.0 -5.530052e+08
3311 The Legend of Suriyothai 2001.0 4.000000e+08 454255.0 -3.995457e+08
1016 The Messenger: The Story of Joan of Arc 1999.0 3.900000e+08 14131298.0 -3.758687e+08
2740 Ong-bak 2 2008.0 3.000000e+08 102055.0 -2.998979e+08
1 Pirates of the Caribbean: At World's End 2007.0 3.000000e+08 309404152.0 9.404152e+06
5 John Carter 2012.0 2.637000e+08 73058679.0 -1.906413e+08
7 Tangled 2010.0 2.600000e+08 200807262.0 -5.919274e+07
6 Spider-Man 3 2007.0 2.580000e+08 336530303.0 7.853030e+07
3461 Spider-Man 3 2007.0 2.580000e+08 336530303.0 7.853030e+07
10 Batman v Superman: Dawn of Justice 2016.0 2.500000e+08 330249062.0 8.024906e+07
9 Harry Potter and the Half-Blood Prince 2009.0 2.500000e+08 301956980.0 5.195698e+07
8 Avengers: Age of Ultron 2015.0 2.500000e+08 458991599.0 2.089916e+08
18 Pirates of the Caribbean: On Stranger Tides 2011.0 2.500000e+08 241063875.0 -8.936125e+06
20 The Hobbit: The Battle of the Five Armies 2014.0 2.500000e+08 255108370.0 5.108370e+06
3 The Dark Knight Rises 2012.0 2.500000e+08 448130642.0 1.981306e+08
27 Captain America: Civil War 2016.0 2.500000e+08 407197282.0 1.571973e+08
2 Spectre 2015.0 2.450000e+08 200074175.0 -4.492582e+07
0 Avatar 2009.0 2.370000e+08 760505847.0 5.235058e+08
21 The Amazing Spider-Man 2012.0 2.300000e+08 262030663.0 3.203066e+07
15 Man of Steel 2013.0 2.250000e+08 291021565.0 6.602156e+07
... ... ... ... ... ...
4902 3 2010.0 NaN 59774.0 NaN
4909 Baghead 2008.0 NaN 140016.0 NaN
4910 Solitude 2014.0 NaN NaN NaN
4912 Ordet 1955.0 NaN NaN NaN
4913 Good Dick 2008.0 NaN 15542.0 NaN
4919 H. 2014.0 NaN NaN NaN
4927 The Calling 2014.0 NaN NaN NaN
4934 When the Lights Went Out 2012.0 NaN NaN NaN
4935 Heroes of Dirt 2015.0 NaN NaN NaN
4944 Sound of My Voice 2011.0 NaN 405614.0 NaN
4960 The Mighty 1998.0 NaN 2643689.0 NaN
4967 Open Secret 1948.0 NaN NaN NaN
4969 The Night Visitor 1971.0 NaN NaN NaN
4974 I Love You, Don't Touch Me! 1997.0 NaN 33598.0 NaN
4982 Supporting Characters 2012.0 NaN NaN NaN
4985 The Dirties 2013.0 NaN NaN NaN
4986 Gabriela 1983.0 NaN NaN NaN
4989 The Naked Ape 2006.0 NaN NaN NaN
5001 The Last Waltz 1978.0 NaN 321952.0 NaN
5003 The Exploding Girl 2009.0 NaN 24705.0 NaN
5005 Mutual Appreciation 2005.0 NaN NaN NaN
5007 Down Terrace 2009.0 NaN 9609.0 NaN
5010 Funny Ha Ha 2002.0 NaN NaN NaN
5014 Rampage 2009.0 NaN NaN NaN
5019 Exeter 2015.0 NaN NaN NaN
5030 On the Downlow 2004.0 NaN NaN NaN
5032 Bang 1995.0 NaN NaN NaN
5038 Signed Sealed Delivered 2013.0 NaN NaN NaN
5039 The Following NaN NaN NaN NaN
5041 Shanghai Calling 2012.0 NaN 10443.0 NaN

5043 rows × 5 columns

Plotting with IPlotter

This example uses my own branch of IPlotter, which builds the chart dictionary directly from a pandas DataFrame. That is much less verbose, but the same charts can also be produced with the current version on PyPI.


In [7]:
from iplotter import C3Plotter

In [8]:
# C3 plotter instance reused by the C3 charts in the cells below.
c3 = C3Plotter()

Time series of mean gross, net, and budget by title year


In [9]:
# Yearly mean of gross, net, and budget, matching the section heading (the
# previous `.min()` plotted yearly minimums instead). The three columns are
# selected before aggregating so only the plotted data is reduced. Years with
# no values for a column are filled with 0 so the chart has no gaps.
plot_data = (
    data.movie
    .groupby('title_year')[['gross', 'net', 'budget']]
    .mean()
    .fillna(0)
)
c3.plot(plot_data, zoom=True)


Out[9]:

In [10]:
# Mean IMDB score per country. Select `imdb_score` *before* aggregating:
# calling `.mean()` on the whole grouped frame averages every column, which
# is wasteful and fails on recent pandas because of the string columns.
country_group = data.movie.groupby('country')['imdb_score'].mean()

# Parallel plain-Python lists (score, country name) for the choropleth trace.
values = country_group.values.tolist()
countries = country_group.index.values.tolist()

In [11]:
from iplotter import PlotlyPlotter
from IPython.display import HTML

plotly = PlotlyPlotter()

# Second C3 plotter instance; not used by any cell in this notebook.
c3_plotter = C3Plotter()

# Choropleth trace: one entry per country, shaded by its mean IMDB score.
# `countries` and `values` are the parallel lists built in the previous cell.
plotly_chart = [{
    "type": 'choropleth',
    "locationmode": 'country names',
    "locations": countries,
    "z": values,
    "zmin": 0,
    "zmax": max(values),
    "colorscale": [
        [0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
        [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
        [0.8, 'rgb(117,107,177)'], [1, 'rgb(84,39,143)']
    ],
    "colorbar": {
        # `z` holds mean scores, not movie counts, so label it accordingly.
        "title": 'Mean IMDB score',
        "thickness": 10
    },
    "marker": {
        "line": {
            "color": 'rgb(255,255,255)',
            "width": 2
        }
    }
}]

plotly_layout = {
    "title": 'Mean IMDB Score by Country',
    "geo": {
        # `scope` selects the map region; 'country names' is a locationmode
        # value and is not a valid scope.
        "scope": 'world',
    }
}

# Pass the layout explicitly -- it was previously built but never used.
country_plot = plotly.plot(data=plotly_chart, layout=plotly_layout)

Movies by Country

{{ country_plot }}


In [12]:
# IMDB score as a Series indexed by each movie's budget.
data.movie.set_index('budget')['imdb_score']


Out[12]:
budget
 237000000.0    7.9
 300000000.0    7.1
 245000000.0    6.8
 250000000.0    8.5
NaN             7.1
 263700000.0    6.6
 258000000.0    6.2
 260000000.0    7.8
 250000000.0    7.5
 250000000.0    7.5
 250000000.0    6.9
 209000000.0    6.1
 200000000.0    6.7
 225000000.0    7.3
 215000000.0    6.5
 225000000.0    7.2
 225000000.0    6.6
 220000000.0    8.1
 250000000.0    6.7
 225000000.0    6.8
 250000000.0    7.5
 230000000.0    7.0
 200000000.0    6.7
 225000000.0    7.9
 180000000.0    6.1
 207000000.0    7.2
 200000000.0    7.7
 250000000.0    8.2
 209000000.0    5.9
 150000000.0    7.0
               ... 
 24000.0        7.0
NaN             6.3
 23000.0        7.1
 25000.0        4.8
 22000.0        3.3
 20000.0        6.9
NaN             4.6
 17350.0        3.0
 15000.0        6.6
 15000.0        7.4
 15000.0        6.2
 20000.0        4.0
 10000.0        6.1
 4500.0         6.9
 10000.0        7.5
 10000.0        6.7
 1000000.0      7.4
NaN             6.1
 200000.0       5.4
NaN             6.4
 7000.0         7.0
 7000.0         6.3
 7000.0         6.9
 3250.0         7.8
 9000.0         6.4
NaN             7.7
NaN             7.5
 1400.0         6.3
NaN             6.3
 1100.0         6.6
Name: imdb_score, Length: 5043, dtype: float64

In [13]:
# Net result indexed by the director's Facebook likes, drawn as a zoomable
# scatter chart. Despite its name, this frame holds `net`, not a score, and
# is keyed by director likes, not budget.
score_by_budget = data.movie.set_index('director_facebook_likes')[['net']]
c3.plot(score_by_budget, zoom=True, kind='scatter')


Out[13]:

In [14]:
from ipywidgets import interact, interactive, fixed, interact_manual

In [15]:
def f(country):
    """Scatter-plot directors of one country: summed Facebook likes vs. summed gross.

    Filters the movie DataFrame to rows whose ``country`` equals ``country``,
    groups them by ``director_name``, sums ``director_facebook_likes`` and
    ``gross`` per director, and shows the result as a matplotlib scatter plot.

    Parameters
    ----------
    country : str
        Value to match against the ``country`` column.
    """
    # Imported locally so the function does not rely on a later cell having
    # already imported pyplot into the notebook namespace.
    import matplotlib.pyplot as plt

    country_movies = data.movie[data.movie['country'] == country]
    per_director = country_movies.groupby('director_name').agg(
        {'director_facebook_likes': 'sum', 'gross': 'sum'}
    )
    per_director.plot(
        kind='scatter',
        x='director_facebook_likes',
        y='gross',
        title=str(country),  # label the figure with the selected country
    )
    plt.show()

In [16]:
import matplotlib.pyplot as plt

In [17]:
# Dropdown with every distinct, non-missing country (in order of first
# appearance); selecting one re-runs `f`. The trailing `;` hides the repr.
interact(
    f,
    country=data.movie['country'].dropna().unique().tolist(),
);



In [ ]: