Explore Movie Dataset



In [1]:

    
import os
import pandas as pd
import settings
import etl

%matplotlib inline

%load_ext watermark
%watermark -d -t -v -m -p pea,pandas









    



2017-06-29 08:29:53 

CPython 3.6.1
IPython 6.1.0

pea 0.0.7
pandas 0.20.2

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 7
machine    : AMD64
processor  : Intel64 Family 6 Model 42 Stepping 7, GenuineIntel
CPU cores  : 8
interpreter: 64bit



In [2]:

    
# Build the project's etl.Data container and populate it via load().
# Later cells read the movie table through `data.movie`.
# NOTE(review): what load() reads is defined in the local `etl` module, not shown here.
data = etl.Data()
data.load()

Available Columns



In [3]:

    
# Show the column labels of the movie table.
data.movie.columns









    Out[3]:





Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

Add Calculations to etl



In [4]:

    
# Show the dtype of every column to see which ones are numeric.
data.movie.dtypes









    Out[4]:





color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object



In [5]:

    
# Derived column: net = gross - budget (NaN wherever either input is missing).
# NOTE(review): the largest budgets in the table below look like local-currency
# amounts rather than USD - confirm before reading `net` as profit.
data.movie['net'] = data.movie['gross'].sub(data.movie['budget'])



In [6]:

    
# Title, year, budget, gross and net for every film, largest budget first.
(
    data.movie[['movie_title', 'title_year', 'budget', 'gross', 'net']]
    .sort_values('budget', ascending=False)
)









    Out[6]:







  
    
      
      movie_title
      title_year
      budget
      gross
      net
    
  
  
    
      2988
      The Host
      2006.0
      1.221550e+10
      2201412.0
      -1.221330e+10
    
    
      3859
      Lady Vengeance
      2005.0
      4.200000e+09
      211667.0
      -4.199788e+09
    
    
      3005
      Fateless
      2005.0
      2.500000e+09
      195888.0
      -2.499804e+09
    
    
      2323
      Princess Mononoke
      1997.0
      2.400000e+09
      2298191.0
      -2.397702e+09
    
    
      2334
      Steamboy
      2004.0
      2.127520e+09
      410388.0
      -2.127110e+09
    
    
      3423
      Akira
      1988.0
      1.100000e+09
      439162.0
      -1.099561e+09
    
    
      4542
      Godzilla 2000
      1999.0
      1.000000e+09
      10037390.0
      -9.899626e+08
    
    
      3075
      Kabhi Alvida Naa Kehna
      2006.0
      7.000000e+08
      3275443.0
      -6.967246e+08
    
    
      3851
      Tango
      1998.0
      7.000000e+08
      1687311.0
      -6.983127e+08
    
    
      3273
      Kites
      2010.0
      6.000000e+08
      1602466.0
      -5.983975e+08
    
    
      1338
      Red Cliff
      2008.0
      5.536320e+08
      626809.0
      -5.530052e+08
    
    
      3311
      The Legend of Suriyothai
      2001.0
      4.000000e+08
      454255.0
      -3.995457e+08
    
    
      1016
      The Messenger: The Story of Joan of Arc
      1999.0
      3.900000e+08
      14131298.0
      -3.758687e+08
    
    
      2740
      Ong-bak 2
      2008.0
      3.000000e+08
      102055.0
      -2.998979e+08
    
    
      1
      Pirates of the Caribbean: At World's End
      2007.0
      3.000000e+08
      309404152.0
      9.404152e+06
    
    
      5
      John Carter
      2012.0
      2.637000e+08
      73058679.0
      -1.906413e+08
    
    
      7
      Tangled
      2010.0
      2.600000e+08
      200807262.0
      -5.919274e+07
    
    
      6
      Spider-Man 3
      2007.0
      2.580000e+08
      336530303.0
      7.853030e+07
    
    
      3461
      Spider-Man 3
      2007.0
      2.580000e+08
      336530303.0
      7.853030e+07
    
    
      10
      Batman v Superman: Dawn of Justice
      2016.0
      2.500000e+08
      330249062.0
      8.024906e+07
    
    
      9
      Harry Potter and the Half-Blood Prince
      2009.0
      2.500000e+08
      301956980.0
      5.195698e+07
    
    
      8
      Avengers: Age of Ultron
      2015.0
      2.500000e+08
      458991599.0
      2.089916e+08
    
    
      18
      Pirates of the Caribbean: On Stranger Tides
      2011.0
      2.500000e+08
      241063875.0
      -8.936125e+06
    
    
      20
      The Hobbit: The Battle of the Five Armies
      2014.0
      2.500000e+08
      255108370.0
      5.108370e+06
    
    
      3
      The Dark Knight Rises
      2012.0
      2.500000e+08
      448130642.0
      1.981306e+08
    
    
      27
      Captain America: Civil War
      2016.0
      2.500000e+08
      407197282.0
      1.571973e+08
    
    
      2
      Spectre
      2015.0
      2.450000e+08
      200074175.0
      -4.492582e+07
    
    
      0
      Avatar
      2009.0
      2.370000e+08
      760505847.0
      5.235058e+08
    
    
      21
      The Amazing Spider-Man
      2012.0
      2.300000e+08
      262030663.0
      3.203066e+07
    
    
      15
      Man of Steel
      2013.0
      2.250000e+08
      291021565.0
      6.602156e+07
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      4902
      3
      2010.0
      NaN
      59774.0
      NaN
    
    
      4909
      Baghead
      2008.0
      NaN
      140016.0
      NaN
    
    
      4910
      Solitude
      2014.0
      NaN
      NaN
      NaN
    
    
      4912
      Ordet
      1955.0
      NaN
      NaN
      NaN
    
    
      4913
      Good Dick
      2008.0
      NaN
      15542.0
      NaN
    
    
      4919
      H.
      2014.0
      NaN
      NaN
      NaN
    
    
      4927
      The Calling
      2014.0
      NaN
      NaN
      NaN
    
    
      4934
      When the Lights Went Out
      2012.0
      NaN
      NaN
      NaN
    
    
      4935
      Heroes of Dirt
      2015.0
      NaN
      NaN
      NaN
    
    
      4944
      Sound of My Voice
      2011.0
      NaN
      405614.0
      NaN
    
    
      4960
      The Mighty
      1998.0
      NaN
      2643689.0
      NaN
    
    
      4967
      Open Secret
      1948.0
      NaN
      NaN
      NaN
    
    
      4969
      The Night Visitor
      1971.0
      NaN
      NaN
      NaN
    
    
      4974
      I Love You, Don't Touch Me!
      1997.0
      NaN
      33598.0
      NaN
    
    
      4982
      Supporting Characters
      2012.0
      NaN
      NaN
      NaN
    
    
      4985
      The Dirties
      2013.0
      NaN
      NaN
      NaN
    
    
      4986
      Gabriela
      1983.0
      NaN
      NaN
      NaN
    
    
      4989
      The Naked Ape
      2006.0
      NaN
      NaN
      NaN
    
    
      5001
      The Last Waltz
      1978.0
      NaN
      321952.0
      NaN
    
    
      5003
      The Exploding Girl
      2009.0
      NaN
      24705.0
      NaN
    
    
      5005
      Mutual Appreciation
      2005.0
      NaN
      NaN
      NaN
    
    
      5007
      Down Terrace
      2009.0
      NaN
      9609.0
      NaN
    
    
      5010
      Funny Ha Ha
      2002.0
      NaN
      NaN
      NaN
    
    
      5014
      Rampage
      2009.0
      NaN
      NaN
      NaN
    
    
      5019
      Exeter
      2015.0
      NaN
      NaN
      NaN
    
    
      5030
      On the Downlow
      2004.0
      NaN
      NaN
      NaN
    
    
      5032
      Bang
      1995.0
      NaN
      NaN
      NaN
    
    
      5038
      Signed Sealed Delivered
      2013.0
      NaN
      NaN
      NaN
    
    
      5039
      The Following
      NaN
      NaN
      NaN
      NaN
    
    
      5041
      Shanghai Calling
      2012.0
      NaN
      10443.0
      NaN
    
  

5043 rows × 5 columns

Plotting with IPlotter

This example uses my own branch of IPlotter, which builds the chart dictionary directly from a pandas DataFrame. That is much less verbose, but the same charts can also be produced with the current version on PyPI.



In [7]:

    
from iplotter import C3Plotter



In [8]:

    
# C3 plotter instance used by the `c3.plot(...)` chart cells below.
c3 = C3Plotter()

Timeseries of mean gross



In [9]:

    
# Mean gross, net and budget per release year (this section is the time series
# of *mean* values, so aggregate with mean rather than min).
# The three columns are selected before aggregating so that text columns are
# never fed to the reducer. Years with no value for a column are drawn as 0.
# Requires the `net` column added to data.movie in an earlier cell.
plot_data = (
    data.movie
    .groupby('title_year')[['gross', 'net', 'budget']]
    .mean()
    .fillna(0)
)
c3.plot(plot_data, zoom=True)









    Out[9]:



In [10]:

    
# Mean IMDb score per country.
# Select the column before aggregating: averaging the whole frame first also
# tries to average the text columns, which is wasted work and raises a
# TypeError on recent pandas versions.
country_group = data.movie.groupby('country')['imdb_score'].mean()
# Parallel lists (same order) consumed by the choropleth cell.
values = country_group.tolist()
countries = country_group.index.tolist()



In [11]:

    
from iplotter import PlotlyPlotter
from IPython.display import HTML

plotly = PlotlyPlotter()

c3_plotter = C3Plotter()

# Single choropleth trace: each country (matched by name) is shaded by the
# mean IMDb score computed in the previous cell (`countries` / `values`).
plotly_chart = [{
    "type": 'choropleth',
    "locationmode": 'country names',
    "locations": countries,
    "z": values,
    "zmin": 0,
    "zmax": max(values),
    "colorscale": [
        [0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'],
        [0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],
        [0.8, 'rgb(117,107,177)'], [1, 'rgb(84,39,143)']
    ],
    "colorbar": {
        # z holds mean scores, not counts, so label the colour bar accordingly.
        "title": 'Mean IMDb Score',
        "thickness": 10
    },
    "marker": {
        "line": {
            "color": 'rgb(255,255,255)',
            "width": 2
        }
    }
}]

plotly_layout = {
    "title": 'Mean IMDb Score by Country',
    "geo": {
        # 'country names' is a locationmode value, not a geo scope; the data
        # spans all countries, so use the whole-world scope.
        "scope": 'world',
    }
}

# Pass the layout along with the trace; previously it was built but never used.
# NOTE(review): assumes PlotlyPlotter.plot accepts a `layout` keyword in the
# IPlotter branch used here - confirm.
country_plot = plotly.plot(data=plotly_chart, layout=plotly_layout)

Movies by Country



In [12]:

    
# IMDb score of every film, indexed by that film's budget.
data.movie.set_index('budget')['imdb_score']









    Out[12]:





budget
 237000000.0    7.9
 300000000.0    7.1
 245000000.0    6.8
 250000000.0    8.5
NaN             7.1
 263700000.0    6.6
 258000000.0    6.2
 260000000.0    7.8
 250000000.0    7.5
 250000000.0    7.5
 250000000.0    6.9
 209000000.0    6.1
 200000000.0    6.7
 225000000.0    7.3
 215000000.0    6.5
 225000000.0    7.2
 225000000.0    6.6
 220000000.0    8.1
 250000000.0    6.7
 225000000.0    6.8
 250000000.0    7.5
 230000000.0    7.0
 200000000.0    6.7
 225000000.0    7.9
 180000000.0    6.1
 207000000.0    7.2
 200000000.0    7.7
 250000000.0    8.2
 209000000.0    5.9
 150000000.0    7.0
               ... 
 24000.0        7.0
NaN             6.3
 23000.0        7.1
 25000.0        4.8
 22000.0        3.3
 20000.0        6.9
NaN             4.6
 17350.0        3.0
 15000.0        6.6
 15000.0        7.4
 15000.0        6.2
 20000.0        4.0
 10000.0        6.1
 4500.0         6.9
 10000.0        7.5
 10000.0        6.7
 1000000.0      7.4
NaN             6.1
 200000.0       5.4
NaN             6.4
 7000.0         7.0
 7000.0         6.3
 7000.0         6.9
 3250.0         7.8
 9000.0         6.4
NaN             7.7
NaN             7.5
 1400.0         6.3
NaN             6.3
 1100.0         6.6
Name: imdb_score, Length: 5043, dtype: float64



In [13]:

    
# Scatter of each film's net (gross - budget) against its director's Facebook likes.
# NOTE(review): despite its name, this frame holds `net` indexed by
# `director_facebook_likes`, not score by budget.
score_by_budget = (
    data.movie[['director_facebook_likes', 'net']]
    .set_index('director_facebook_likes')
)
c3.plot(score_by_budget, kind='scatter', zoom=True)









    Out[13]:



In [14]:

    
from ipywidgets import interact, interactive, fixed, interact_manual



In [15]:

    
def f(country):
    """Scatter total gross against total director Facebook likes for one country.

    Rows of ``data.movie`` whose ``country`` equals the argument are grouped by
    ``director_name``. ``director_facebook_likes`` and ``gross`` are summed per
    director, and each director is drawn as one point.

    Parameters
    ----------
    country : str
        Value compared for equality against the ``country`` column.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    country_movies = data.movie[data.movie['country'] == country]
    per_director = country_movies.groupby(['director_name']).agg(
        {'director_facebook_likes': 'sum', 'gross': 'sum'}
    )
    ax = per_director.plot(kind='scatter', x='director_facebook_likes', y='gross')
    # Label the figure with the selected country so it is readable on its own.
    ax.set_title('Gross vs. director Facebook likes: {}'.format(country))
    # `plt` is imported in a later cell; it is only looked up when f is called.
    plt.show()



In [16]:

    
import matplotlib.pyplot as plt



In [17]:

    
# Dropdown of every distinct, non-null country (in order of first appearance);
# choosing one calls f with it. The trailing ';' suppresses the widget repr.
interact(f, country=data.movie['country'].dropna().unique().tolist());



In [ ]:

	movie_title	title_year	budget	gross	net
2988	The Host	2006.0	1.221550e+10	2201412.0	-1.221330e+10
3859	Lady Vengeance	2005.0	4.200000e+09	211667.0	-4.199788e+09
3005	Fateless	2005.0	2.500000e+09	195888.0	-2.499804e+09
2323	Princess Mononoke	1997.0	2.400000e+09	2298191.0	-2.397702e+09
2334	Steamboy	2004.0	2.127520e+09	410388.0	-2.127110e+09
3423	Akira	1988.0	1.100000e+09	439162.0	-1.099561e+09
4542	Godzilla 2000	1999.0	1.000000e+09	10037390.0	-9.899626e+08
3075	Kabhi Alvida Naa Kehna	2006.0	7.000000e+08	3275443.0	-6.967246e+08
3851	Tango	1998.0	7.000000e+08	1687311.0	-6.983127e+08
3273	Kites	2010.0	6.000000e+08	1602466.0	-5.983975e+08
1338	Red Cliff	2008.0	5.536320e+08	626809.0	-5.530052e+08
3311	The Legend of Suriyothai	2001.0	4.000000e+08	454255.0	-3.995457e+08
1016	The Messenger: The Story of Joan of Arc	1999.0	3.900000e+08	14131298.0	-3.758687e+08
2740	Ong-bak 2	2008.0	3.000000e+08	102055.0	-2.998979e+08
1	Pirates of the Caribbean: At World's End	2007.0	3.000000e+08	309404152.0	9.404152e+06
5	John Carter	2012.0	2.637000e+08	73058679.0	-1.906413e+08
7	Tangled	2010.0	2.600000e+08	200807262.0	-5.919274e+07
6	Spider-Man 3	2007.0	2.580000e+08	336530303.0	7.853030e+07
3461	Spider-Man 3	2007.0	2.580000e+08	336530303.0	7.853030e+07
10	Batman v Superman: Dawn of Justice	2016.0	2.500000e+08	330249062.0	8.024906e+07
9	Harry Potter and the Half-Blood Prince	2009.0	2.500000e+08	301956980.0	5.195698e+07
8	Avengers: Age of Ultron	2015.0	2.500000e+08	458991599.0	2.089916e+08
18	Pirates of the Caribbean: On Stranger Tides	2011.0	2.500000e+08	241063875.0	-8.936125e+06
20	The Hobbit: The Battle of the Five Armies	2014.0	2.500000e+08	255108370.0	5.108370e+06
3	The Dark Knight Rises	2012.0	2.500000e+08	448130642.0	1.981306e+08
27	Captain America: Civil War	2016.0	2.500000e+08	407197282.0	1.571973e+08
2	Spectre	2015.0	2.450000e+08	200074175.0	-4.492582e+07
0	Avatar	2009.0	2.370000e+08	760505847.0	5.235058e+08
21	The Amazing Spider-Man	2012.0	2.300000e+08	262030663.0	3.203066e+07
15	Man of Steel	2013.0	2.250000e+08	291021565.0	6.602156e+07
...	...	...	...	...	...
4902	3	2010.0	NaN	59774.0	NaN
4909	Baghead	2008.0	NaN	140016.0	NaN
4910	Solitude	2014.0	NaN	NaN	NaN
4912	Ordet	1955.0	NaN	NaN	NaN
4913	Good Dick	2008.0	NaN	15542.0	NaN
4919	H.	2014.0	NaN	NaN	NaN
4927	The Calling	2014.0	NaN	NaN	NaN
4934	When the Lights Went Out	2012.0	NaN	NaN	NaN
4935	Heroes of Dirt	2015.0	NaN	NaN	NaN
4944	Sound of My Voice	2011.0	NaN	405614.0	NaN
4960	The Mighty	1998.0	NaN	2643689.0	NaN
4967	Open Secret	1948.0	NaN	NaN	NaN
4969	The Night Visitor	1971.0	NaN	NaN	NaN
4974	I Love You, Don't Touch Me!	1997.0	NaN	33598.0	NaN
4982	Supporting Characters	2012.0	NaN	NaN	NaN
4985	The Dirties	2013.0	NaN	NaN	NaN
4986	Gabriela	1983.0	NaN	NaN	NaN
4989	The Naked Ape	2006.0	NaN	NaN	NaN
5001	The Last Waltz	1978.0	NaN	321952.0	NaN
5003	The Exploding Girl	2009.0	NaN	24705.0	NaN
5005	Mutual Appreciation	2005.0	NaN	NaN	NaN
5007	Down Terrace	2009.0	NaN	9609.0	NaN
5010	Funny Ha Ha	2002.0	NaN	NaN	NaN
5014	Rampage	2009.0	NaN	NaN	NaN
5019	Exeter	2015.0	NaN	NaN	NaN
5030	On the Downlow	2004.0	NaN	NaN	NaN
5032	Bang	1995.0	NaN	NaN	NaN
5038	Signed Sealed Delivered	2013.0	NaN	NaN	NaN
5039	The Following	NaN	NaN	NaN	NaN
5041	Shanghai Calling	2012.0	NaN	10443.0	NaN