Pandas is Crazy


In [1]:
import pandas as pd

In [2]:
%%time
cast = pd.DataFrame.from_csv('../data/intro/cast.csv.gz', index_col=None)


Wall time: 8.27 s

In [ ]:


In [3]:
cast.head()


Out[3]:
title year name type character n
0 Suuri illusioni 1985 Homo $ actor Guests 22.0
1 Battle of the Sexes 2017 $hutter actor Bobby Riggs Fan 10.0
2 Secret in Their Eyes 2015 $hutter actor 2002 Dodger Fan NaN
3 Steve Jobs 2015 $hutter actor 1988 Opera House Patron NaN
4 Straight Outta Compton 2015 $hutter actor Club Patron NaN

In [ ]:


In [4]:
titles = cast[['title', 'year']].drop_duplicates().reset_index(drop=True)
titles.head()


Out[4]:
title year
0 Suuri illusioni 1985
1 Battle of the Sexes 2017
2 Secret in Their Eyes 2015
3 Steve Jobs 2015
4 Straight Outta Compton 2015

In [ ]:


In [5]:
# What is the name and year of the very first movie ever made?

titles.sort_values('year').head(1)


Out[5]:
title year
130478 Miss Jerry 1894

In [ ]:


In [6]:
# How many years into the future does the IMDB database list movie titles?

titles.sort_values('year').tail(3)


Out[6]:
title year
66884 Clutch Control 2022
165691 Avatar 4 2022
34974 New Rebellion 2023

In [ ]:


In [7]:
# How many movies listed in `titles` came out in 1950?

len(titles[titles.year == 1950])

# or: (titles.year == 1950).sum()


Out[7]:
901

In [ ]:


In [8]:
# What are the 15 most common movie titles in film history?

titles.title.value_counts().head(15)


Out[8]:
Hamlet                       17
Macbeth                      15
Carmen                       13
The Outsider                 11
Maya                         11
She                          10
Temptation                   10
Anna Karenina                10
Othello                      10
The Three Musketeers          9
Vengeance                     9
Rage                          9
A Midsummer Night's Dream     9
Blood Money                   9
Bad Blood                     9
Name: title, dtype: int64

In [ ]:


In [9]:
# How many movies has Leonardo DiCaprio acted in?

leo = cast[cast.name == 'Leonardo DiCaprio']
leo


Out[9]:
title year name type character n
553812 Blood Diamond 2006 Leonardo DiCaprio actor Danny Archer 1.0
553813 Body of Lies 2008 Leonardo DiCaprio actor Roger Ferris 1.0
553814 Catch Me If You Can 2002 Leonardo DiCaprio actor Frank Abagnale Jr. 1.0
553815 Celebrity 1998 Leonardo DiCaprio actor Brandon Darrow 93.0
553816 Critters 3 1991 Leonardo DiCaprio actor Josh 7.0
553817 Django Unchained 2012 Leonardo DiCaprio actor Calvin Candie 3.0
553818 Don's Plum 2001 Leonardo DiCaprio actor Derek 4.0
553819 Gangs of New York 2002 Leonardo DiCaprio actor Amsterdam Vallon 1.0
553820 Inception 2010 Leonardo DiCaprio actor Cobb 1.0
553821 J. Edgar 2011 Leonardo DiCaprio actor J. Edgar Hoover 1.0
553822 Marvin's Room 1996 Leonardo DiCaprio actor Hank 2.0
553823 Model Culture: One + Night in Bangkok 2009 Leonardo DiCaprio actor Himself NaN
553824 Poison Ivy 1992 Leonardo DiCaprio actor Guy 9.0
553825 Revolutionary Road 2008 Leonardo DiCaprio actor Frank Wheeler 2.0
553826 Romeo + Juliet 1996 Leonardo DiCaprio actor Romeo 1.0
553827 Shutter Island 2010 Leonardo DiCaprio actor Teddy Daniels 1.0
553828 The Aviator 2004 Leonardo DiCaprio actor Howard Hughes 1.0
553829 The Basketball Diaries 1995 Leonardo DiCaprio actor Jim Carroll 1.0
553830 The Beach 2000 Leonardo DiCaprio actor Richard 1.0
553831 The Departed 2006 Leonardo DiCaprio actor Billy 1.0
553832 The Great Gatsby 2013 Leonardo DiCaprio actor Jay Gatsby 12.0
553833 The Man in the Iron Mask 1998 Leonardo DiCaprio actor King Louis XIV 1.0
553834 The Man in the Iron Mask 1998 Leonardo DiCaprio actor Philippe 1.0
553835 The Quick and the Dead 1995 Leonardo DiCaprio actor Kid 4.0
553836 The Revenant 2015 Leonardo DiCaprio actor Hugh Glass 1.0
553837 The Wolf of Wall Street 2013 Leonardo DiCaprio actor Jordan Belfort 1.0
553838 This Boy's Life 1993 Leonardo DiCaprio actor Toby 3.0
553839 Titanic 1997 Leonardo DiCaprio actor Jack Dawson 1.0
553840 Total Eclipse 1995 Leonardo DiCaprio actor Arthur Rimbaud 1.0
553841 What's Eating Gilbert Grape 1993 Leonardo DiCaprio actor Arnie Grape 2.0

In [10]:
len(leo)


Out[10]:
30

In [ ]:


In [11]:
# What movies has Leo acted in major roles (1-3)?

leo[cast.n <= 3]


C:\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  app.launch_new_instance()
Out[11]:
title year name type character n
553812 Blood Diamond 2006 Leonardo DiCaprio actor Danny Archer 1.0
553813 Body of Lies 2008 Leonardo DiCaprio actor Roger Ferris 1.0
553814 Catch Me If You Can 2002 Leonardo DiCaprio actor Frank Abagnale Jr. 1.0
553817 Django Unchained 2012 Leonardo DiCaprio actor Calvin Candie 3.0
553819 Gangs of New York 2002 Leonardo DiCaprio actor Amsterdam Vallon 1.0
553820 Inception 2010 Leonardo DiCaprio actor Cobb 1.0
553821 J. Edgar 2011 Leonardo DiCaprio actor J. Edgar Hoover 1.0
553822 Marvin's Room 1996 Leonardo DiCaprio actor Hank 2.0
553825 Revolutionary Road 2008 Leonardo DiCaprio actor Frank Wheeler 2.0
553826 Romeo + Juliet 1996 Leonardo DiCaprio actor Romeo 1.0
553827 Shutter Island 2010 Leonardo DiCaprio actor Teddy Daniels 1.0
553828 The Aviator 2004 Leonardo DiCaprio actor Howard Hughes 1.0
553829 The Basketball Diaries 1995 Leonardo DiCaprio actor Jim Carroll 1.0
553830 The Beach 2000 Leonardo DiCaprio actor Richard 1.0
553831 The Departed 2006 Leonardo DiCaprio actor Billy 1.0
553833 The Man in the Iron Mask 1998 Leonardo DiCaprio actor King Louis XIV 1.0
553834 The Man in the Iron Mask 1998 Leonardo DiCaprio actor Philippe 1.0
553836 The Revenant 2015 Leonardo DiCaprio actor Hugh Glass 1.0
553837 The Wolf of Wall Street 2013 Leonardo DiCaprio actor Jordan Belfort 1.0
553838 This Boy's Life 1993 Leonardo DiCaprio actor Toby 3.0
553839 Titanic 1997 Leonardo DiCaprio actor Jack Dawson 1.0
553840 Total Eclipse 1995 Leonardo DiCaprio actor Arthur Rimbaud 1.0
553841 What's Eating Gilbert Grape 1993 Leonardo DiCaprio actor Arnie Grape 2.0

In [12]:
leo_major_roles = cast[(cast.name == 'Leonardo DiCaprio') & (cast.n <= 3)]

In [13]:
# And only in recent (>=2010) years?

In [14]:
leo_major_roles[cast.year >= 2010].sort_values('year')


C:\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  if __name__ == '__main__':
Out[14]:
title year name type character n
553820 Inception 2010 Leonardo DiCaprio actor Cobb 1.0
553827 Shutter Island 2010 Leonardo DiCaprio actor Teddy Daniels 1.0
553821 J. Edgar 2011 Leonardo DiCaprio actor J. Edgar Hoover 1.0
553817 Django Unchained 2012 Leonardo DiCaprio actor Calvin Candie 3.0
553837 The Wolf of Wall Street 2013 Leonardo DiCaprio actor Jordan Belfort 1.0
553836 The Revenant 2015 Leonardo DiCaprio actor Hugh Glass 1.0

In [ ]:


In [15]:
# What are the 5 longest movie titles ever?

pd.set_option('max_colwidth', 300)

t = titles.copy()
t['len'] = t.title.str.len()
t = t.sort_values('len', ascending=False)
t.head()


Out[15]:
title year len
174310 Night of the Day of the Dawn of the Son of the Bride of the Return of the Revenge of the Terror of the Attack of the Evil Mutant Hellbound Flesh Eating Crawling Alien Zombified Subhumanoid Living Dead, Part 5 2011 208
180077 Night of the Day of the Dawn of the Son of the Bride of the Return of the Revenge of the Terror of the Attack of the Evil, Mutant, Hellbound, Flesh-Eating Subhumanoid Zombified Living Dead, Part 3 2005 196
12993 Brigitte, Laura, Ursula, Monica, Raquel, Litz, Florinda, Barbara, Claudia, e Sofia le chiamo tutte... anima mia 1974 111
150746 Film d'amore e d'anarchia, ovvero 'stamattina alle 10 in via dei Fiori nella nota casa di tolleranza...' 1973 104
25811 Those Magnificent Men in Their Flying Machines or How I Flew from London to Paris in 25 hours 11 minutes 1965 104

In [ ]:


In [16]:
# Plot the number of films that have been released each decade over the history of cinema.

%matplotlib inline

In [17]:
t = titles
(t.year // 10 * 10).value_counts().sort_index().plot(kind='bar')


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x2808d278>

In [ ]:


In [ ]: