In [1]:
%matplotlib inline
import pandas as pd
In [2]:
# Inject the notebook's custom CSS (table styling + notebook styling) by
# rendering a <style> element as this cell's rich output.
# `HTML` is imported from the public `IPython.display` module.
from IPython.display import HTML

# Read each stylesheet inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
css_parts = []
for css_path in ('style-table.css', 'style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)

# Last expression of the cell: displayed as HTML, which applies the styles.
HTML('<style>{}</style>'.format(css))
Out[2]:
In [3]:
# Load the titles table from CSV and preview its first five rows.
titles = pd.read_csv('data/titles.csv')
titles.head(5)
Out[3]:
In [4]:
# Load the cast table from CSV and preview its first five rows.
cast = pd.read_csv('data/cast.csv')
cast.head(5)
Out[4]:
In [5]:
# How many movies are listed in the titles dataframe?
# The first entry of `.shape` is the row count.
titles.shape[0]
Out[5]:
In [6]:
# What are the earliest two films listed in the titles dataframe?
# Sort ascending by year, then keep the first two rows.
titles.sort_values(by='year').iloc[:2]
Out[6]:
In [7]:
# How many movies have the title "Hamlet"?
# Select the exact-title matches and report how many rows remain.
titles.loc[titles['title'] == 'Hamlet'].shape[0]
Out[7]:
In [8]:
# How many movies are titled "North by Northwest"?
# Select the exact-title matches and report how many rows remain.
titles.loc[titles['title'] == 'North by Northwest'].shape[0]
Out[8]:
In [9]:
# When was the first movie titled "Hamlet" made?
# Filter to the title, order by year, and keep only the earliest row.
(
    titles
    .loc[titles['title'] == 'Hamlet']
    .sort_values(by='year')
    .iloc[:1]
)
Out[9]:
In [10]:
# List all of the "Treasure Island" movies from earliest to most recent.
# Filter to the title, then order the matches by ascending year.
(
    titles
    .loc[titles['title'] == 'Treasure Island']
    .sort_values(by='year')
)
Out[10]:
In [11]:
# How many movies were made in the year 1950?
# Keep only rows whose year equals 1950 and report the row count.
titles.loc[titles['year'] == 1950].shape[0]
Out[11]:
In [12]:
# How many movies were made in the year 1960?
# Keep only rows whose year equals 1960 and report the row count.
titles.loc[titles['year'] == 1960].shape[0]
Out[12]:
In [13]:
# How many movies were made from 1950 through 1959?
# `between` is inclusive on both ends, so it covers 1950..1959.
t = titles
len(t[t.year.between(1950, 1959)])
Out[13]:
In [14]:
# Alternative way to count the movies of the 1950s: floor-dividing the year
# by 10 maps every year from 1950 through 1959 to 195.
# Bind `t` here so this cell does not depend on another cell having run first.
t = titles
len(t[t.year // 10 == 195])
Out[14]:
In [15]:
# In what years has a movie titled "Batman" been released?
# Show every matching row; the `year` column of the result answers the question.
t = titles
t.loc[t['title'] == 'Batman']
Out[15]:
In [16]:
# How many roles were there in the movie "Inception"?
# Each row of the cast table is one credited role, so count the matching rows.
c = cast
c.loc[c['title'] == 'Inception'].shape[0]
Out[16]:
In [17]:
# How many roles in the movie "Inception" are NOT ranked by an "n" value?
# Combine both conditions into one boolean mask: right title AND missing "n".
c = cast[(cast.title == 'Inception') & cast.n.isna()]
len(c)
Out[17]:
In [18]:
# But how many roles in the movie "Inception" did receive an "n" value?
# Combine both conditions into one boolean mask: right title AND "n" present.
c = cast[(cast.title == 'Inception') & cast.n.notna()]
len(c)
Out[18]:
In [19]:
# Display the cast of "North by Northwest" in their correct "n"-value order,
# ignoring roles that did not earn a numeric "n" value.
# One mask selects the film's ranked roles; the result is then sorted by "n".
c = (
    cast[(cast.title == 'North by Northwest') & cast.n.notna()]
    .sort_values(by='n')
)
c
Out[19]:
In [20]:
# Display the entire cast, in "n"-order, of the 1972 film "Sleuth".
# Title and year together identify the film; rows are then ordered by "n".
c = (
    cast[(cast.title == 'Sleuth') & (cast.year == 1972)]
    .sort_values(by='n')
)
c
Out[20]:
In [21]:
# Now display the entire cast, in "n"-order, of the 2007 version of "Sleuth".
# Title and year together identify the film; rows are then ordered by "n".
c = (
    cast[(cast.title == 'Sleuth') & (cast.year == 2007)]
    .sort_values(by='n')
)
c
Out[21]:
In [22]:
# How many roles were credited in the silent 1921 version of Hamlet?
# Narrow the cast table step by step: first by title, then by release year.
c = cast
c = c[c.title == 'Hamlet']
c = c[c.year == 1921]
len(c)
Out[22]:
In [23]:
# How many roles were credited in Branagh’s 1996 Hamlet?
# Narrow the cast table step by step: first by title, then by release year.
c = cast
c = c[c.title == 'Hamlet']
c = c[c.year == 1996]
len(c)
Out[23]:
In [24]:
# How many "Hamlet" roles have been listed in all film credits through history?
# Match on the character name (not the film title) and count the rows.
c = cast[cast['character'] == 'Hamlet']
c.shape[0]
Out[24]:
In [25]:
# How many people have played an "Ophelia"?
c = cast[cast['character'] == 'Ophelia']
len(c)  # not the cell's last expression, so this value is not displayed
# That was my original answer.
# But 9peppe on GitHub points out I am wrong! Instead:
len(cast.loc[cast.character == "Ophelia", 'name'].drop_duplicates())
# Can you work out why the two answers are different?
Out[25]:
In [26]:
# How many people have played a role called "The Dude"?
# The question asks about people, so count distinct names rather than rows:
# a person credited with this character more than once would otherwise be
# counted once per credit.
c = cast
c = c[c.character == 'The Dude']
len(c.name.unique())
Out[26]:
In [27]:
# How many people have played a role called "The Stranger"?
# Take the names attached to that character, drop repeats, and count what is left.
c = cast
len(c.loc[c['character'] == 'The Stranger', 'name'].drop_duplicates())
Out[27]:
In [28]:
# How many roles has Sidney Poitier played throughout his career?
# Every cast row carrying his name is one role.
c = cast[cast['name'] == 'Sidney Poitier']
c.shape[0]
Out[28]:
In [29]:
# How many roles has Judi Dench played?
# Every cast row carrying her name is one role.
c = cast[cast['name'] == 'Judi Dench']
c.shape[0]
Out[29]:
In [30]:
# List the supporting roles (having n=2) played by Cary Grant in the 1940s,
# in order by year.
# Floor-dividing the year by 10 yields 194 for every year from 1940 to 1949.
c = (
    cast[(cast.name == 'Cary Grant') & (cast.year // 10 == 194) & (cast.n == 2)]
    .sort_values(by='year')
)
c
Out[30]:
In [31]:
# List the leading roles that Cary Grant played in the 1940s in order by year.
# Floor-dividing the year by 10 yields 194 for every year from 1940 to 1949;
# n == 1 selects the leading role.
c = (
    cast[(cast.name == 'Cary Grant') & (cast.year // 10 == 194) & (cast.n == 1)]
    .sort_values(by='year')
)
c
Out[31]:
In [32]:
# How many roles were available for actors in the 1950s?
# One mask: decade is the 1950s (year // 10 == 195) AND the credit type is 'actor'.
c = cast[(cast.year // 10 == 195) & (cast['type'] == 'actor')]
len(c)
Out[32]:
In [33]:
# How many roles were available for actresses in the 1950s?
# One mask: decade is the 1950s (year // 10 == 195) AND the credit type is 'actress'.
c = cast[(cast.year // 10 == 195) & (cast['type'] == 'actress')]
len(c)
Out[33]:
In [34]:
# How many leading roles (n=1) were available
# from the beginning of film history through 1980?
# One mask: released in or before 1980 AND ranked first in the credits.
c = cast[(cast.year <= 1980) & (cast.n == 1)]
len(c)
Out[34]:
In [35]:
# How many non-leading roles were available
# from the beginning of film history through 1980?
# One mask: released in or before 1980 AND "n" is anything other than 1.
# Rows whose "n" is missing also satisfy `n != 1`, so they are counted here.
c = cast[(cast.year <= 1980) & (cast.n != 1)]
len(c)
Out[35]:
In [36]:
# How many roles through 1980 were minor enough
# that they did not warrant a numeric "n" rank?
# One mask: released in or before 1980 AND the "n" value is missing.
c = cast[(cast.year <= 1980) & cast.n.isna()]
len(c)
Out[36]:
In [ ]: