In [1]:
%matplotlib inline
import pandas as pd

In [2]:
from IPython.display import HTML


def _read_text(path):
    """Return the full text of the file at `path`, closing it afterwards."""
    with open(path) as handle:
        return handle.read()


# Concatenate the table and notebook stylesheets and inject them into the
# page as a single <style> element. Reading through `_read_text` ensures the
# file handles are closed instead of being left for the garbage collector.
css = _read_text('style-table.css') + _read_text('style-notebook.css')
HTML('<style>{}</style>'.format(css))


Out[2]:

In [3]:
# Load the film titles table (columns: title, year).
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv` is the supported replacement. index_col=None keeps the
# default integer index, exactly as the original call requested.
titles = pd.read_csv('data/titles.csv', index_col=None)
titles.head()


Out[3]:
title year
0 Tomorrow Ends at Dawn 2002
1 Brothers of the West 1937
2 Nemo 1984
3 Pereezd 2014
4 Bad for Business 2007

In [4]:
# Load the cast table (columns: title, year, name, type, character, n).
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv` is the supported replacement. index_col=None keeps the
# default integer index, exactly as the original call requested.
cast = pd.read_csv('data/cast.csv', index_col=None)
cast.head()


Out[4]:
title year name type character n
0 Suuri illusioni 1985 Homo $ actor Guests 22
1 Gangsta Rap: The Glockumentary 2007 Too $hort actor Himself NaN
2 Menace II Society 1993 Too $hort actor Lew-Loc 27
3 Porndogs: The Adventures of Sadie 2009 Too $hort actor Bosco 3
4 Stop Pepper Palmer 2014 Too $hort actor Himself NaN

In [5]:
# The ten movie titles that occur most often in the titles table.

titles['title'].value_counts().iloc[:10]


Out[5]:
Hamlet                  19
Carmen                  13
Macbeth                 12
The Three Musketeers    12
She                     11
Maya                    11
The Outsider            11
Karma                   10
Anna Karenina           10
Temptation              10
dtype: int64

In [6]:
# The three years of the 1930s with the most film releases.

# A year belongs to the 1930s when its integer decade prefix is 193.
in_1930s = titles['year'].floordiv(10).eq(193)
t = titles[in_1930s]
t['year'].value_counts().iloc[:3]


Out[6]:
1937    1180
1936    1116
1938    1115
dtype: int64

In [7]:
# Bar chart of how many films were released in each decade
# across the whole titles table.

t = titles
# Round every year down to the first year of its decade (e.g. 1937 -> 1930).
decade = t['year'].floordiv(10).mul(10)
decade.value_counts().sort_index().plot(kind='bar')


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275b321160>

In [8]:
# Bar chart of how many films titled "Hamlet" were made in each decade.

t = titles[titles['title'].eq('Hamlet')]
# Round every year down to the first year of its decade before counting.
decade = t['year'].floordiv(10).mul(10)
decade.value_counts().sort_index().plot(kind='bar')


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275ce86e48>

In [9]:
# Bar chart of how many roles credited as "Rustler" appear
# in each decade covered by the cast table.

c = cast[cast['character'].eq('Rustler')]
# Round every year down to the first year of its decade before counting.
decade = c['year'].floordiv(10).mul(10)
decade.value_counts().sort_index().plot(kind='bar')


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275ce0f710>

In [10]:
# Bar chart of how many roles credited as "Hamlet" appear in each decade.

c = cast[cast['character'].eq('Hamlet')]
# Round every year down to the first year of its decade before counting.
decade = c['year'].floordiv(10).mul(10)
decade.value_counts().sort_index().plot(kind='bar')


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275cda26d8>

In [11]:
# The 11 character names that occur most often in the cast table.

cast['character'].value_counts().iloc[:11]


Out[11]:
Himself        18630
Dancer         10864
Extra           8474
Reporter        7483
Doctor          6643
Policeman       6399
Student         6158
Nurse           6039
Bartender       5986
Party Guest     5620
Zombie          5573
dtype: int64

In [12]:
# The 10 people with the most credits whose character is exactly "Herself".

c = cast
# Select the performer names for rows credited as "Herself", then rank them.
herself_names = c.loc[c['character'] == 'Herself', 'name']
herself_names.value_counts().iloc[:10]


Out[12]:
Joyce Brothers         14
Queen Elizabeth II     10
Margaret Thatcher       8
Joan Rivers             7
Mary Jo Pehl            7
Kareena Kapoor          5
Chris Evert             5
Sally Jessy Raphael     5
Caroline Rhea           5
Marilyn Monroe          5
dtype: int64

In [13]:
# The 10 people with the most credits whose character is exactly "Himself".

c = cast
# Select the performer names for rows credited as "Himself", then rank them.
himself_names = c.loc[c['character'] == 'Himself', 'name']
himself_names.value_counts().iloc[:10]


Out[13]:
Adolf Hitler             91
Richard Nixon            38
Ronald Reagan            28
John F. Kennedy          26
Ron Jeremy               22
George W. Bush           20
Franklin D. Roosevelt    20
Bill Clinton             20
Winston Churchill        20
Martin Luther King       19
dtype: int64

In [14]:
# The ten performers with the most cast entries in films dated 1945.

cast.loc[cast['year'] == 1945, 'name'].value_counts().iloc[:10]


Out[14]:
Emmett Vogan       39
Sam (II) Harris    30
Bess Flowers       28
Harold Miller      28
Nolan Leary        27
Frank O'Connor     26
Franklyn Farnum    24
Edmund Cobb        24
Tom London         24
Pierre Watkin      24
dtype: int64

In [15]:
# The ten performers with the most cast entries in films dated 1985.

cast.loc[cast['year'] == 1985, 'name'].value_counts().iloc[:10]


Out[15]:
Shakti Kapoor       19
Mammootty           17
Sukumari            16
Lou Scheimer        15
Aruna Irani         14
Rajesh Khanna       13
Mohanlal            13
Deven Verma         13
Raj Babbar          13
Satyendra Kapoor    12
dtype: int64

In [16]:
# Line plot of the number of roles credited to Mammootty in each year.

mammootty_years = cast.loc[cast['name'] == 'Mammootty', 'year']
mammootty_years.value_counts().sort_index().plot()


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275ccfb240>

In [17]:
# What are the 10 most frequent roles that start with the phrase "Patron in"?

c = cast
# na=False treats rows with a missing character name as non-matches. Without
# it, `str.startswith` yields NaN for those rows and boolean indexing with a
# mask that contains NaN raises instead of filtering.
is_patron = c['character'].str.startswith('Patron in ', na=False)
c.loc[is_patron, 'character'].value_counts().head(10)


Out[17]:
Patron in Frisky Rabbit         16
Patron in the Coffee House       9
Patron in Chinese Restaurant     9
Patron in Billiard Parlor        5
Patron in Bar                    4
Patron in cabaret                3
Patron in restaurant             3
Patron in Restaurant             3
Patron in Club                   3
Patron in Coffee Shop            2
dtype: int64

In [18]:
# What are the 10 most frequent roles that start with the word "Science"?

c = cast
# na=False treats rows with a missing character name as non-matches. Without
# it, `str.startswith` yields NaN for those rows and boolean indexing with a
# mask that contains NaN raises instead of filtering.
is_science = c['character'].str.startswith('Science', na=False)
c.loc[is_science, 'character'].value_counts().head(10)


Out[18]:
Science Teacher         53
Science Fair Student     8
Science Student          7
Science Fair Judge       6
Science Club Member      5
Science Reporter         5
Science Kid              4
Science Promo Cadet      4
Science Officer          3
Science teacher          3
dtype: int64

In [19]:
# Plot the n-values of the roles that Judi Dench has played over her career.

c = cast
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is its
# replacement and orders the rows by year in the same ascending order.
c = c[c['name'] == 'Judi Dench'].sort_values('year')
# Drop roles with no n-value, since they cannot be placed on the y axis.
c = c[c['n'].notnull()]
c.plot(x='year', y='n', kind='scatter')


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275cbfc048>

In [20]:
# Plot the n-values of Cary Grant's roles through his career.

c = cast
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is its
# replacement and orders the rows by year in the same ascending order.
c = c[c['name'] == 'Cary Grant'].sort_values('year')
# Drop roles with no n-value, since they cannot be placed on the y axis.
c = c[c['n'].notnull()]
c.plot(x='year', y='n', kind='scatter')


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275cc36160>

In [21]:
# Plot the n-value of the roles that Sidney Poitier has acted
# over the years.

c = cast
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is its
# replacement and orders the rows by year in the same ascending order.
c = c[c['name'] == 'Sidney Poitier'].sort_values('year')
# Drop roles with no n-value, since they cannot be placed on the y axis.
c = c[c['n'].notnull()]
c.plot(x='year', y='n', kind='scatter')


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f275cbac908>

In [22]:
# Count the leading (n=1) roles of the 1950s, split by the `type`
# column (actor vs. actress).

c = cast
# Keep rows whose year has decade prefix 195 and whose n-value equals 1.
is_1950s_lead = c['year'].floordiv(10).eq(195) & c['n'].eq(1)
c = c[is_1950s_lead]
c['type'].value_counts()


Out[22]:
actor      6349
actress    2789
dtype: int64

In [23]:
# Count the supporting (n=2) roles of the 1950s, split by the `type`
# column (actor vs. actress).

c = cast
# Keep rows whose year has decade prefix 195 and whose n-value equals 2.
is_1950s_support = c['year'].floordiv(10).eq(195) & c['n'].eq(2)
c = c[is_1950s_support]
c['type'].value_counts()


Out[23]:
actress    4375
actor      4354
dtype: int64

In [ ]: