In [1]:
import pandas as pd
In [2]:
%%time
cast = pd.DataFrame.from_csv('../data/intro/cast.csv.gz', index_col=None)
In [ ]:
In [3]:
# Preview the first five rows to see which columns the cast table provides.
cast.head(n=5)
Out[3]:
In [ ]:
In [4]:
# Collapse the cast table to one row per distinct (title, year) pair and
# renumber the rows from zero so the index is contiguous again.
titles = (
    cast.loc[:, ['title', 'year']]
    .drop_duplicates()
    .reset_index(drop=True)
)
titles.head()
Out[4]:
In [ ]:
In [5]:
# What is the name and year of the very first movie ever made?
# Order the titles by release year (ascending) and keep only the top row.
titles.sort_values(by='year').iloc[:1]
Out[5]:
In [ ]:
In [6]:
# How many years into the future does the IMDB database list movie titles?
# Order by release year (ascending) and show the last three rows, i.e. the
# titles with the largest `year` values.
titles.sort_values(by='year').iloc[-3:]
Out[6]:
In [ ]:
In [7]:
# How many movies listed in `titles` came out in 1950?
# Keep only the rows whose year is 1950 and report the row count.
# Equivalent spellings: len(titles[titles.year == 1950])
#                   or: (titles.year == 1950).sum()
titles.loc[titles['year'] == 1950].shape[0]
Out[7]:
In [ ]:
In [8]:
# What are the 15 most common movie titles in film history?
# `value_counts` orders titles from most to least frequent, so the first
# fifteen entries are the answer.
titles['title'].value_counts().iloc[:15]
Out[8]:
In [ ]:
In [9]:
# How many movies has Leonardo DiCaprio acted in?
# Select every cast row credited to him; the frame is reused by later cells.
leo = cast.loc[cast['name'] == 'Leonardo DiCaprio']
leo
Out[9]:
In [10]:
# Number of rows in `leo`, i.e. how many cast entries were selected above.
leo.shape[0]
Out[10]:
In [ ]:
In [11]:
# What movies has Leo acted in major roles (1-3)?
# Build the mask from `leo` itself so its index matches the frame being
# filtered.  Masking with `cast.n <= 3` evaluates the condition over the whole
# cast table and makes pandas reindex the mask, emitting a
# "Boolean Series key will be reindexed to match DataFrame index" UserWarning.
leo[leo.n <= 3]
Out[11]:
In [12]:
# Leonardo DiCaprio's credits where his billing position `n` is 3 or lower,
# selected directly from the full cast table.
leo_major_roles = cast.loc[
    cast['name'].eq('Leonardo DiCaprio') & cast['n'].le(3)
]
In [13]:
# And only in recent (>=2010) years?
In [14]:
# Restrict Leo's major roles to 2010 onwards, oldest first.
# The mask is computed on `leo_major_roles` (not on `cast`) so that it is
# aligned with the frame it filters; a mask built from the full cast table
# triggers pandas' "Boolean Series key will be reindexed" UserWarning.
leo_major_roles[leo_major_roles.year >= 2010].sort_values('year')
Out[14]:
In [ ]:
In [15]:
# What are the 5 longest movie titles ever?
# Widen the column display limit so long titles are not truncated in the output.
pd.set_option('max_colwidth', 300)
# Work on a new frame (leaving `titles` untouched) that carries each title's
# character count in a `len` column, ordered from longest to shortest.
t = (
    titles
    .assign(len=titles['title'].str.len())
    .sort_values(by='len', ascending=False)
)
t.head()
Out[15]:
In [ ]:
In [16]:
# Plot the number of films that have been released each decade over the history of cinema.
%matplotlib inline
In [17]:
t = titles
# Floor each release year to the start of its decade (e.g. 1957 -> 1950),
# count titles per decade, order the decades chronologically, and draw the
# counts as a bar chart.
(t['year'] // 10 * 10).value_counts().sort_index().plot.bar()
Out[17]:
In [ ]:
In [ ]: