In [1]:
%matplotlib inline
import pandas as pd
idx = pd.IndexSlice
In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))
Out[2]:
In [3]:
%%time
cast = pd.DataFrame.from_csv('data/cast.csv', index_col=None)
In [4]:
cast.head()
Out[4]:
In [5]:
%%time
release_dates = pd.read_csv('data/release_dates.csv', index_col=None,
parse_dates=['date'], infer_datetime_format=True)
In [6]:
release_dates.head()
Out[6]:
In [7]:
titles = cast[['title', 'year']].drop_duplicates().reset_index(drop=True)
titles.head()
Out[7]:
In [8]:
# 1. How many movies are listed in the `titles` dataframe?
len(titles)
Out[8]:
In [9]:
# 1. What is the name and year of the very first movie ever made?
titles.sort('year').head(1)
Out[9]:
In [10]:
# 1. How many years into the future does the IMDB database list movie titles?
titles.sort('year').tail(3)#.year - 2015
Out[10]:
In [11]:
# 1. How many movies listed in `titles` came out in 1950?
len(titles[titles.year == 1950])
# or: (titles.year == 1950).sum()
Out[11]:
In [12]:
# 1. How many movies came out in 1960?
len(titles[titles.year == 1960])
Out[12]:
In [13]:
# 1. How many movies came out in each year of the 1970s?
# (Hint: try a Python "for" loop.)
for y in range(1970, 1980):
print(y, (titles.year == y).sum())
In [14]:
# 1. How many movies came out during your own lifetime,
# from the year of your birth through 2014?
len(titles[(titles.year >= 1974) & (titles.year <= 2014)])
Out[14]:
In [15]:
# 2. Use "value_counts" to determine how many movies came out
# in each year of the 1970s.
titles[titles.year // 10 == 197].year.value_counts().sort_index()
Out[15]:
In [16]:
# 3. Use "groupby" to determine how many movies came out in each year of the 1970s.
titles.groupby('year').size().loc[1970:1979]
Out[16]:
In [ ]:
In [17]:
# 1. What are the names of the movies made through 1906?
titles[titles.year <= 1906][['title']]
Out[17]:
In [18]:
# 1. What movies have titles that fall between Star Trek and Star Wars in the alphabet?
titles[(titles.title >= 'Star Trek') & (titles.title <= 'Star Wars')]
Out[18]:
In [19]:
# 2. Use an index and .loc[] to find the movies whose titles fall between Star Trek
# and Star Wars in the alphabet.
t = titles.copy()
t = t.set_index('title').sort_index()
t.loc['Star Trek':'Star Wars']
Out[19]:
In [20]:
# 2. Use an index and .loc[] to retrieve the names of the movies made through 1906.
titles.set_index('year').sort_index().loc[1800:1906]
Out[20]:
In [21]:
# 2. What are the 15 most common movie titles in film history?
titles.title.value_counts().head(15)
Out[21]:
In [22]:
# Use this for session 3?
i = cast.set_index('name').sort_index()
In [23]:
a = i.loc['Cary Grant',['year','n']].groupby('year').agg(['min', 'mean', 'max'])
a.loc[:1942].plot(kind='area', stacked=False)
a
Out[23]:
In [24]:
# 5. What are the 5 longest movie titles ever?
pd.set_option('max_colwidth', 300)
t = titles.copy()
t['len'] = t.title.str.len()
t = t.sort('len', ascending=False)
t.head()
Out[24]:
In [25]:
# 5. What are the 15 most popular movie titles, if you strip off the suffixes like
# (II) and (III) that the IMDB adds to distinguish movies shown in the same year?
titles.title.str.extract('^([^(]*)').value_counts().head(15)
Out[25]:
In [26]:
# 1. How many movies has Judi Dench acted in?
len(cast[cast.name == 'Judi Dench'])
Out[26]:
In [27]:
# 1. How many movies did Sidney Poitier appear in?
c = cast
c = c[c.name == 'Sidney Poitier']
len(c)
Out[27]:
In [28]:
# 1. In how many of his movies was Sidney Poitier the lead (`n==1`)?
c = cast
c = c[c.name == 'Sidney Poitier']
c = c[c.n == 1]
len(c)
Out[28]:
In [29]:
# 1. List the movies, sorted by year, in which Judi Dench starred as lead actor.
c = cast
c = c[c.name == 'Judi Dench']
c = c[c.n == 1]
c.sort('year')
Out[29]:
In [30]:
# 1. Who was credited in the 1972 version of Sleuth, in order by `n` rank?
c = cast
c = c[c.title == 'Sleuth']
c = c[c.year == 1972]
c.sort('n')
Out[30]:
In [31]:
# 2. What are the 11 most common character names in movie history?
cast.character.value_counts().head(11)
Out[31]:
In [32]:
# 3. Which actors have played the role “Zombie” the most times?
c = cast
c = c[c.character == 'Zombie']
c = c.groupby('name').size().order()
c.tail(5)
Out[32]:
In [33]:
# 3. Which ten people have appeared most often as “Herself” over the history of film?
c = cast
c = c[c.character == 'Herself']
c = c.groupby('name').size().order()
c.tail(10)
Out[33]:
In [34]:
# 3. Which ten people have appeared most often as “Himself” over the history of film?
c = cast
c = c[c.character == 'Himself']
c = c.groupby('name').size().order()
c.tail(10)
Out[34]:
In [35]:
# 4. Take the 50 most common character names in film.
# Which are most often played by men?
c = cast
clist = c.character.value_counts().head(50)
clist.head()
Out[35]:
In [36]:
clist.tail()
Out[36]:
In [37]:
cast_by_character = cast.sort('character').set_index('character')
In [38]:
c = cast_by_character.loc[clist.index][['type']]
c = c.reset_index()
c = c.groupby(['character', 'type']).size()
c = c.unstack()
c['ratio'] = c.actress / (c.actor + c.actress)
c = c.sort('ratio')
c.head()
Out[38]:
In [39]:
# 4. …which of those 50 characters are most often played by women?
c.tail()
Out[39]:
In [40]:
# 4. …which of those 50 characters have a ratio closest to 0.5?
c[(c.ratio > 0.4) & (c.ratio < 0.6)]
Out[40]:
In [ ]:
In [41]:
# 2. Which actors or actresses appeared in the most movies in the year 1945?
cast[cast.year == 1945].name.value_counts().head(10)
Out[41]:
In [42]:
# 2. Which actors or actresses appeared in the most movies in the year 1985?
cast[cast.year == 1985].name.value_counts().head(10)
Out[42]:
In [43]:
%%time
# 2. Create a `cast_by_title_year` dataframe indexed by title and year
# to use in the next few questions.
cast_by_title_year = cast.set_index(['title', 'year']).sort_index()
cast_by_title_year.head()
In [44]:
%%time
# 2. Use `cast_by_title_year` to find the stars of the film Inception
# and order them by `n` before displaying the top 10.
cast_by_title_year.loc['Inception'].sort('n').head(10)
Out[44]:
In [45]:
# 2. Use `cast_by_title_year` to find the first 10 stars in the 1996 film Hamlet,
# and order them by `n`.
cast_by_title_year.loc['Hamlet',1996].sort('n').head(10)
Out[45]:
In [46]:
%%time
# 2. Write a `for` loop that, for the top 9 actors in the 1977 movie Star Wars,
# determines how many movies they starred in after 1977.
names = cast_by_title_year.loc['Star Wars',1977].sort('n').head(9).name
for name in names:
print(name, len(cast[(cast.name == name) & (cast.year > 1977)]))
In [47]:
# 2. Create an indexed version of `cast` that, once built, lets you answer
# the previous question with a `for` loop that finishes in under a second.
i = cast.set_index('name').sort_index()
In [48]:
%%time
for name in names:
c = i.loc[name]
c = c[c.year > 1977]
#c = c[(c.character != 'Himself') & (c.character != 'Herself')]
print(name, len(c))
In [ ]:
In [49]:
# 3. How many people were cast in each of the movies named "Hamlet”?
c = cast
c = c[c.title == 'Hamlet']
c = c.groupby('year').size()
c
Out[49]:
In [ ]:
In [50]:
# 5. How many actors are in the cast of each version of Hamlet,
# including Hamlets with IMDB name collisions like "Hamlet (II)"
# and "Hamlet (III)"? [BAD]
c = cast_by_title_year
# c.loc['Hamlet':'Hamlet (Z'].index.value_counts() - Drat
# c.loc['Hamlet':'Hamlet (Z'].groupby(level=0).size() - Drat
# c.loc['Hamlet':'Hamlet (Z'].groupby(level=1).size() - Drat
c.loc['Hamlet':'Hamlet (Z'].groupby(level=[0,1]).size()
# Or:
#c = cast[(cast.title >= 'Hamlet') & (cast.title < 'Hamlet (Z')]
#c.groupby(['title', 'year']).size()
Out[50]:
In [ ]:
In [51]:
# 4. Build a dataframe with a row for each year with two columns:
# the number of roles for actors in that year's films,
# and the number of roles for actresses.
aa = cast[['year', 'type']].groupby(['year', 'type']).size()
aa = aa.loc[:2014].unstack()
aa.head()
Out[51]:
In [52]:
# 4. Use that dataframe to make a kind='area' plot showing the total
# number of roles available over the history of film.
aa.plot(kind='area')
Out[52]:
In [53]:
f = aa.actor / (aa.actor + aa.actress)
f.plot(ylim=[0,1], kind='area')
Out[53]:
In [54]:
c = cast
#c = c[c.year // 10 == 198]
c = c[c.n <= 3]
c = c.groupby(['year', 'type', 'n']).size()
c = c.unstack(1)
c.swaplevel(0,1).loc[1].plot(ylim=0, kind='area')
#f = c.actor / (c.actor + c.actress)
#f = f.unstack()
#f.plot(ylim=[0,1])
Out[54]:
In [ ]:
In [55]:
# 2. Define “leading actor” as an actor or actress whose `n==1`
# and “supporting actor” as `n==2` — what is the average year
# of all the supporting roles Judi Dench has had?
c = cast
c = c[c.name == 'Judi Dench']
print(c[c.n == 2].year.mean())
In [56]:
# 2. What is the average year of Judi Dench’s leading roles —
# is her career moving forwards toward leading roles
# or backwards towards supporting ones?
print(c[c.n == 1].year.mean())
In [57]:
# 2. Did Sidney Poitier move forward or back over his career?
c = cast
c = c[c.name == 'Sidney Poitier']
print(c[c.n == 2].year.mean())
print(c[c.n == 1].year.mean())
In [58]:
# 2. What about Michael Caine?
c = cast
c = c[c.name == 'Michael Caine']
print(c[c.n == 2].year.mean())
print(c[c.n == 1].year.mean())
In [59]:
c = cast
#c = c[c.year // 10 == 195]
c = c[c.n.notnull()].groupby('name').n.agg(['size', 'mean'])
c.head()
Out[59]:
In [60]:
c = c[c['size'] >= 10]
c = c.sort('mean')
c.head(60)
Out[60]:
In [ ]:
In [61]:
release_dates.head()
Out[61]:
In [166]:
# 5. In which month is a movie whose name starts with the text
# "The Lord of the Rings" most likely to be released?
r = release_dates
r = r[r.title.str.startswith('The Lord of the Rings')]
r = r[r.country == 'USA']
r.date.dt.month.value_counts()
Out[166]:
In [172]:
# 5. In which months is a movie whose name ends in the word "Christmas"
# most likely to be released?
r = release_dates
r = r[r.title.str.endswith('Christmas')]
r = r[r.country == 'USA']
r.date.dt.month.value_counts()
Out[172]:
In [62]:
rd = release_dates.set_index(['title', 'year']).sort_index()
rd.head()
Out[62]:
In [117]:
rd.loc[[('#Beings', 2015), ('#Horror', 2015)]]
Out[117]:
In [124]:
c = cast
c = c[c.name == 'Tom Cruise'][['title', 'year']].drop_duplicates()
#c = c.join(rd, ['title', 'year'])
#c = c[c.country == 'USA']
#c.date.dt.month.value_counts().sort_index().plot(kind='bar')
c.values
Out[124]:
In [128]:
# ASK
# rd.loc[c]
# rd.loc[c.values]
# rd.loc[list(c.values)]
In [138]:
# 5. In what months of the year have Helen Mirren movies been most often released?
c = cast
c = c[c.name == 'Helen Mirren'][['title', 'year']].drop_duplicates()
c = c.join(rd, ['title', 'year'])
c = c[c.country == 'USA']
c.date.dt.month.value_counts().sort_index().plot(kind='bar')
Out[138]:
In [139]:
# 5. …Jeff Bridges movies?
c = cast
c = c[c.name == 'Jeff Bridges'][['title', 'year']].drop_duplicates()
c = c.join(rd, ['title', 'year'])
c = c[c.country == 'USA']
c.date.dt.month.value_counts().sort_index().plot(kind='bar')
Out[139]:
In [141]:
# 5. …Tom Cruise movies?
c = cast
c = c[c.name == 'Tom Cruise'][['title', 'year']].drop_duplicates()
c = c.join(rd, ['title', 'year'])
c = c[c.country == 'USA']
c.date.dt.month.value_counts().sort_index().plot(kind='bar')
Out[141]:
In [155]:
%%time
# 5. Use join() to build a table of release dates indexed by actor,
# and use it to re-run the previous three questions efficiently.
c = cast
c = c[['name', 'title', 'year']]
c = c.join(rd, ['title', 'year'])
c = c[c.country == 'USA']
c = c.set_index('name').sort_index()
releases = c
releases.head()
In [178]:
releases.loc['Tom Cruise'].date.dt.month.value_counts().sort_index().plot(kind='bar')
Out[178]:
In [182]:
# pivot(self, index=None, columns=None, values=None)
In [183]:
cast.head()
Out[183]:
In [205]:
c = cast
c = c[c.year >= 1990]
c = c[c.year <= 1993]
c = c[c.name == 'George Clooney']
#c = c[c.title == 'Inception']
#c = c[c.n.notnull()]
#c = c.pivot('name', 'year', 'title')
c.fillna('')
Out[205]:
In [ ]:
In [206]:
release_dates.head()
Out[206]:
In [218]:
r = release_dates
r = r[r.title.str.startswith('Star Wars: Episode')]
r = r[r.country.str.startswith('U')]
r.pivot('title', 'country', 'date')
#r.pivot('country', 'title', 'date')
Out[218]:
In [227]:
r = release_dates
r = r[r.title.str.startswith('Star Wars: Episode')]
r = r[r.country.str.startswith('U')]
r.set_index(['title', 'country'])[['date']].unstack()
Out[227]:
In [ ]:
In [ ]:
In [228]:
cast.head()
Out[228]:
In [243]:
t = titles
t.head()
Out[243]:
In [246]:
c = cast
c = c[c.title == 'Hamlet']
c = c.set_index(['year', 'character'])#.unstack('type')
c
Out[246]:
In [239]:
c = cast
c = c[c.title == 'Hamlet']
c = c.set_index(['year', 'type'])#.unstack('type')
c
Out[239]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: