In [1]:
%matplotlib inline
import pandas as pd

In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))


Out[2]:

In [7]:
titles = pd.DataFrame.from_csv('data/titles.csv', index_col=None,encoding='utf-8')
titles.head()


Out[7]:
title year
0 The Rising Son 1990
1 Ashes of Kukulcan 2016
2 The Thousand Plane Raid 1969
3 Crucea de piatra 1993
4 The 86 2015

In [7]:
chardet.detect?

In [8]:
type(rawdata)


Out[8]:
_io.TextIOWrapper

In [16]:
import glob
glob.glob('*.csv')


Out[16]:
['sales2.csv', 'sales1.csv']

In [19]:
import glob
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
for filename in glob.glob('*.csv'):
    print( filename.ljust(60)),
    detector.reset()
    for line in file(filename, 'rb'):
        detector.feed(line)
        if detector.done: break
    detector.close()
    print( detector.result)


sales2.csv                                                  
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-51b03ca5ab5e> in <module>()
      6     print( filename.ljust(60)),
      7     detector.reset()
----> 8     for line in file(filename, 'rb'):
      9         detector.feed(line)
     10         if detector.done: break

NameError: name 'file' is not defined

In [ ]:


In [22]:
import chardet
from io import StringIO
rawdata=open('data/cast.csv').read()
chardet.detect(StringIO(rawdata))


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-22-ff0df6818d27> in <module>()
      2 from io import StringIO
      3 rawdata=open('data/cast.csv').read()
----> 4 chardet.detect(StringIO(rawdata))

MemoryError: 

In [10]:
cast = pd.DataFrame.from_csv('data/cast.csv', index_col=None)
cast.head()


Out[10]:
title year name type character n
0 Suuri illusioni 1985 Homo $ actor Guests 22
1 Gangsta Rap: The Glockumentary 2007 Too $hort actor Himself NaN
2 Menace II Society 1993 Too $hort actor Lew-Loc 27
3 Porndogs: The Adventures of Sadie 2009 Too $hort actor Bosco 3
4 Stop Pepper Palmer 2014 Too $hort actor Himself NaN

What are the ten most common movie names of all time?


In [7]:
titles['title'].value_counts()[:10]


Out[7]:
Hamlet                  19
Macbeth                 14
Carmen                  14
The Three Musketeers    12
Maya                    11
She                     11
Eva                     10
Temptation              10
Karma                   10
Honeymoon               10
dtype: int64

In [ ]:

Which three years of the 1930s saw the most films released?


In [8]:
titles[(titles['year']<1940)&(titles['year']>=1930)]['year'].value_counts()


Out[8]:
1937    1187
1936    1129
1938    1115
1939    1055
1935    1039
1934     955
1932     931
1933     880
1931     872
1930     814
dtype: int64

In [ ]:

Plot the number of films that have been released each decade over the history of cinema.


In [29]:
dec=((titles['year']//10)*10)
print(dec.max())
print(dec.min())
dec.hist(bins=(dec.max()-dec.min())/10+1)


2020
1890
Out[29]:
<matplotlib.axes.AxesSubplot at 0x7f7dde262a90>

Plot the number of "Hamlet" films made each decade.


In [35]:
hamdec=titles[titles['title']=="Hamlet"]
hamdec['year']=(hamdec['year']//10)*10
hamdec['year'].hist(bins=(hamdec['year'].max()-hamdec['year'].min())/10+1)


/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[35]:
<matplotlib.axes.AxesSubplot at 0x7f7dde0e2400>

In [34]:
hamdec


Out[34]:
title year
5571 Hamlet 1940
26280 Hamlet 2010
37562 Hamlet 2010
42436 Hamlet 1990
43711 Hamlet 1910
68563 Hamlet 1950
86177 Hamlet 1970
88221 Hamlet 1980
88412 Hamlet 2000
92180 Hamlet 1920
114784 Hamlet 1960
155367 Hamlet 1910
164427 Hamlet 2000
170820 Hamlet 1960
187902 Hamlet 1990
193741 Hamlet 2010
195956 Hamlet 1910
206810 Hamlet 1970
208799 Hamlet 2010

Plot the number of "Rustler" characters in each decade of the history of film.


In [36]:
hamdec=cast[cast['character']=="Rustler"]
hamdec['year']=(hamdec['year']//10)*10
hamdec['year'].hist(bins=(hamdec['year'].max()-hamdec['year'].min())/10+1)


/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[36]:
<matplotlib.axes.AxesSubplot at 0x7f7dddfabf28>

In [ ]:

Plot the number of "Hamlet" characters each decade.


In [37]:
hamdec=cast[cast['character']=="Hamlet"]
hamdec['year']=(hamdec['year']//10)*10
hamdec['year'].hist(bins=(hamdec['year'].max()-hamdec['year'].min())/10+1)


/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
Out[37]:
<matplotlib.axes.AxesSubplot at 0x7f7dde533a90>

In [ ]:

What are the 11 most common character names in movie history?


In [39]:
cast['character'].value_counts()[:11]


Out[39]:
Himself        18857
Dancer         11200
Extra           9268
Reporter        7676
Doctor          6895
Policeman       6534
Student         6415
Nurse           6217
Bartender       6205
Party Guest     6005
Minor Role      5943
dtype: int64

In [ ]:

Who are the 10 people most often credited as "Herself" in film history?


In [9]:
cast[cast['character']=='Herself']['name'].value_counts()[:10]


Out[9]:
Joyce Brothers         14
Queen Elizabeth II     11
Margaret Thatcher       8
Lili?n Garc?a           7
Joan Rivers             7
Mary Jo Pehl            7
Sally Jessy Raphael     5
Chris Evert             5
Juhi Chawla             5
Bunny Yeager            5
dtype: int64

In [ ]:

Who are the 10 people most often credited as "Himself" in film history?


In [42]:
cast[cast['character']=='Himself']['name'].value_counts()[:10]


Out[42]:
Adolf Hitler             93
Richard Nixon            39
Ronald Reagan            31
John F. Kennedy          25
Ron Jeremy               25
Bill Clinton             20
Franklin D. Roosevelt    20
George W. Bush           20
Winston Churchill        20
Benito Mussolini         18
dtype: int64

In [ ]:

Which actors or actresses appeared in the most movies in the year 1945?


In [43]:
cast[cast['year']==1945]['name'].value_counts()[:10]


Out[43]:
Emmett Vogan        39
Sam (II) Harris     30
Bess Flowers        28
Harold Miller       28
Nolan Leary         27
Frank O'Connor      26
Franklyn Farnum     25
Tom London          24
Charles Sullivan    24
Edmund Cobb         24
dtype: int64

In [ ]:

Which actors or actresses appeared in the most movies in the year 1985?


In [44]:
cast[cast['year']==1985]['name'].value_counts()[:10]


Out[44]:
Mammootty        19
Shakti Kapoor    19
Sukumari         17
Lou Scheimer     15
Aruna Irani      14
Rajesh Khanna    13
Deven Verma      13
Mohanlal         13
Raj Babbar       13
Om Shivpuri      12
dtype: int64

In [ ]:

Plot how many roles Mammootty has played in each year of his career.


In [49]:
cast[cast['name']=='Mammootty'].hist(column='year')


Out[49]:
array([[<matplotlib.axes.AxesSubplot object at 0x7f7dddbca128>]], dtype=object)

In [46]:
cast.hist?

In [ ]:

What are the 10 most frequent roles that start with the phrase "Patron in"?


In [52]:
cast[cast['character'].str.startswith('Patron in')]['character'].value_counts()[:10]


Out[52]:
Patron in Frisky Rabbit         16
Patron in Chinese Restaurant     9
Patron in the Coffee House       9
Patron in Billiard Parlor        5
Patron in Bar                    4
Patron in restaurant             3
Patron in Restaurant             3
Patron in Club                   3
Patron in cabaret                3
Patron in Audience               2
dtype: int64

What are the 10 most frequent roles that start with the word "Science"?


In [54]:
cast[cast['character'].str.startswith('Science')]['character'].value_counts()[:10]


Out[54]:
Science Teacher         54
Science Student          9
Science Fair Student     8
Science Club Member      5
Science Reporter         5
Science Promo Cadet      4
Science Fair Judge       4
Science Kid              4
Science Officer          3
Science Fair Kid         3
dtype: int64

In [ ]:

Plot the n-values of the roles that Judi Dench has played over her career.


In [55]:
cast[cast['name']=='Judi Dench'].plot(kind='scatter',x='year',y='n')


Out[55]:
<matplotlib.axes.AxesSubplot at 0x7f7dddda5710>

In [ ]:

Plot the n-values of Cary Grant's roles through his career.


In [ ]:


In [ ]:

Plot the n-value of the roles that Sidney Poitier has acted over the years.


In [56]:
cast[cast['name']=='Sidney Poitier'].plot(kind='scatter',x='year',y='n')


Out[56]:
<matplotlib.axes.AxesSubplot at 0x7f7ddd9c3978>

In [ ]:

How many leading (n=1) roles were available to actors, and how many to actresses, in the 1950s?


In [60]:
cast[(cast['n']==1)&(cast['type']=='actor')&(cast['year']<1960)&(cast['year']>=1950)].shape[0]


Out[60]:
6390

In [61]:
cast[(cast['n']==1)&(cast['type']=='actress')&(cast['year']<1960)&(cast['year']>=1950)].shape[0]


Out[61]:
2818

How many supporting (n=2) roles were available to actors, and how many to actresses, in the 1950s?


In [63]:
cast[(cast['n']==2)&(cast['type']=='actor')&(cast['year']<1960)&(cast['year']>=1950)].shape[0]


Out[63]:
4388

In [62]:
cast[(cast['n']==2)&(cast['type']=='actress')&(cast['year']<1960)&(cast['year']>=1950)].shape[0]


Out[62]:
4406

In [ ]: