In [136]:
import pandas as pd
%xmode plain


Exception reporting mode: Plain

In [164]:
from IPython.core.display import HTML
HTML('<style>{}</style>'.format(open('style-table.css').read()))


Out[164]:

In [57]:
df1 = pd.read_csv('sample.csv')
df1.columns = map(str.lower, df1.columns)
df1


Out[57]:
title year director
0 North by Northwest 1959 Alfred Hitchcock
1 Notorious 1946 Alfred Hitchcock
2 The Philadelphia Story 1940 George Cukor
3 To Catch a Thief 1955 Alfred Hitchcock
4 His Girl Friday 1940 Howard Hawks

In [4]:
df1.dtypes


Out[4]:
title       object
year         int64
director    object
dtype: object

In [5]:
df1.groupby('director')


Out[5]:
<pandas.core.groupby.DataFrameGroupBy object at 0x7f689c0caac8>

In [6]:
df1.groupby('director').head()  # looks the same!


Out[6]:
title year director
0 North by Northwest 1959 Alfred Hitchcock
1 Notorious 1946 Alfred Hitchcock
2 The Philadelphia Story 1940 George Cukor
3 To Catch a Thief 1955 Alfred Hitchcock
4 His Girl Friday 1940 Howard Hawks

In [7]:
df1.groupby('director').head().equals(df1)


Out[7]:
True

In [8]:
df1.reindex_axis(df1.director)


Out[8]:
title year director
director
Alfred Hitchcock NaN NaN NaN
Alfred Hitchcock NaN NaN NaN
George Cukor NaN NaN NaN
Alfred Hitchcock NaN NaN NaN
Howard Hawks NaN NaN NaN

In [9]:
list(df1.groupby('director'))


Out[9]:
[('Alfred Hitchcock',                 title  year          director
  0  North by Northwest  1959  Alfred Hitchcock
  1           Notorious  1946  Alfred Hitchcock
  3    To Catch a Thief  1955  Alfred Hitchcock),
 ('George Cukor',                     title  year      director
  2  The Philadelphia Story  1940  George Cukor),
 ('Howard Hawks',              title  year      director
  4  His Girl Friday  1940  Howard Hawks)]

In [10]:
df1.groupby('director').get_group('Alfred Hitchcock')


Out[10]:
title year director
0 North by Northwest 1959 Alfred Hitchcock
1 Notorious 1946 Alfred Hitchcock
3 To Catch a Thief 1955 Alfred Hitchcock

In [11]:
df1.groupby('director').first()


Out[11]:
title year
director
Alfred Hitchcock North by Northwest 1959
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [12]:
df1.groupby('director').last()


Out[12]:
title year
director
Alfred Hitchcock To Catch a Thief 1955
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [13]:
df1.groupby('director').max()


Out[13]:
title year
director
Alfred Hitchcock To Catch a Thief 1959
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [14]:
df1.groupby('director').max().ix['Alfred Hitchcock']


Out[14]:
title    To Catch a Thief
year                 1959
Name: Alfred Hitchcock, dtype: object

In [161]:
df2 = df1.groupby(['director', df1.index]).first()
df2


Out[161]:
title year
director
Alfred Hitchcock 0 North by Northwest 1959
1 Notorious 1946
3 To Catch a Thief 1955
George Cukor 2 The Philadelphia Story 1940
Howard Hawks 4 His Girl Friday 1940

In [52]:
df3 = df2.reset_index('director')
df3


Out[52]:
director title year
0 Alfred Hitchcock North by Northwest 1959
1 Alfred Hitchcock Notorious 1946
3 Alfred Hitchcock To Catch a Thief 1955
2 George Cukor The Philadelphia Story 1940
4 Howard Hawks His Girl Friday 1940

In [60]:
df4 = df3[['title', 'year', 'director']] #.equals(df1)
df4


Out[60]:
title year director
0 North by Northwest 1959 Alfred Hitchcock
1 Notorious 1946 Alfred Hitchcock
3 To Catch a Thief 1955 Alfred Hitchcock
2 The Philadelphia Story 1940 George Cukor
4 His Girl Friday 1940 Howard Hawks

In [67]:
df4.sort_index()#.equals(df1)


Out[67]:
title year director
0 North by Northwest 1959 Alfred Hitchcock
1 Notorious 1946 Alfred Hitchcock
2 The Philadelphia Story 1940 George Cukor
3 To Catch a Thief 1955 Alfred Hitchcock
4 His Girl Friday 1940 Howard Hawks

In [76]:
df1


Out[76]:
title year director
0 North by Northwest 1959 Alfred Hitchcock
1 Notorious 1946 Alfred Hitchcock
2 The Philadelphia Story 1940 George Cukor
3 To Catch a Thief 1955 Alfred Hitchcock
4 His Girl Friday 1940 Howard Hawks

In [73]:
df1.dtypes


Out[73]:
title       object
year         int64
director    object
dtype: object

In [74]:
df4.sort_index().dtypes


Out[74]:
title       object
year         int64
director    object
dtype: object

In [77]:
df1.index == df4.sort_index().index


Out[77]:
array([ True,  True,  True,  True,  True], dtype=bool)

In [78]:
print(df1)
print(df1 == df4.sort_index())
print(df1.equals(df4.sort_index()))


                    title  year          director
0      North by Northwest  1959  Alfred Hitchcock
1               Notorious  1946  Alfred Hitchcock
2  The Philadelphia Story  1940      George Cukor
3        To Catch a Thief  1955  Alfred Hitchcock
4         His Girl Friday  1940      Howard Hawks
  title  year director
0  True  True     True
1  True  True     True
2  True  True     True
3  True  True     True
4  True  True     True
False

In [82]:
df2.reset_index(1)
#'director')
#pd.DataFrame(index=df2.index)


Out[82]:
level_1 title year
director
Alfred Hitchcock 0 North by Northwest 1959
Alfred Hitchcock 1 Notorious 1946
Alfred Hitchcock 3 To Catch a Thief 1955
George Cukor 2 The Philadelphia Story 1940
Howard Hawks 4 His Girl Friday 1940

In [90]:
x = df2.reset_index(1)
print(x.ix['George Cukor'])
print(x.ix['Alfred Hitchcock'])
print(x.index)


level_1                         2
title      The Philadelphia Story
year                         1940
Name: George Cukor, dtype: object
                  level_1               title  year
director                                           
Alfred Hitchcock        0  North by Northwest  1959
Alfred Hitchcock        1           Notorious  1946
Alfred Hitchcock        3    To Catch a Thief  1955
Index(['Alfred Hitchcock', 'Alfred Hitchcock', 'Alfred Hitchcock', 'George Cukor', 'Howard Hawks'], dtype='object')

In [101]:
y = df1.set_index('director')
y


Out[101]:
title year
director
Alfred Hitchcock North by Northwest 1959
Alfred Hitchcock Notorious 1946
George Cukor The Philadelphia Story 1940
Alfred Hitchcock To Catch a Thief 1955
Howard Hawks His Girl Friday 1940

In [107]:
y.ix['George Cukor']


Out[107]:
title    The Philadelphia Story
year                       1940
Name: George Cukor, dtype: object

In [108]:
y.ix['Alfred Hitchcock']


Out[108]:
title year
director
Alfred Hitchcock North by Northwest 1959
Alfred Hitchcock Notorious 1946
Alfred Hitchcock To Catch a Thief 1955

In [112]:
y.loc['George Cukor']


Out[112]:
title    The Philadelphia Story
year                       1940
Name: George Cukor, dtype: object

In [113]:
y.loc['Alfred Hitchcock']


Out[113]:
title year
director
Alfred Hitchcock North by Northwest 1959
Alfred Hitchcock Notorious 1946
Alfred Hitchcock To Catch a Thief 1955

In [117]:
y.at['Alfred Hitchcock','title']


Out[117]:
array(['North by Northwest', 'Notorious', 'To Catch a Thief'], dtype=object)

In [118]:
y.at['George Cukor','title']


Out[118]:
'The Philadelphia Story'

In [145]:
p(df1.set_index('director'))


Traceback (most recent call last):

  File "<ipython-input-145-560ed8861e3e>", line 1, in <module>
    p(df1.set_index('director'))

NameError: name 'p' is not defined
print(df1.set_index('director').set_index('title', append=True))

In [125]:
df1.set_index('director').set_index('title', append=True).loc['Alfred Hitchcock']


Out[125]:
year
title
North by Northwest 1959
Notorious 1946
To Catch a Thief 1955

In [128]:
df1.set_index(['director', 'title'])


Out[128]:
year
director title
Alfred Hitchcock North by Northwest 1959
Notorious 1946
George Cukor The Philadelphia Story 1940
Alfred Hitchcock To Catch a Thief 1955
Howard Hawks His Girl Friday 1940

In [137]:
df1.set_index(['director', 'title']).loc['Alfred Hitchcock':'George Cukor']


Traceback (most recent call last):

  File "<ipython-input-137-0f8b79e18ab8>", line 1, in <module>
    df1.set_index(['director', 'title']).loc['Alfred Hitchcock':'George Cukor']

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/indexing.py", line 1202, in __getitem__
    return self._getitem_axis(key, axis=0)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/indexing.py", line 1316, in _getitem_axis
    return self._get_slice_axis(key, axis=axis)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/indexing.py", line 1224, in _get_slice_axis
    slice_obj.step)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 2027, in slice_indexer
    start_slice, end_slice = self.slice_locs(start, end, step=step)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 4063, in slice_locs
    return super(MultiIndex, self).slice_locs(start, end, step)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 2141, in slice_locs
    start_slice = self.get_slice_bound(start, 'left')

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 4035, in get_slice_bound
    return self._partial_tup_index(label, side=side)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 4069, in _partial_tup_index
    (len(tup), self.lexsort_depth))

KeyError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [138]:
df1.set_index(['director', 'title']).sort_index().loc['Alfred Hitchcock':'George Cukor']


Out[138]:
year
director title
Alfred Hitchcock North by Northwest 1959
Notorious 1946
To Catch a Thief 1955
George Cukor The Philadelphia Story 1940

In [ ]:
df1.set_index(['director', 'title']).sort_index()

In [154]:
df1.groupby(['director', 'title']).first()


Out[154]:
year
director title
Alfred Hitchcock North by Northwest 1959
Notorious 1946
To Catch a Thief 1955
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [134]:
df1.groupby(['director', 'title']).first()


Out[134]:
year
director title
Alfred Hitchcock North by Northwest 1959
Notorious 1946
To Catch a Thief 1955
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [184]:
#
# Official diagram generation
#

open('u__tyd.html', 'w').write(df1.to_html())
open('u_d_ty.html', 'w').write(df1.set_index('director').to_html())
open('u_dt_y.html', 'w').write(df1.set_index('director').set_index('title', append=True).to_html())
open('s_d_ty.html', 'w').write(df1.set_index('director').sort_index().to_html())
open('s_dt_y.html', 'w').write(df1.set_index('director').set_index('title', append=True).sort_index().to_html())

print(
    df1.set_index('director').set_index('title', append=True).equals(
    df1.set_index(['director', 'title'])
    ))

print(
    df1.set_index(['director', 'title']).sort_index().equals(
    df1.groupby(['director', 'title']).first()
    ))

print(
    df1.set_index('director').sort_index().set_index('title', append=True).equals(
    df1.groupby(['director', 'title']).first()
    ))


True
True
True

In [182]:
df1.set_index('director').sort_index().set_index('title')


Out[182]:
year
title
North by Northwest 1959
Notorious 1946
To Catch a Thief 1955
The Philadelphia Story 1940
His Girl Friday 1940

In [179]:



Out[179]:
title year
director
Alfred Hitchcock North by Northwest 1959
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [ ]:


In [ ]:


In [ ]:


In [127]:
df1.set_index(['director', 'title']).loc['Alfred Hitchcock']


Out[127]:
year
title
North by Northwest 1959
Notorious 1946
To Catch a Thief 1955

In [129]:
df1.set_index(['director', 'title']).reset_index('title')


Out[129]:
title year
director
Alfred Hitchcock North by Northwest 1959
Alfred Hitchcock Notorious 1946
George Cukor The Philadelphia Story 1940
Alfred Hitchcock To Catch a Thief 1955
Howard Hawks His Girl Friday 1940

In [130]:
df1.set_index(['director', 'title']).reset_index(['director', 'title'])


Out[130]:
director title year
0 Alfred Hitchcock North by Northwest 1959
1 Alfred Hitchcock Notorious 1946
2 George Cukor The Philadelphia Story 1940
3 Alfred Hitchcock To Catch a Thief 1955
4 Howard Hawks His Girl Friday 1940

In [131]:
df1


Out[131]:
title year director
0 North by Northwest 1959 Alfred Hitchcock
1 Notorious 1946 Alfred Hitchcock
2 The Philadelphia Story 1940 George Cukor
3 To Catch a Thief 1955 Alfred Hitchcock
4 His Girl Friday 1940 Howard Hawks

In [ ]:


In [ ]:


In [35]:
df2.unstack(level=0)


Out[35]:
title year
director Alfred Hitchcock George Cukor Howard Hawks Alfred Hitchcock George Cukor Howard Hawks
0 North by Northwest NaN NaN 1959 NaN NaN
1 Notorious NaN NaN 1946 NaN NaN
2 NaN The Philadelphia Story NaN NaN 1940 NaN
3 To Catch a Thief NaN NaN 1955 NaN NaN
4 NaN NaN His Girl Friday NaN NaN 1940

In [36]:
df2.transpose()


Out[36]:
director Alfred Hitchcock George Cukor Howard Hawks
0 1 3 2 4
title North by Northwest Notorious To Catch a Thief The Philadelphia Story His Girl Friday
year 1959 1946 1955 1940 1940

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [17]:
df2.ix['Alfred Hitchcock']


Out[17]:
title year
0 North by Northwest 1959
1 Notorious 1946
3 To Catch a Thief 1955

In [19]:
df2.ix['Alfred Hitchcock',3]


Out[19]:
title    To Catch a Thief
year                 1955
Name: (Alfred Hitchcock, 3), dtype: object

In [20]:
df1.groupby('director')['year'].agg([('first_year', min), ('last_year', max)])


Out[20]:
first_year last_year
director
Alfred Hitchcock 1946 1959
George Cukor 1940 1940
Howard Hawks 1940 1940

In [21]:
df1.groupby('director').agg({'title': min, 'year': max})


Out[21]:
title year
director
Alfred Hitchcock North by Northwest 1959
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [24]:
df3 = df1.groupby(['director', 'title']).first()
df3


Out[24]:
year
director title
Alfred Hitchcock North by Northwest 1959
Notorious 1946
To Catch a Thief 1955
George Cukor The Philadelphia Story 1940
Howard Hawks His Girl Friday 1940

In [25]:
df4 = df1.groupby(['director', 'year']).first()
df4


Out[25]:
title
director year
Alfred Hitchcock 1946 Notorious
1955 To Catch a Thief
1959 North by Northwest
George Cukor 1940 The Philadelphia Story
Howard Hawks 1940 His Girl Friday

In [26]:
df1.groupby(['director', 'title', 'year']).first()


Out[26]:
director title year

In [27]:
s1 = df1.stack()
s1


Out[27]:
0  title           North by Northwest
   year                          1959
   director          Alfred Hitchcock
1  title                    Notorious
   year                          1946
   director          Alfred Hitchcock
2  title       The Philadelphia Story
   year                          1940
   director              George Cukor
3  title             To Catch a Thief
   year                          1955
   director          Alfred Hitchcock
4  title              His Girl Friday
   year                          1940
   director              Howard Hawks
dtype: object

In [28]:
s2 = df1.unstack()
s2


Out[28]:
title     0        North by Northwest
          1                 Notorious
          2    The Philadelphia Story
          3          To Catch a Thief
          4           His Girl Friday
year      0                      1959
          1                      1946
          2                      1940
          3                      1955
          4                      1940
director  0          Alfred Hitchcock
          1          Alfred Hitchcock
          2              George Cukor
          3          Alfred Hitchcock
          4              Howard Hawks
dtype: object

In [29]:
df3.stack()


Out[29]:
director          title                       
Alfred Hitchcock  North by Northwest      year    1959
                  Notorious               year    1946
                  To Catch a Thief        year    1955
George Cukor      The Philadelphia Story  year    1940
Howard Hawks      His Girl Friday         year    1940
dtype: int64

In [30]:
df3.unstack()


Out[30]:
year
title His Girl Friday North by Northwest Notorious The Philadelphia Story To Catch a Thief
director
Alfred Hitchcock NaN 1959 1946 NaN 1955
George Cukor NaN NaN NaN 1940 NaN
Howard Hawks 1940 NaN NaN NaN NaN

In [ ]: