notebook.community

Edit and run



In [136]:

    
import pandas as pd
%xmode plain









    



Exception reporting mode: Plain



In [164]:

    
from IPython.core.display import HTML
HTML('<style>{}</style>'.format(open('style-table.css').read()))









    Out[164]:



In [57]:

    
df1 = pd.read_csv('sample.csv')
df1.columns = map(str.lower, df1.columns)
df1









    Out[57]:






  
    
      
      title
      year
      director
    
  
  
    
      0
           North by Northwest
       1959
       Alfred Hitchcock
    
    
      1
                    Notorious
       1946
       Alfred Hitchcock
    
    
      2
       The Philadelphia Story
       1940
           George Cukor
    
    
      3
             To Catch a Thief
       1955
       Alfred Hitchcock
    
    
      4
              His Girl Friday
       1940
           Howard Hawks



In [4]:

    
df1.dtypes









    Out[4]:





title       object
year         int64
director    object
dtype: object



In [5]:

    
df1.groupby('director')









    Out[5]:





<pandas.core.groupby.DataFrameGroupBy object at 0x7f689c0caac8>



In [6]:

    
df1.groupby('director').head()  # looks the same!









    Out[6]:






  
    
      
      title
      year
      director
    
  
  
    
      0
           North by Northwest
       1959
       Alfred Hitchcock
    
    
      1
                    Notorious
       1946
       Alfred Hitchcock
    
    
      2
       The Philadelphia Story
       1940
           George Cukor
    
    
      3
             To Catch a Thief
       1955
       Alfred Hitchcock
    
    
      4
              His Girl Friday
       1940
           Howard Hawks



In [7]:

    
df1.groupby('director').head().equals(df1)









    Out[7]:





True



In [8]:

    
df1.reindex_axis(df1.director)









    Out[8]:






  
    
      
      title
      year
      director
    
    
      director
      
      
      
    
  
  
    
      Alfred Hitchcock
       NaN
      NaN
       NaN
    
    
      Alfred Hitchcock
       NaN
      NaN
       NaN
    
    
      George Cukor
       NaN
      NaN
       NaN
    
    
      Alfred Hitchcock
       NaN
      NaN
       NaN
    
    
      Howard Hawks
       NaN
      NaN
       NaN



In [9]:

    
list(df1.groupby('director'))









    Out[9]:





[('Alfred Hitchcock',                 title  year          director
  0  North by Northwest  1959  Alfred Hitchcock
  1           Notorious  1946  Alfred Hitchcock
  3    To Catch a Thief  1955  Alfred Hitchcock),
 ('George Cukor',                     title  year      director
  2  The Philadelphia Story  1940  George Cukor),
 ('Howard Hawks',              title  year      director
  4  His Girl Friday  1940  Howard Hawks)]



In [10]:

    
df1.groupby('director').get_group('Alfred Hitchcock')









    Out[10]:






  
    
      
      title
      year
      director
    
  
  
    
      0
       North by Northwest
       1959
       Alfred Hitchcock
    
    
      1
                Notorious
       1946
       Alfred Hitchcock
    
    
      3
         To Catch a Thief
       1955
       Alfred Hitchcock



In [11]:

    
df1.groupby('director').first()









    Out[11]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
           North by Northwest
       1959
    
    
      George Cukor
       The Philadelphia Story
       1940
    
    
      Howard Hawks
              His Girl Friday
       1940



In [12]:

    
df1.groupby('director').last()









    Out[12]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
             To Catch a Thief
       1955
    
    
      George Cukor
       The Philadelphia Story
       1940
    
    
      Howard Hawks
              His Girl Friday
       1940



In [13]:

    
df1.groupby('director').max()









    Out[13]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
             To Catch a Thief
       1959
    
    
      George Cukor
       The Philadelphia Story
       1940
    
    
      Howard Hawks
              His Girl Friday
       1940



In [14]:

    
df1.groupby('director').max().ix['Alfred Hitchcock']









    Out[14]:





title    To Catch a Thief
year                 1959
Name: Alfred Hitchcock, dtype: object



In [161]:

    
df2 = df1.groupby(['director', df1.index]).first()
df2









    Out[161]:






  
    
      
      
      title
      year
    
    
      director
      
      
      
    
  
  
    
      Alfred Hitchcock
      0
           North by Northwest
       1959
    
    
      1
                    Notorious
       1946
    
    
      3
             To Catch a Thief
       1955
    
    
      George Cukor
      2
       The Philadelphia Story
       1940
    
    
      Howard Hawks
      4
              His Girl Friday
       1940



In [52]:

    
df3 = df2.reset_index('director')
df3









    Out[52]:






  
    
      
      director
      title
      year
    
  
  
    
      0
       Alfred Hitchcock
           North by Northwest
       1959
    
    
      1
       Alfred Hitchcock
                    Notorious
       1946
    
    
      3
       Alfred Hitchcock
             To Catch a Thief
       1955
    
    
      2
           George Cukor
       The Philadelphia Story
       1940
    
    
      4
           Howard Hawks
              His Girl Friday
       1940



In [60]:

    
df4 = df3[['title', 'year', 'director']] #.equals(df1)
df4









    Out[60]:






  
    
      
      title
      year
      director
    
  
  
    
      0
           North by Northwest
       1959
       Alfred Hitchcock
    
    
      1
                    Notorious
       1946
       Alfred Hitchcock
    
    
      3
             To Catch a Thief
       1955
       Alfred Hitchcock
    
    
      2
       The Philadelphia Story
       1940
           George Cukor
    
    
      4
              His Girl Friday
       1940
           Howard Hawks



In [67]:

    
df4.sort_index()#.equals(df1)









    Out[67]:






  
    
      
      title
      year
      director
    
  
  
    
      0
           North by Northwest
       1959
       Alfred Hitchcock
    
    
      1
                    Notorious
       1946
       Alfred Hitchcock
    
    
      2
       The Philadelphia Story
       1940
           George Cukor
    
    
      3
             To Catch a Thief
       1955
       Alfred Hitchcock
    
    
      4
              His Girl Friday
       1940
           Howard Hawks



In [76]:

    
df1









    Out[76]:






  
    
      
      title
      year
      director
    
  
  
    
      0
           North by Northwest
       1959
       Alfred Hitchcock
    
    
      1
                    Notorious
       1946
       Alfred Hitchcock
    
    
      2
       The Philadelphia Story
       1940
           George Cukor
    
    
      3
             To Catch a Thief
       1955
       Alfred Hitchcock
    
    
      4
              His Girl Friday
       1940
           Howard Hawks



In [73]:

    
df1.dtypes









    Out[73]:





title       object
year         int64
director    object
dtype: object



In [74]:

    
df4.sort_index().dtypes









    Out[74]:





title       object
year         int64
director    object
dtype: object



In [77]:

    
df1.index == df4.sort_index().index









    Out[77]:





array([ True,  True,  True,  True,  True], dtype=bool)



In [78]:

    
print(df1)
print(df1 == df4.sort_index())
print(df1.equals(df4.sort_index()))









    



                    title  year          director
0      North by Northwest  1959  Alfred Hitchcock
1               Notorious  1946  Alfred Hitchcock
2  The Philadelphia Story  1940      George Cukor
3        To Catch a Thief  1955  Alfred Hitchcock
4         His Girl Friday  1940      Howard Hawks
  title  year director
0  True  True     True
1  True  True     True
2  True  True     True
3  True  True     True
4  True  True     True
False



In [82]:

    
df2.reset_index(1)
#'director')
#pd.DataFrame(index=df2.index)









    Out[82]:






  
    
      
      level_1
      title
      year
    
    
      director
      
      
      
    
  
  
    
      Alfred Hitchcock
       0
           North by Northwest
       1959
    
    
      Alfred Hitchcock
       1
                    Notorious
       1946
    
    
      Alfred Hitchcock
       3
             To Catch a Thief
       1955
    
    
      George Cukor
       2
       The Philadelphia Story
       1940
    
    
      Howard Hawks
       4
              His Girl Friday
       1940



In [90]:

    
x = df2.reset_index(1)
print(x.ix['George Cukor'])
print(x.ix['Alfred Hitchcock'])
print(x.index)









    



level_1                         2
title      The Philadelphia Story
year                         1940
Name: George Cukor, dtype: object
                  level_1               title  year
director                                           
Alfred Hitchcock        0  North by Northwest  1959
Alfred Hitchcock        1           Notorious  1946
Alfred Hitchcock        3    To Catch a Thief  1955
Index(['Alfred Hitchcock', 'Alfred Hitchcock', 'Alfred Hitchcock', 'George Cukor', 'Howard Hawks'], dtype='object')



In [101]:

    
y = df1.set_index('director')
y









    Out[101]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
           North by Northwest
       1959
    
    
      Alfred Hitchcock
                    Notorious
       1946
    
    
      George Cukor
       The Philadelphia Story
       1940
    
    
      Alfred Hitchcock
             To Catch a Thief
       1955
    
    
      Howard Hawks
              His Girl Friday
       1940



In [107]:

    
y.ix['George Cukor']









    Out[107]:





title    The Philadelphia Story
year                       1940
Name: George Cukor, dtype: object



In [108]:

    
y.ix['Alfred Hitchcock']









    Out[108]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
       North by Northwest
       1959
    
    
      Alfred Hitchcock
                Notorious
       1946
    
    
      Alfred Hitchcock
         To Catch a Thief
       1955



In [112]:

    
y.loc['George Cukor']









    Out[112]:





title    The Philadelphia Story
year                       1940
Name: George Cukor, dtype: object



In [113]:

    
y.loc['Alfred Hitchcock']









    Out[113]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
       North by Northwest
       1959
    
    
      Alfred Hitchcock
                Notorious
       1946
    
    
      Alfred Hitchcock
         To Catch a Thief
       1955



In [117]:

    
y.at['Alfred Hitchcock','title']









    Out[117]:





array(['North by Northwest', 'Notorious', 'To Catch a Thief'], dtype=object)



In [118]:

    
y.at['George Cukor','title']









    Out[118]:





'The Philadelphia Story'



In [145]:

    
p(df1.set_index('director'))









    



Traceback (most recent call last):

  File "<ipython-input-145-560ed8861e3e>", line 1, in <module>
    p(df1.set_index('director'))

NameError: name 'p' is not defined

print(df1.set_index('director').set_index('title', append=True))



In [125]:

    
df1.set_index('director').set_index('title', append=True).loc['Alfred Hitchcock']









    Out[125]:






  
    
      
      year
    
    
      title
      
    
  
  
    
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      To Catch a Thief
       1955



In [128]:

    
df1.set_index(['director', 'title'])









    Out[128]:






  
    
      
      
      year
    
    
      director
      title
      
    
  
  
    
      Alfred Hitchcock
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      George Cukor
      The Philadelphia Story
       1940
    
    
      Alfred Hitchcock
      To Catch a Thief
       1955
    
    
      Howard Hawks
      His Girl Friday
       1940



In [137]:

    
df1.set_index(['director', 'title']).loc['Alfred Hitchcock':'George Cukor']









    



Traceback (most recent call last):

  File "<ipython-input-137-0f8b79e18ab8>", line 1, in <module>
    df1.set_index(['director', 'title']).loc['Alfred Hitchcock':'George Cukor']

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/indexing.py", line 1202, in __getitem__
    return self._getitem_axis(key, axis=0)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/indexing.py", line 1316, in _getitem_axis
    return self._get_slice_axis(key, axis=axis)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/indexing.py", line 1224, in _get_slice_axis
    slice_obj.step)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 2027, in slice_indexer
    start_slice, end_slice = self.slice_locs(start, end, step=step)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 4063, in slice_locs
    return super(MultiIndex, self).slice_locs(start, end, step)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 2141, in slice_locs
    start_slice = self.get_slice_bound(start, 'left')

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 4035, in get_slice_bound
    return self._partial_tup_index(label, side=side)

  File "/home/brandon/.v/pandas-tutorial/lib/python3.4/site-packages/pandas/core/index.py", line 4069, in _partial_tup_index
    (len(tup), self.lexsort_depth))

KeyError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'



In [138]:

    
df1.set_index(['director', 'title']).sort_index().loc['Alfred Hitchcock':'George Cukor']









    Out[138]:






  
    
      
      
      year
    
    
      director
      title
      
    
  
  
    
      Alfred Hitchcock
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      To Catch a Thief
       1955
    
    
      George Cukor
      The Philadelphia Story
       1940



In [ ]:

    
df1.set_index(['director', 'title']).sort_index()



In [154]:

    
df1.groupby(['director', 'title']).first()









    Out[154]:






  
    
      
      
      year
    
    
      director
      title
      
    
  
  
    
      Alfred Hitchcock
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      To Catch a Thief
       1955
    
    
      George Cukor
      The Philadelphia Story
       1940
    
    
      Howard Hawks
      His Girl Friday
       1940



In [134]:

    
df1.groupby(['director', 'title']).first()









    Out[134]:






  
    
      
      
      year
    
    
      director
      title
      
    
  
  
    
      Alfred Hitchcock
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      To Catch a Thief
       1955
    
    
      George Cukor
      The Philadelphia Story
       1940
    
    
      Howard Hawks
      His Girl Friday
       1940



In [184]:

    
#
# Official diagram generation
#

open('u__tyd.html', 'w').write(df1.to_html())
open('u_d_ty.html', 'w').write(df1.set_index('director').to_html())
open('u_dt_y.html', 'w').write(df1.set_index('director').set_index('title', append=True).to_html())
open('s_d_ty.html', 'w').write(df1.set_index('director').sort_index().to_html())
open('s_dt_y.html', 'w').write(df1.set_index('director').set_index('title', append=True).sort_index().to_html())

print(
    df1.set_index('director').set_index('title', append=True).equals(
    df1.set_index(['director', 'title'])
    ))

print(
    df1.set_index(['director', 'title']).sort_index().equals(
    df1.groupby(['director', 'title']).first()
    ))

print(
    df1.set_index('director').sort_index().set_index('title', append=True).equals(
    df1.groupby(['director', 'title']).first()
    ))









    



True
True
True



In [182]:

    
df1.set_index('director').sort_index().set_index('title')









    Out[182]:






  
    
      
      year
    
    
      title
      
    
  
  
    
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      To Catch a Thief
       1955
    
    
      The Philadelphia Story
       1940
    
    
      His Girl Friday
       1940



In [179]:









    Out[179]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
           North by Northwest
       1959
    
    
      George Cukor
       The Philadelphia Story
       1940
    
    
      Howard Hawks
              His Girl Friday
       1940



In [ ]:



In [ ]:



In [ ]:



In [127]:

    
df1.set_index(['director', 'title']).loc['Alfred Hitchcock']









    Out[127]:






  
    
      
      year
    
    
      title
      
    
  
  
    
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      To Catch a Thief
       1955



In [129]:

    
df1.set_index(['director', 'title']).reset_index('title')









    Out[129]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
           North by Northwest
       1959
    
    
      Alfred Hitchcock
                    Notorious
       1946
    
    
      George Cukor
       The Philadelphia Story
       1940
    
    
      Alfred Hitchcock
             To Catch a Thief
       1955
    
    
      Howard Hawks
              His Girl Friday
       1940



In [130]:

    
df1.set_index(['director', 'title']).reset_index(['director', 'title'])









    Out[130]:






  
    
      
      director
      title
      year
    
  
  
    
      0
       Alfred Hitchcock
           North by Northwest
       1959
    
    
      1
       Alfred Hitchcock
                    Notorious
       1946
    
    
      2
           George Cukor
       The Philadelphia Story
       1940
    
    
      3
       Alfred Hitchcock
             To Catch a Thief
       1955
    
    
      4
           Howard Hawks
              His Girl Friday
       1940



In [131]:

    
df1









    Out[131]:






  
    
      
      title
      year
      director
    
  
  
    
      0
           North by Northwest
       1959
       Alfred Hitchcock
    
    
      1
                    Notorious
       1946
       Alfred Hitchcock
    
    
      2
       The Philadelphia Story
       1940
           George Cukor
    
    
      3
             To Catch a Thief
       1955
       Alfred Hitchcock
    
    
      4
              His Girl Friday
       1940
           Howard Hawks



In [ ]:



In [ ]:



In [35]:

    
df2.unstack(level=0)









    Out[35]:






  
    
      
      title
      year
    
    
      director
      Alfred Hitchcock
      George Cukor
      Howard Hawks
      Alfred Hitchcock
      George Cukor
      Howard Hawks
    
  
  
    
      0
       North by Northwest
                          NaN
                   NaN
       1959
        NaN
        NaN
    
    
      1
                Notorious
                          NaN
                   NaN
       1946
        NaN
        NaN
    
    
      2
                      NaN
       The Philadelphia Story
                   NaN
        NaN
       1940
        NaN
    
    
      3
         To Catch a Thief
                          NaN
                   NaN
       1955
        NaN
        NaN
    
    
      4
                      NaN
                          NaN
       His Girl Friday
        NaN
        NaN
       1940



In [36]:

    
df2.transpose()









    Out[36]:






  
    
      director
      Alfred Hitchcock
      George Cukor
      Howard Hawks
    
    
      
      0
      1
      3
      2
      4
    
  
  
    
      title
       North by Northwest
       Notorious
       To Catch a Thief
       The Philadelphia Story
       His Girl Friday
    
    
      year
                     1959
            1946
                   1955
                         1940
                  1940



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [17]:

    
df2.ix['Alfred Hitchcock']









    Out[17]:






  
    
      
      title
      year
    
  
  
    
      0
       North by Northwest
       1959
    
    
      1
                Notorious
       1946
    
    
      3
         To Catch a Thief
       1955



In [19]:

    
df2.ix['Alfred Hitchcock',3]









    Out[19]:





title    To Catch a Thief
year                 1955
Name: (Alfred Hitchcock, 3), dtype: object



In [20]:

    
df1.groupby('director')['year'].agg([('first_year', min), ('last_year', max)])









    Out[20]:






  
    
      
      first_year
      last_year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
       1946
       1959
    
    
      George Cukor
       1940
       1940
    
    
      Howard Hawks
       1940
       1940



In [21]:

    
df1.groupby('director').agg({'title': min, 'year': max})









    Out[21]:






  
    
      
      title
      year
    
    
      director
      
      
    
  
  
    
      Alfred Hitchcock
           North by Northwest
       1959
    
    
      George Cukor
       The Philadelphia Story
       1940
    
    
      Howard Hawks
              His Girl Friday
       1940



In [24]:

    
df3 = df1.groupby(['director', 'title']).first()
df3









    Out[24]:






  
    
      
      
      year
    
    
      director
      title
      
    
  
  
    
      Alfred Hitchcock
      North by Northwest
       1959
    
    
      Notorious
       1946
    
    
      To Catch a Thief
       1955
    
    
      George Cukor
      The Philadelphia Story
       1940
    
    
      Howard Hawks
      His Girl Friday
       1940



In [25]:

    
df4 = df1.groupby(['director', 'year']).first()
df4









    Out[25]:






  
    
      
      
      title
    
    
      director
      year
      
    
  
  
    
      Alfred Hitchcock
      1946
                    Notorious
    
    
      1955
             To Catch a Thief
    
    
      1959
           North by Northwest
    
    
      George Cukor
      1940
       The Philadelphia Story
    
    
      Howard Hawks
      1940
              His Girl Friday



In [26]:

    
df1.groupby(['director', 'title', 'year']).first()









    Out[26]:






  
    
      
      
      
    
    
      director
      title
      year



In [27]:

    
s1 = df1.stack()
s1









    Out[27]:





0  title           North by Northwest
   year                          1959
   director          Alfred Hitchcock
1  title                    Notorious
   year                          1946
   director          Alfred Hitchcock
2  title       The Philadelphia Story
   year                          1940
   director              George Cukor
3  title             To Catch a Thief
   year                          1955
   director          Alfred Hitchcock
4  title              His Girl Friday
   year                          1940
   director              Howard Hawks
dtype: object



In [28]:

    
s2 = df1.unstack()
s2









    Out[28]:





title     0        North by Northwest
          1                 Notorious
          2    The Philadelphia Story
          3          To Catch a Thief
          4           His Girl Friday
year      0                      1959
          1                      1946
          2                      1940
          3                      1955
          4                      1940
director  0          Alfred Hitchcock
          1          Alfred Hitchcock
          2              George Cukor
          3          Alfred Hitchcock
          4              Howard Hawks
dtype: object



In [29]:

    
df3.stack()









    Out[29]:





director          title                       
Alfred Hitchcock  North by Northwest      year    1959
                  Notorious               year    1946
                  To Catch a Thief        year    1955
George Cukor      The Philadelphia Story  year    1940
Howard Hawks      His Girl Friday         year    1940
dtype: int64



In [30]:

    
df3.unstack()









    Out[30]:






  
    
      
      year
    
    
      title
      His Girl Friday
      North by Northwest
      Notorious
      The Philadelphia Story
      To Catch a Thief
    
    
      director
      
      
      
      
      
    
  
  
    
      Alfred Hitchcock
        NaN
       1959
       1946
        NaN
       1955
    
    
      George Cukor
        NaN
        NaN
        NaN
       1940
        NaN
    
    
      Howard Hawks
       1940
        NaN
        NaN
        NaN
        NaN



In [ ]:

	title	year	director
0	North by Northwest	1959	Alfred Hitchcock
1	Notorious	1946	Alfred Hitchcock
2	The Philadelphia Story	1940	George Cukor
3	To Catch a Thief	1955	Alfred Hitchcock
4	His Girl Friday	1940	Howard Hawks

	title	year	director
director
Alfred Hitchcock	NaN	NaN	NaN
Alfred Hitchcock	NaN	NaN	NaN
George Cukor	NaN	NaN	NaN
Alfred Hitchcock	NaN	NaN	NaN
Howard Hawks	NaN	NaN	NaN

	year
title
North by Northwest	1959
Notorious	1946
To Catch a Thief	1955

		title
director	year
Alfred Hitchcock	1946	Notorious
	1955	To Catch a Thief
	1959	North by Northwest
George Cukor	1940	The Philadelphia Story
Howard Hawks	1940	His Girl Friday