notebook.community

Edit and run

Title: Hierarchical Data In Pandas
Slug: pandas_hierarchical_data
Summary: Hierarchical Data In Pandas
Date: 2016-05-01 12:00
Category: Python
Tags: Data Wrangling
Authors: Chris Albon



In [1]:

    
# import modules
import pandas as pd



In [2]:

    
# Create dataframe
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], 
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], 
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], 
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df









    Out[2]:







  
    
      
      regiment
      company
      name
      preTestScore
      postTestScore
    
  
  
    
      0
      Nighthawks
      1st
      Miller
      4
      25
    
    
      1
      Nighthawks
      1st
      Jacobson
      24
      94
    
    
      2
      Nighthawks
      2nd
      Ali
      31
      57
    
    
      3
      Nighthawks
      2nd
      Milner
      2
      62
    
    
      4
      Dragoons
      1st
      Cooze
      3
      70
    
    
      5
      Dragoons
      1st
      Jacon
      4
      25
    
    
      6
      Dragoons
      2nd
      Ryaner
      24
      94
    
    
      7
      Dragoons
      2nd
      Sone
      31
      57
    
    
      8
      Scouts
      1st
      Sloan
      2
      62
    
    
      9
      Scouts
      1st
      Piger
      3
      70
    
    
      10
      Scouts
      2nd
      Riani
      2
      62
    
    
      11
      Scouts
      2nd
      Ali
      3
      70



In [3]:

    
# Set the hierarchical index but leave the columns inplace
df.set_index(['regiment', 'company'], drop=False)
df









    Out[3]:







  
    
      
      regiment
      company
      name
      preTestScore
      postTestScore
    
  
  
    
      0
      Nighthawks
      1st
      Miller
      4
      25
    
    
      1
      Nighthawks
      1st
      Jacobson
      24
      94
    
    
      2
      Nighthawks
      2nd
      Ali
      31
      57
    
    
      3
      Nighthawks
      2nd
      Milner
      2
      62
    
    
      4
      Dragoons
      1st
      Cooze
      3
      70
    
    
      5
      Dragoons
      1st
      Jacon
      4
      25
    
    
      6
      Dragoons
      2nd
      Ryaner
      24
      94
    
    
      7
      Dragoons
      2nd
      Sone
      31
      57
    
    
      8
      Scouts
      1st
      Sloan
      2
      62
    
    
      9
      Scouts
      1st
      Piger
      3
      70
    
    
      10
      Scouts
      2nd
      Riani
      2
      62
    
    
      11
      Scouts
      2nd
      Ali
      3
      70



In [4]:

    
# Set the hierarchical index to be by regiment, and then by company
df = df.set_index(['regiment', 'company'])
df









    Out[4]:







  
    
      
      
      name
      preTestScore
      postTestScore
    
    
      regiment
      company
      
      
      
    
  
  
    
      Nighthawks
      1st
      Miller
      4
      25
    
    
      1st
      Jacobson
      24
      94
    
    
      2nd
      Ali
      31
      57
    
    
      2nd
      Milner
      2
      62
    
    
      Dragoons
      1st
      Cooze
      3
      70
    
    
      1st
      Jacon
      4
      25
    
    
      2nd
      Ryaner
      24
      94
    
    
      2nd
      Sone
      31
      57
    
    
      Scouts
      1st
      Sloan
      2
      62
    
    
      1st
      Piger
      3
      70
    
    
      2nd
      Riani
      2
      62
    
    
      2nd
      Ali
      3
      70



In [5]:

    
# View the index
df.index









    Out[5]:





MultiIndex(levels=[['Dragoons', 'Nighthawks', 'Scouts'], ['1st', '2nd']],
           labels=[[1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2], [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]],
           names=['regiment', 'company'])



In [6]:

    
# Swap the levels in the index
df.swaplevel('regiment', 'company')









    Out[6]:







  
    
      
      
      name
      preTestScore
      postTestScore
    
    
      company
      regiment
      
      
      
    
  
  
    
      1st
      Nighthawks
      Miller
      4
      25
    
    
      Nighthawks
      Jacobson
      24
      94
    
    
      2nd
      Nighthawks
      Ali
      31
      57
    
    
      Nighthawks
      Milner
      2
      62
    
    
      1st
      Dragoons
      Cooze
      3
      70
    
    
      Dragoons
      Jacon
      4
      25
    
    
      2nd
      Dragoons
      Ryaner
      24
      94
    
    
      Dragoons
      Sone
      31
      57
    
    
      1st
      Scouts
      Sloan
      2
      62
    
    
      Scouts
      Piger
      3
      70
    
    
      2nd
      Scouts
      Riani
      2
      62
    
    
      Scouts
      Ali
      3
      70



In [7]:

    
# Summarize the results by regiment
df.sum(level='regiment')









    Out[7]:







  
    
      
      preTestScore
      postTestScore
    
    
      regiment
      
      
    
  
  
    
      Dragoons
      62
      246
    
    
      Nighthawks
      61
      238
    
    
      Scouts
      10
      264

	regiment	company	name	preTestScore	postTestScore
0	Nighthawks	1st	Miller	4	25
1	Nighthawks	1st	Jacobson	24	94
2	Nighthawks	2nd	Ali	31	57
3	Nighthawks	2nd	Milner	2	62
4	Dragoons	1st	Cooze	3	70
5	Dragoons	1st	Jacon	4	25
6	Dragoons	2nd	Ryaner	24	94
7	Dragoons	2nd	Sone	31	57
8	Scouts	1st	Sloan	2	62
9	Scouts	1st	Piger	3	70
10	Scouts	2nd	Riani	2	62
11	Scouts	2nd	Ali	3	70

		name	preTestScore	postTestScore
regiment	company
Nighthawks	1st	Miller	4	25
	1st	Jacobson	24	94
	2nd	Ali	31	57
	2nd	Milner	2	62
Dragoons	1st	Cooze	3	70
	1st	Jacon	4	25
	2nd	Ryaner	24	94
	2nd	Sone	31	57
Scouts	1st	Sloan	2	62
	1st	Piger	3	70
	2nd	Riani	2	62
	2nd	Ali	3	70