Title: Hierarchical Data In Pandas
Slug: pandas_hierarchical_data
Summary: Hierarchical Data In Pandas
Date: 2016-05-01 12:00
Category: Python
Tags: Data Wrangling
Authors: Chris Albon


In [1]:
# import modules
import pandas as pd

In [2]:
# Create dataframe
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], 
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], 
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], 
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df


Out[2]:
regiment company name preTestScore postTestScore
0 Nighthawks 1st Miller 4 25
1 Nighthawks 1st Jacobson 24 94
2 Nighthawks 2nd Ali 31 57
3 Nighthawks 2nd Milner 2 62
4 Dragoons 1st Cooze 3 70
5 Dragoons 1st Jacon 4 25
6 Dragoons 2nd Ryaner 24 94
7 Dragoons 2nd Sone 31 57
8 Scouts 1st Sloan 2 62
9 Scouts 1st Piger 3 70
10 Scouts 2nd Riani 2 62
11 Scouts 2nd Ali 3 70

In [3]:
# Set the hierarchical index but leave the columns inplace
df.set_index(['regiment', 'company'], drop=False)
df


Out[3]:
regiment company name preTestScore postTestScore
0 Nighthawks 1st Miller 4 25
1 Nighthawks 1st Jacobson 24 94
2 Nighthawks 2nd Ali 31 57
3 Nighthawks 2nd Milner 2 62
4 Dragoons 1st Cooze 3 70
5 Dragoons 1st Jacon 4 25
6 Dragoons 2nd Ryaner 24 94
7 Dragoons 2nd Sone 31 57
8 Scouts 1st Sloan 2 62
9 Scouts 1st Piger 3 70
10 Scouts 2nd Riani 2 62
11 Scouts 2nd Ali 3 70

In [4]:
# Set the hierarchical index to be by regiment, and then by company
df = df.set_index(['regiment', 'company'])
df


Out[4]:
name preTestScore postTestScore
regiment company
Nighthawks 1st Miller 4 25
1st Jacobson 24 94
2nd Ali 31 57
2nd Milner 2 62
Dragoons 1st Cooze 3 70
1st Jacon 4 25
2nd Ryaner 24 94
2nd Sone 31 57
Scouts 1st Sloan 2 62
1st Piger 3 70
2nd Riani 2 62
2nd Ali 3 70

In [5]:
# View the index
df.index


Out[5]:
MultiIndex(levels=[['Dragoons', 'Nighthawks', 'Scouts'], ['1st', '2nd']],
           labels=[[1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2], [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]],
           names=['regiment', 'company'])

In [6]:
# Swap the levels in the index
df.swaplevel('regiment', 'company')


Out[6]:
name preTestScore postTestScore
company regiment
1st Nighthawks Miller 4 25
Nighthawks Jacobson 24 94
2nd Nighthawks Ali 31 57
Nighthawks Milner 2 62
1st Dragoons Cooze 3 70
Dragoons Jacon 4 25
2nd Dragoons Ryaner 24 94
Dragoons Sone 31 57
1st Scouts Sloan 2 62
Scouts Piger 3 70
2nd Scouts Riani 2 62
Scouts Ali 3 70

In [7]:
# Summarize the results by regiment
df.sum(level='regiment')


Out[7]:
preTestScore postTestScore
regiment
Dragoons 62 246
Nighthawks 61 238
Scouts 10 264