In [1]:
#### Introduction to Data Wrangling with Pandas ####
## Page 5 ##

In [2]:
#### Split-apply-combine operations. An overview ####

In [3]:
# """
# Split: Divide the data into groups
# Apply: Operate on each group separately and independently
# Combine: Combine the result of split-apply
# """

In [5]:
# Types of operations
# """
# Aggregation/ Redcuction: Return an aggregated result for the whole group.
# Transformation: Return same number of rows but with transformed data values.
# Filtration: Discard a group or values in a group if the condition evaluates to false
# """

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
#so that we can view the graphs inside the notebook

In [2]:
df = pd.DataFrame({'A' : ['first', 'second', 'first', 'second','first', 'second', 'first', 'second'],
                   'B' : ['a', 'b', 'a', 'c','c', 'a', 'b', 'c'],
                   'C' : [1,2,3,4,5,6,7,8],
                   'D' : [2,4,6,8,10,12,14,16]})

In [3]:
df


Out[3]:
A B C D
0 first a 1 2
1 second b 2 4
2 first a 3 6
3 second c 4 8
4 first c 5 10
5 second a 6 12
6 first b 7 14
7 second c 8 16

In [5]:
df.groupby('B')


Out[5]:
<pandas.core.groupby.DataFrameGroupBy object at 0x7ff449d64710>

In [6]:
df.groupby('B').groups


Out[6]:
{'a': Int64Index([0, 2, 5], dtype='int64'),
 'b': Int64Index([1, 6], dtype='int64'),
 'c': Int64Index([3, 4, 7], dtype='int64')}

In [7]:
df.groupby('B').get_group('b')


Out[7]:
A B C D
1 second b 2 4
6 first b 7 14

In [8]:
df.groupby('B').sum() #aggregate apply function


Out[8]:
C D
B
a 10 20
b 9 18
c 17 34

In [9]:
df.groupby('B').mean()


Out[9]:
C D
B
a 3.333333 6.666667
b 4.500000 9.000000
c 5.666667 11.333333

In [11]:
## From the docs
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], 
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df_temp = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3],
                   'B': np.arange(8), 'C': np.arange(8)},
                    index=index)
df_temp


Out[11]:
A B C
first second
bar one 1 0 0
two 1 1 1
baz one 1 2 2
two 1 3 3
foo one 2 4 4
two 2 5 5
qux one 3 6 6
two 3 7 7

In [12]:
df_temp.groupby([pd.Grouper(level=0)]).sum().add_prefix('total_') #0,1 etc for index level, can use the name as well,


Out[12]:
total_A total_B total_C
first
bar 2 1 1
baz 2 5 5
foo 4 9 9
qux 6 13 13

In [14]:
df_temp.groupby([pd.Grouper(level='first')]).sum().add_prefix('total_')


Out[14]:
total_A total_B total_C
first
bar 2 1 1
baz 2 5 5
foo 4 9 9
qux 6 13 13

In [15]:
df


Out[15]:
A B C D
0 first a 1 2
1 second b 2 4
2 first a 3 6
3 second c 4 8
4 first c 5 10
5 second a 6 12
6 first b 7 14
7 second c 8 16

In [16]:
grouped = df.groupby('A')
grouped.aggregate([np.mean, np.median])


Out[16]:
C D
mean median mean median
A
first 4 4 8 8
second 5 5 10 10

In [17]:
grouped.aggregate({'C' : np.sum, 'D' : np.std})


Out[17]:
C D
A
first 16 5.163978
second 20 5.163978

In [18]:
transformed = df.groupby('A').transform(lambda x: x*2) #normalize

In [19]:
transformed #str=a, then str*2 ==aa


Out[19]:
B C D
0 aa 2 4
1 bb 4 8
2 aa 6 12
3 cc 8 16
4 cc 10 20
5 aa 12 24
6 bb 14 28
7 cc 16 32

In [20]:
df.head(2)


Out[20]:
A B C D
0 first a 1 2
1 second b 2 4

In [22]:
df.tail(2)


Out[22]:
A B C D
6 first b 7 14
7 second c 8 16