In [1]:
#### Introduction to Data Wrangling with Pandas ####
## Page 5 ##
In [2]:
#### Split-apply-combine operations. An overview ####
In [3]:
# """
# Split: Divide the data into groups
# Apply: Operate on each group separately and independently
# Combine: Merge the per-group results back into a single data structure
# """
In [5]:
# Types of operations
# """
# Aggregation / Reduction: Return a single aggregated result for each whole group.
# Transformation: Return same number of rows but with transformed data values.
# Filtration: Discard a group or values in a group if the condition evaluates to false
# """
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#so that we can view the graphs inside the notebook
In [2]:
# Small example frame: two string key columns (A, B) and two integer
# value columns (C, D), eight rows in total.
df = pd.DataFrame(
    {
        'A': ['first', 'second'] * 4,
        'B': ['a', 'b', 'a', 'c', 'c', 'a', 'b', 'c'],
        'C': list(range(1, 9)),
        'D': list(range(2, 17, 2)),
    }
)
In [3]:
# Display the example frame (8 rows, columns A-D).
df
Out[3]:
In [5]:
# Split step: returns a DataFrameGroupBy object keyed on column 'B';
# no aggregation has been applied yet, so only the object repr is shown.
df.groupby('B')
Out[5]:
In [6]:
# Mapping from each distinct value of 'B' to the row labels that fall in that group.
df.groupby('B').groups
Out[6]:
In [7]:
# Pull out a single group: only the rows whose 'B' value is 'b'.
df.groupby('B').get_group('b')
Out[7]:
In [8]:
# Aggregation (apply + combine): sum every numeric column within each 'B' group.
# numeric_only=True keeps string column 'A' out of the result; without it the
# output depends on the pandas version (pandas >= 2.0 concatenates the strings
# per group, older releases dropped the column).
df.groupby('B').sum(numeric_only=True)
Out[8]:
In [9]:
# Aggregation: mean of every numeric column within each 'B' group.
# numeric_only=True is required because 'A' holds strings: pandas >= 2.0 no
# longer drops non-numeric columns silently and raises TypeError instead.
df.groupby('B').mean(numeric_only=True)
Out[9]:
In [11]:
## From the docs
## From the docs
# Build a frame with a two-level row index: outer level 'first',
# inner level 'second'.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two'] * 4,
]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df_temp = pd.DataFrame(
    {
        'A': [1, 1, 1, 1, 2, 2, 3, 3],
        'B': np.arange(8),
        'C': np.arange(8),
    },
    index=index,
)
df_temp
Out[11]:
In [12]:
# Group on an index level instead of a column. `level` takes the level's
# position (0, 1, ...) or, equivalently, its name.
(
    df_temp
    .groupby([pd.Grouper(level=0)])
    .sum()
    .add_prefix('total_')
)
Out[12]:
In [14]:
# Same grouping as by position 0, this time addressing the index level by
# its name, 'first'.
(
    df_temp
    .groupby([pd.Grouper(level='first')])
    .sum()
    .add_prefix('total_')
)
Out[14]:
In [15]:
# Show the original example frame again before the aggregate/transform examples.
df
Out[15]:
In [16]:
# Group once on 'A' and reuse the GroupBy object for several aggregations.
grouped = df.groupby('A')
# Apply several aggregations at once; the result has one (column, function)
# pair per output column.
# - Only the numeric columns are selected: mean/median are undefined for the
#   strings in 'B' and raise TypeError on pandas >= 2.0.
# - Aggregators are given by name: passing np.mean / np.median emits a
#   FutureWarning on pandas >= 2.1, and the names yield the same column labels.
grouped[['C', 'D']].aggregate(['mean', 'median'])
Out[16]:
In [17]:
# Per-column aggregation: total of 'C' and standard deviation of 'D' per group.
# String names are used instead of np.sum / np.std: pandas currently swaps those
# callables for its own group methods (so 'std' is the sample std, ddof=1) and
# warns on pandas >= 2.1 that this substitution will go away. The names keep
# today's result.
grouped.aggregate({'C': 'sum', 'D': 'std'})
Out[17]:
In [18]:
# Transformation: the function is applied to each column within every 'A'
# group and the pieces are reassembled in the original row order.
# Here every value is simply multiplied by 2.
transformed = df.groupby('A').transform(lambda col: col * 2)
In [19]:
# Numeric columns are doubled; multiplying a string by 2 repeats it,
# so 'a' in column 'B' becomes 'aa'.
transformed
Out[19]:
In [20]:
# First two rows of the example frame.
df.head(2)
Out[20]:
In [22]:
# Last two rows of the example frame.
df.tail(2)
Out[22]: