notebook.community

Edit and run



In [1]:

    
#### Introduction to Data Wrangling with Pandas ####
## Page 5 ##



In [2]:

    
#### Split-apply-combine operations. An overview ####



In [3]:

    
# """
# Split: Divide the data into groups
# Apply: Operate on each group separately and independently
# Combine: Combine the result of split-apply
# """



In [5]:

    
# Types of operations
# """
# Aggregation/ Redcuction: Return an aggregated result for the whole group.
# Transformation: Return same number of rows but with transformed data values.
# Filtration: Discard a group or values in a group if the condition evaluates to false
# """



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
#so that we can view the graphs inside the notebook



In [2]:

    
df = pd.DataFrame({'A' : ['first', 'second', 'first', 'second','first', 'second', 'first', 'second'],
                   'B' : ['a', 'b', 'a', 'c','c', 'a', 'b', 'c'],
                   'C' : [1,2,3,4,5,6,7,8],
                   'D' : [2,4,6,8,10,12,14,16]})



In [3]:

    
df



In [5]:

    
df.groupby('B')









    Out[5]:





<pandas.core.groupby.DataFrameGroupBy object at 0x7ff449d64710>



In [6]:

    
df.groupby('B').groups









    Out[6]:





{'a': Int64Index([0, 2, 5], dtype='int64'),
 'b': Int64Index([1, 6], dtype='int64'),
 'c': Int64Index([3, 4, 7], dtype='int64')}



In [7]:

    
df.groupby('B').get_group('b')



In [8]:

    
df.groupby('B').sum() #aggregate apply function



In [9]:

    
df.groupby('B').mean()



In [11]:

    
## From the docs
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], 
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df_temp = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3],
                   'B': np.arange(8), 'C': np.arange(8)},
                    index=index)
df_temp



In [12]:

    
df_temp.groupby([pd.Grouper(level=0)]).sum().add_prefix('total_') #0,1 etc for index level, can use the name as well,



In [14]:

    
df_temp.groupby([pd.Grouper(level='first')]).sum().add_prefix('total_')



In [15]:

    
df



In [16]:

    
grouped = df.groupby('A')
grouped.aggregate([np.mean, np.median])



In [17]:

    
grouped.aggregate({'C' : np.sum, 'D' : np.std})



In [18]:

    
transformed = df.groupby('A').transform(lambda x: x*2) #normalize



In [19]:

    
transformed #str=a, then str*2 ==aa



In [20]:

    
df.head(2)



In [22]:

    
df.tail(2)

	C	D
B
a	3.333333	6.666667
b	4.500000	9.000000
c	5.666667	11.333333

	A	B	C	D
0	first	a	1	2
1	second	b	2	4
2	first	a	3	6
3	second	c	4	8
4	first	c	5	10
5	second	a	6	12
6	first	b	7	14
7	second	c	8	16