In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:
# Seed the global NumPy RNG so the random columns below (and every result
# computed from them) are identical on each fresh run of the notebook.
np.random.seed(0)

# Two label columns (k1, k2) to group on, plus two columns of standard-normal
# noise to aggregate.
df = DataFrame({'k1':['X','X','Y','Y','Z'],
                'k2':['alpha','beta','alpha','beta','alpha'],
                'dataset1':np.random.randn(5),
                'dataset2':np.random.randn(5)})
df


Out[2]:
dataset1 dataset2 k1 k2
0 0.932585 0.538335 X alpha
1 -0.862080 1.898216 X beta
2 -1.431315 -0.544031 Y alpha
3 1.495488 -2.993656 Y beta
4 -0.281585 0.169025 Z alpha

In [3]:
# Split the 'dataset1' column by the labels in 'k1'. The result is a
# SeriesGroupBy object; no aggregation has been computed yet.
group1 = df['dataset1'].groupby(by=df['k1'])

group1


Out[3]:
<pandas.core.groupby.SeriesGroupBy object at 0x7fc69cfb6ed0>

In [4]:
# Reduce each k1 group of dataset1 to its mean.
group1.mean()


Out[4]:
k1
X    0.035253
Y    0.032087
Z   -0.281585
Name: dataset1, dtype: float64

In [5]:
# Group keys do not have to be columns of df: an array holding one label per
# row also works. This NumPy array is passed to groupby in a later cell.
cities = np.array(['NY','LA','LA','NY','NY'])

In [6]:
# Second external key array: one month label for each of the five rows.
month = np.array(['JAN','FEB','JAN','FEB','JAN'])

In [7]:
# Group dataset1 by both external arrays together and average each
# (city, month) combination.
(
    df['dataset1']
    .groupby([cities, month])
    .mean()
)


Out[7]:
LA  FEB   -0.862080
    JAN   -1.431315
NY  FEB    1.495488
    JAN    0.325500
Name: dataset1, dtype: float64

In [9]:
# Pass a column name as the grouping key. The numeric columns are selected
# explicitly before aggregating: taking the mean of the string column 'k2'
# raises a TypeError on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently.
df.groupby('k1')[['dataset1', 'dataset2']].mean()


Out[9]:
dataset1 dataset2
k1
X 0.035253 1.218276
Y 0.032087 -1.768843
Z -0.281585 0.169025

In [10]:
# Display the full frame again for reference.
df


Out[10]:
dataset1 dataset2 k1 k2
0 0.932585 0.538335 X alpha
1 -0.862080 1.898216 X beta
2 -1.431315 -0.544031 Y alpha
3 1.495488 -2.993656 Y beta
4 -0.281585 0.169025 Z alpha

In [11]:
# Group on several columns at once; the result is indexed by (k1, k2) pairs.
df.groupby(by=['k1', 'k2']).mean()


Out[11]:
dataset1 dataset2
k1 k2
X alpha 0.932585 0.538335
beta -0.862080 1.898216
Y alpha -1.431315 -0.544031
beta 1.495488 -2.993656
Z alpha -0.281585 0.169025

In [12]:
# Count the rows in each k1 group.
df.groupby('k1').size()


Out[12]:
k1
X    2
Y    2
Z    1
dtype: int64

In [13]:
# Count the rows in each k2 group.
df.groupby('k2').size()


Out[13]:
k2
alpha    3
beta     2
dtype: int64

In [21]:
# Iterating over a GroupBy yields (group label, sub-frame) pairs.
for label, subframe in df.groupby('k1'):
    print('This is the %s group' % label)
    print(subframe)
    # print('\n') emits two newlines, leaving a blank gap between groups.
    print('\n')


This is the X group
   dataset1  dataset2 k1     k2
0  0.932585  0.538335  X  alpha
1 -0.862080  1.898216  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2 -1.431315 -0.544031  Y  alpha
3  1.495488 -2.993656  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -0.281585  0.169025  Z  alpha



In [24]:
# With several grouping columns each key is a (k1, k2) tuple, which can be
# fed to the two-placeholder format string as-is.
for key_pair, subframe in df.groupby(['k1', 'k2']):
    print('Key1 = %s , key2 = %s' % key_pair)
    print(subframe)
    print('\n')


Key1 = X , key2 = alpha
   dataset1  dataset2 k1     k2
0  0.932585  0.538335  X  alpha


Key1 = X , key2 = beta
   dataset1  dataset2 k1    k2
1  -0.86208  1.898216  X  beta


Key1 = Y , key2 = alpha
   dataset1  dataset2 k1     k2
2 -1.431315 -0.544031  Y  alpha


Key1 = Y , key2 = beta
   dataset1  dataset2 k1    k2
3  1.495488 -2.993656  Y  beta


Key1 = Z , key2 = alpha
   dataset1  dataset2 k1     k2
4 -0.281585  0.169025  Z  alpha



In [28]:
# Collect the groups into a plain dict mapping each k1 label to its
# sub-frame, built from the (label, sub-frame) pairs the GroupBy yields.
group_dict = {label: subframe for label, subframe in df.groupby('k1')}

group_dict


Out[28]:
{'X':    dataset1  dataset2 k1     k2
 0  0.932585  0.538335  X  alpha
 1 -0.862080  1.898216  X   beta, 'Y':    dataset1  dataset2 k1     k2
 2 -1.431315 -0.544031  Y  alpha
 3  1.495488 -2.993656  Y   beta, 'Z':    dataset1  dataset2 k1     k2
 4 -0.281585  0.169025  Z  alpha}

In [27]:
# Look up the sub-frame stored under the 'X' label.
group_dict['X']


Out[27]:
dataset1 dataset2 k1 k2
0 0.932585 0.538335 X alpha
1 -0.862080 1.898216 X beta

In [29]:
# Look up the sub-frame stored under the 'Y' label.
group_dict['Y']


Out[29]:
dataset1 dataset2 k1 k2
2 -1.431315 -0.544031 Y alpha
3 1.495488 -2.993656 Y beta

In [35]:
# Separate the columns by dtype: one sub-frame per distinct column dtype.
# The mapping is built from df.dtypes directly instead of
# groupby(df.dtypes, axis=1), because the axis=1 form of groupby is
# deprecated since pandas 2.1.
group_dict_axis1 = {
    dtype: df.loc[:, df.dtypes == dtype]  # boolean mask over the columns
    for dtype in df.dtypes.unique()
}

In [32]:
# Display the dtype -> sub-frame mapping.
group_dict_axis1


Out[32]:
{dtype('float64'):    dataset1  dataset2
 0  0.932585  0.538335
 1 -0.862080  1.898216
 2 -1.431315 -0.544031
 3  1.495488 -2.993656
 4 -0.281585  0.169025, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [36]:
# Select columns on the grouped object before aggregating, so that only
# 'dataset2' is averaged. The list form [['dataset2']] is used for the
# selection; the displayed result is a table with a 'dataset2' column.
dataset2_group = df.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()


Out[36]:
dataset2
k1 k2
X alpha 0.538335
beta 1.898216
Y alpha -0.544031
beta -2.993656
Z alpha 0.169025

In [ ]: