Chapter_09_part_01



In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

GroupBy mechanics


In [7]:
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df


Out[7]:
data1 data2 key1 key2
0 0.950840 -0.295795 a one
1 0.173257 1.716127 a two
2 -1.673077 1.750187 b one
3 -0.737930 -0.825913 b two
4 0.700727 -0.838416 a one

In [3]:
grouped = df['data1'].groupby(df['key1'])
grouped


Out[3]:
<pandas.core.groupby.SeriesGroupBy object at 0x11136f358>

In [4]:
grouped.mean()


Out[4]:
key1
a    0.467434
b    0.476689
Name: data1, dtype: float64

In [6]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means


Out[6]:
key1  key2
a     one     0.023136
      two     1.356030
b     one     0.885590
      two     0.067788
Name: data1, dtype: float64

In [7]:
means.unstack()


Out[7]:
key2 one two
key1
a 0.023136 1.356030
b 0.885590 0.067788

In [8]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()


Out[8]:
California  2005    1.356030
            2006    0.885590
Ohio        2005    0.045152
            2006    0.023755
Name: data1, dtype: float64

In [9]:
df.groupby('key1').mean()


Out[9]:
data1 data2
key1
a 0.467434 0.113715
b 0.476689 0.052054

In [10]:
df.groupby(['key1', 'key2']).mean()


Out[10]:
data1 data2
key1 key2
a one 0.023136 -0.063732
two 1.356030 0.468610
b one 0.885590 0.125019
two 0.067788 -0.020910

In [11]:
df.groupby(['key1', 'key2']).size()


Out[11]:
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

Iterating over groups


In [12]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)


a
      data1     data2 key1 key2
0  0.022517 -0.103368    a  one
1  1.356030  0.468610    a  two
4  0.023755 -0.024096    a  one
b
      data1     data2 key1 key2
2  0.885590  0.125019    b  one
3  0.067788 -0.020910    b  two

In [13]:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)


('a', 'one')
      data1     data2 key1 key2
0  0.022517 -0.103368    a  one
4  0.023755 -0.024096    a  one
('a', 'two')
     data1    data2 key1 key2
1  1.35603  0.46861    a  two
('b', 'one')
     data1     data2 key1 key2
2  0.88559  0.125019    b  one
('b', 'two')
      data1    data2 key1 key2
3  0.067788 -0.02091    b  two

In [14]:
pieces = dict(list(df.groupby('key1')))
pieces['b']


Out[14]:
data1 data2 key1 key2
2 0.885590 0.125019 b one
3 0.067788 -0.020910 b two

In [15]:
df.dtypes


Out[15]:
data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [16]:
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))


Out[16]:
{dtype('float64'):       data1     data2
 0  0.022517 -0.103368
 1  1.356030  0.468610
 2  0.885590  0.125019
 3  0.067788 -0.020910
 4  0.023755 -0.024096, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

Selecting a column or subset of columns


In [17]:
df.groupby(['key1', 'key2'])[['data2']].mean()


Out[17]:
data2
key1 key2
a one -0.063732
two 0.468610
b one 0.125019
two -0.020910

In [18]:
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped


Out[18]:
<pandas.core.groupby.SeriesGroupBy object at 0x111682160>

In [19]:
s_grouped.mean()


Out[19]:
key1  key2
a     one    -0.063732
      two     0.468610
b     one     0.125019
      two    -0.020910
Name: data2, dtype: float64

Grouping with dicts and Series


In [2]:
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.ix[2:3, ['b', 'c']] = np.nan # Add a few NA values
people


Out[2]:
a b c d e
Joe 0.416326 0.411755 -0.020388 -0.580486 -0.237133
Steve -0.516179 1.254876 -0.426501 -0.944397 0.503768
Wes -0.183956 NaN NaN -0.986383 -0.509479
Jim -0.393597 -0.058779 -1.569126 1.195260 -1.009167
Travis -0.535035 0.693127 0.001421 -0.389958 0.709623

In [21]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [22]:
by_column = people.groupby(mapping, axis=1)
by_column.sum()


Out[22]:
blue red
Joe -0.769452 1.745117
Steve -1.684565 -1.416331
Wes -0.383247 0.016724
Jim -0.655258 1.209242
Travis 1.199529 -3.453057

In [23]:
map_series = Series(mapping)
map_series


Out[23]:
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [24]:
people.groupby(map_series, axis=1).count()


Out[24]:
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3

Grouping with functions


In [3]:
people.groupby(len).sum()


Out[3]:
a b c d e
3 -0.161227 0.352977 -1.589514 -0.371608 -1.755780
5 -0.516179 1.254876 -0.426501 -0.944397 0.503768
6 -0.535035 0.693127 0.001421 -0.389958 0.709623

In [4]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()


Out[4]:
a b c d e
3 one -0.183956 0.411755 -0.020388 -0.986383 -0.509479
two -0.393597 -0.058779 -1.569126 1.195260 -1.009167
5 one -0.516179 1.254876 -0.426501 -0.944397 0.503768
6 two -0.535035 0.693127 0.001421 -0.389958 0.709623

Grouping by index levels


In [5]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df


Out[5]:
cty US JP
tenor 1 3 5 1 3
0 0.703802 -0.208907 0.201098 0.711903 -0.660690
1 -1.494946 -2.383929 0.076390 -0.057416 1.432268
2 0.070857 0.212368 2.085132 2.913021 -0.005329
3 1.036175 -0.044281 -0.961118 -0.995575 -0.807160

In [6]:
hier_df.groupby(level='cty', axis=1).count()


Out[6]:
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3

Data aggregation


In [8]:
df


Out[8]:
data1 data2 key1 key2
0 0.950840 -0.295795 a one
1 0.173257 1.716127 a two
2 -1.673077 1.750187 b one
3 -0.737930 -0.825913 b two
4 0.700727 -0.838416 a one

In [9]:
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)


Out[9]:
key1
a    0.900818
b   -0.831444
Name: data1, dtype: float64

In [10]:
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)


Out[10]:
data1 data2
key1
a 0.777583 2.554543
b 0.935147 2.576100

In [11]:
grouped.describe()


Out[11]:
data1 data2
key1
a count 3.000000 3.000000
mean 0.608275 0.193972
std 0.396950 1.345855
min 0.173257 -0.838416
25% 0.436992 -0.567105
50% 0.700727 -0.295795
75% 0.825784 0.710166
max 0.950840 1.716127
b count 2.000000 2.000000
mean -1.205503 0.462137
std 0.661249 1.821578
min -1.673077 -0.825913
25% -1.439290 -0.181888
50% -1.205503 0.462137
75% -0.971716 1.106162
max -0.737930 1.750187

In [13]:
tips = pd.read_csv('tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]


Out[13]:
total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female No Sun Dinner 2 0.059447
1 10.34 1.66 Male No Sun Dinner 3 0.160542
2 21.01 3.50 Male No Sun Dinner 3 0.166587
3 23.68 3.31 Male No Sun Dinner 2 0.139780
4 24.59 3.61 Female No Sun Dinner 4 0.146808
5 25.29 4.71 Male No Sun Dinner 4 0.186240

Column-wise and multiple function application


In [14]:
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')


Out[14]:
sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64

In [15]:
grouped_pct.agg(['mean', 'std', peak_to_peak])


Out[15]:
mean std peak_to_peak
sex smoker
Female No 0.156921 0.036421 0.195876
Yes 0.182150 0.071595 0.360233
Male No 0.160669 0.041849 0.220186
Yes 0.152771 0.090588 0.674707

In [16]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])


Out[16]:
foo bar
sex smoker
Female No 0.156921 0.036421
Yes 0.182150 0.071595
Male No 0.160669 0.041849
Yes 0.152771 0.090588

In [17]:
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result


Out[17]:
tip_pct total_bill
count mean max count mean max
sex smoker
Female No 54 0.156921 0.252672 54 18.105185 35.83
Yes 33 0.182150 0.416667 33 17.977879 44.30
Male No 97 0.160669 0.291990 97 19.791237 48.33
Yes 60 0.152771 0.710345 60 22.284500 50.81

In [18]:
result['tip_pct']


Out[18]:
count mean max
sex smoker
Female No 54 0.156921 0.252672
Yes 33 0.182150 0.416667
Male No 97 0.160669 0.291990
Yes 60 0.152771 0.710345

In [19]:
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)


Out[19]:
tip_pct total_bill
Durchschnitt Abweichung Durchschnitt Abweichung
sex smoker
Female No 0.156921 0.001327 18.105185 53.092422
Yes 0.182150 0.005126 17.977879 84.451517
Male No 0.160669 0.001751 19.791237 76.152961
Yes 0.152771 0.008206 22.284500 98.244673

In [20]:
grouped.agg({'tip' : np.max, 'size' : 'sum'})


Out[20]:
tip size
sex smoker
Female No 5.2 140
Yes 6.5 74
Male No 9.0 263
Yes 10.0 150

In [21]:
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
             'size' : 'sum'})


Out[21]:
tip_pct size
min max mean std sum
sex smoker
Female No 0.056797 0.252672 0.156921 0.036421 140
Yes 0.056433 0.416667 0.182150 0.071595 74
Male No 0.071804 0.291990 0.160669 0.041849 263
Yes 0.035638 0.710345 0.152771 0.090588 150

Returning aggregated data in "unindexed" form


In [22]:
tips.groupby(['sex', 'smoker'], as_index=False).mean()


Out[22]:
sex smoker total_bill tip size tip_pct
0 Female No 18.105185 2.773519 2.592593 0.156921
1 Female Yes 17.977879 2.931515 2.242424 0.182150
2 Male No 19.791237 3.113402 2.711340 0.160669
3 Male Yes 22.284500 3.051167 2.500000 0.152771

Group-wise operations and transformations


In [23]:
df


Out[23]:
data1 data2 key1 key2
0 0.950840 -0.295795 a one
1 0.173257 1.716127 a two
2 -1.673077 1.750187 b one
3 -0.737930 -0.825913 b two
4 0.700727 -0.838416 a one

In [24]:
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means


Out[24]:
mean_data1 mean_data2
key1
a 0.608275 0.193972
b -1.205503 0.462137

In [25]:
pd.merge(df, k1_means, left_on='key1', right_index=True)


Out[25]:
data1 data2 key1 key2 mean_data1 mean_data2
0 0.950840 -0.295795 a one 0.608275 0.193972
1 0.173257 1.716127 a two 0.608275 0.193972
4 0.700727 -0.838416 a one 0.608275 0.193972
2 -1.673077 1.750187 b one -1.205503 0.462137
3 -0.737930 -0.825913 b two -1.205503 0.462137

In [26]:
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()


Out[26]:
a b c d e
one -0.100888 0.552441 -0.009484 -0.652276 -0.01233
two -0.454888 0.598049 -0.997814 0.125431 -0.25270

In [27]:
people.groupby(key).transform(np.mean)


Out[27]:
a b c d e
Joe -0.100888 0.552441 -0.009484 -0.652276 -0.01233
Steve -0.454888 0.598049 -0.997814 0.125431 -0.25270
Wes -0.100888 0.552441 -0.009484 -0.652276 -0.01233
Jim -0.454888 0.598049 -0.997814 0.125431 -0.25270
Travis -0.100888 0.552441 -0.009484 -0.652276 -0.01233

In [28]:
def demean(arr):
    return arr - arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned


Out[28]:
a b c d e
Joe 0.517214 -0.140686 -0.010905 0.071790 -0.224803
Steve -0.061291 0.656827 0.571312 -1.069829 0.756467
Wes -0.083067 NaN NaN -0.334107 -0.497149
Jim 0.061291 -0.656827 -0.571312 1.069829 -0.756467
Travis -0.434147 0.140686 0.010905 0.262317 0.721953

In [29]:
demeaned.groupby(key).mean()


Out[29]:
a b c d e
one 1.850372e-17 5.551115e-17 0.000000e+00 -7.401487e-17 0.0
two 2.775558e-17 5.551115e-17 5.551115e-17 0.000000e+00 0.0

Apply: General split-apply-combine


In [31]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips, n=6)


Out[31]:
total_bill tip sex smoker day time size tip_pct
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
232 11.61 3.39 Male No Sat Dinner 2 0.291990
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345

In [32]:
tips.groupby('smoker').apply(top)


Out[32]:
total_bill tip sex smoker day time size tip_pct
smoker
No 88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345

In [33]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')


Out[33]:
total_bill tip sex smoker day time size tip_pct
smoker day
No Fri 94 22.75 3.25 Female No Fri Dinner 2 0.142857
Sat 212 48.33 9.00 Male No Sat Dinner 4 0.186220
Sun 156 48.17 5.00 Male No Sun Dinner 6 0.103799
Thur 142 41.19 5.00 Male No Thur Lunch 5 0.121389
Yes Fri 95 40.17 4.73 Male Yes Fri Dinner 4 0.117750
Sat 170 50.81 10.00 Male Yes Sat Dinner 3 0.196812
Sun 182 45.35 3.50 Male Yes Sun Dinner 3 0.077178
Thur 197 43.11 5.00 Female Yes Thur Lunch 4 0.115982

In [34]:
result = tips.groupby('smoker')['tip_pct'].describe()
result


Out[34]:
smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64

In [35]:
result.unstack('smoker')


Out[35]:
smoker No Yes
count 151.000000 93.000000
mean 0.159328 0.163196
std 0.039910 0.085119
min 0.056797 0.035638
25% 0.136906 0.106771
50% 0.155625 0.153846
75% 0.185014 0.195059
max 0.291990 0.710345

Suppressing the group keys


In [36]:
tips.groupby('smoker', group_keys=False).apply(top)


Out[36]:
total_bill tip sex smoker day time size tip_pct
88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345

In [ ]: