In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import DataFrame, Series

In [2]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df


Out[2]:
cty US JP
tenor 1 3 5 1 3
0 0.026181 -0.343651 -1.839707 -0.239827 0.554371
1 -0.342152 1.696171 0.911705 -0.610586 0.795040
2 -0.229819 0.623547 -0.721125 1.040916 0.937521
3 0.023564 -0.023763 -0.024780 -0.815674 0.752994

In [3]:
hier_df.groupby(level='cty', axis=1).count()


Out[3]:
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3

In [29]:
tips = pd.read_csv('ch08/tips.csv')
# 添加"小费占总额百分比"的列
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]


Out[29]:
total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female No Sun Dinner 2 0.059447
1 10.34 1.66 Male No Sun Dinner 3 0.160542
2 21.01 3.50 Male No Sun Dinner 3 0.166587
3 23.68 3.31 Male No Sun Dinner 2 0.139780
4 24.59 3.61 Female No Sun Dinner 4 0.146808
5 25.29 4.71 Male No Sun Dinner 4 0.186240

In [9]:
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')


Out[9]:
sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64

In [10]:
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped_pct.agg(['mean', 'std', peak_to_peak])


Out[10]:
mean std peak_to_peak
sex smoker
Female No 0.156921 0.036421 0.195876
Yes 0.182150 0.071595 0.360233
Male No 0.160669 0.041849 0.220186
Yes 0.152771 0.090588 0.674707

In [11]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])


Out[11]:
foo bar
sex smoker
Female No 0.156921 0.036421
Yes 0.182150 0.071595
Male No 0.160669 0.041849
Yes 0.152771 0.090588

In [12]:
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result


Out[12]:
tip_pct total_bill
count mean max count mean max
sex smoker
Female No 54 0.156921 0.252672 54 18.105185 35.83
Yes 33 0.182150 0.416667 33 17.977879 44.30
Male No 97 0.160669 0.291990 97 19.791237 48.33
Yes 60 0.152771 0.710345 60 22.284500 50.81

In [13]:
result['tip_pct']


Out[13]:
count mean max
sex smoker
Female No 54 0.156921 0.252672
Yes 33 0.182150 0.416667
Male No 97 0.160669 0.291990
Yes 60 0.152771 0.710345

In [14]:
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)


Out[14]:
tip_pct total_bill
Durchschnitt Abweichung Durchschnitt Abweichung
sex smoker
Female No 0.156921 0.001327 18.105185 53.092422
Yes 0.182150 0.005126 17.977879 84.451517
Male No 0.160669 0.001751 19.791237 76.152961
Yes 0.152771 0.008206 22.284500 98.244673

In [15]:
grouped.agg({'tip': np.max, 'size' : 'sum'})


Out[15]:
tip size
sex smoker
Female No 5.2 140
Yes 6.5 74
Male No 9.0 263
Yes 10.0 150

In [16]:
grouped.agg({'tip_pct':['min', 'max', 'mean', 'std'],
            'size' : 'sum'})


Out[16]:
tip_pct size
min max mean std sum
sex smoker
Female No 0.056797 0.252672 0.156921 0.036421 140
Yes 0.056433 0.416667 0.182150 0.071595 74
Male No 0.071804 0.291990 0.160669 0.041849 263
Yes 0.035638 0.710345 0.152771 0.090588 150

In [17]:
tips.groupby(['sex', 'smoker'], as_index=False).mean()


Out[17]:
sex smoker total_bill tip size tip_pct
0 Female No 18.105185 2.773519 2.592593 0.156921
1 Female Yes 17.977879 2.931515 2.242424 0.182150
2 Male No 19.791237 3.113402 2.711340 0.160669
3 Male Yes 22.284500 3.051167 2.500000 0.152771

In [18]:
def top(df, n=5, column='tip_pct'):
    return df.sort_values(by=column)[-n:]

In [19]:
top(tips, n=6)


Out[19]:
total_bill tip sex smoker day time size tip_pct
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
232 11.61 3.39 Male No Sat Dinner 2 0.291990
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345

In [20]:
tips.groupby('smoker').apply(top)


Out[20]:
total_bill tip sex smoker day time size tip_pct
smoker
No 88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345

In [21]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')


Out[21]:
total_bill tip sex smoker day time size tip_pct
smoker day
No Fri 94 22.75 3.25 Female No Fri Dinner 2 0.142857
Sat 212 48.33 9.00 Male No Sat Dinner 4 0.186220
Sun 156 48.17 5.00 Male No Sun Dinner 6 0.103799
Thur 142 41.19 5.00 Male No Thur Lunch 5 0.121389
Yes Fri 95 40.17 4.73 Male Yes Fri Dinner 4 0.117750
Sat 170 50.81 10.00 Male Yes Sat Dinner 3 0.196812
Sun 182 45.35 3.50 Male Yes Sun Dinner 3 0.077178
Thur 197 43.11 5.00 Female Yes Thur Lunch 4 0.115982

In [22]:
result = tips.groupby('smoker')['tip_pct'].describe()
result


Out[22]:
smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64

In [23]:
result.unstack('smoker')


Out[23]:
smoker No Yes
count 151.000000 93.000000
mean 0.159328 0.163196
std 0.039910 0.085119
min 0.056797 0.035638
25% 0.136906 0.106771
50% 0.155625 0.153846
75% 0.185014 0.195059
max 0.291990 0.710345

In [24]:
f = lambda x: x.describe()
tips.groupby('smoker')['tip_pct'].apply(f).unstack('smoker')


Out[24]:
smoker No Yes
count 151.000000 93.000000
mean 0.159328 0.163196
std 0.039910 0.085119
min 0.056797 0.035638
25% 0.136906 0.106771
50% 0.155625 0.153846
75% 0.185014 0.195059
max 0.291990 0.710345

In [25]:
tips.groupby('smoker', group_keys=False).apply(top)


Out[25]:
total_bill tip sex smoker day time size tip_pct
88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345

In [30]:
tips.pivot_table(index=['sex', 'smoker'])


Out[30]:
size tip tip_pct total_bill
sex smoker
Female No 2.592593 2.773519 0.156921 18.105185
Yes 2.242424 2.931515 0.182150 17.977879
Male No 2.711340 3.113402 0.160669 19.791237
Yes 2.500000 3.051167 0.152771 22.284500

In [32]:
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                columns='smoker')


Out[32]:
tip_pct size
smoker No Yes No Yes
sex day
Female Fri 0.165296 0.209129 2.500000 2.000000
Sat 0.147993 0.163817 2.307692 2.200000
Sun 0.165710 0.237075 3.071429 2.500000
Thur 0.155971 0.163073 2.480000 2.428571
Male Fri 0.138005 0.144730 2.000000 2.125000
Sat 0.162132 0.139067 2.656250 2.629630
Sun 0.158291 0.173964 2.883721 2.600000
Thur 0.165706 0.164417 2.500000 2.300000

In [33]:
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                columns='smoker', margins=True)


Out[33]:
tip_pct size
smoker No Yes All No Yes All
sex day
Female Fri 0.165296 0.209129 0.199388 2.500000 2.000000 2.111111
Sat 0.147993 0.163817 0.156470 2.307692 2.200000 2.250000
Sun 0.165710 0.237075 0.181569 3.071429 2.500000 2.944444
Thur 0.155971 0.163073 0.157525 2.480000 2.428571 2.468750
Male Fri 0.138005 0.144730 0.143385 2.000000 2.125000 2.100000
Sat 0.162132 0.139067 0.151577 2.656250 2.629630 2.644068
Sun 0.158291 0.173964 0.162344 2.883721 2.600000 2.810345
Thur 0.165706 0.164417 0.165276 2.500000 2.300000 2.433333
All 0.159328 0.163196 0.160803 2.668874 2.408602 2.569672

In [34]:
tips.pivot_table('tip_pct', index=['sex', 'smoker'],
                columns='day', aggfunc=len, margins=True)


Out[34]:
day Fri Sat Sun Thur All
sex smoker
Female No 2.0 13.0 14.0 25.0 54.0
Yes 7.0 15.0 4.0 7.0 33.0
Male No 2.0 32.0 43.0 20.0 97.0
Yes 8.0 27.0 15.0 10.0 60.0
All 19.0 87.0 76.0 62.0 244.0

In [36]:
tips.pivot_table('size', index=['time', 'sex', 'smoker'],
                columns='day', aggfunc='sum', fill_value=0, margins=True)


Out[36]:
day Fri Sat Sun Thur All
time sex smoker
Dinner Female No 2.0 30.0 43.0 2.0 77.0
Yes 8.0 33.0 10.0 0.0 51.0
Male No 4.0 85.0 124.0 0.0 213.0
Yes 12.0 71.0 39.0 0.0 122.0
Lunch Female No 3.0 0.0 0.0 60.0 63.0
Yes 6.0 0.0 0.0 17.0 23.0
Male No 0.0 0.0 0.0 50.0 50.0
Yes 5.0 0.0 0.0 23.0 28.0
All 40.0 219.0 216.0 152.0 627.0

In [37]:
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)


Out[37]:
smoker No Yes All
time day
Dinner Fri 3 9 12
Sat 45 42 87
Sun 57 19 76
Thur 1 0 1
Lunch Fri 1 6 7
Thur 44 17 61
All 151 93 244

In [26]:
import seaborn as sns

tips = sns.load_dataset( "tips" )

sns.jointplot( "total_bill" , "tip" , tips, kind = 'reg' );



In [ ]:


In [27]:
sns.lmplot( "total_bill" , "tip" , tips, col = "smoker" );



In [ ]: