notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import DataFrame, Series



In [2]:

    
# Seed NumPy's legacy global RNG so the random demo frame is identical on every
# Restart & Run All (np.random.randn draws from that global generator).
np.random.seed(0)
# Two-level column index; the levels are named 'cty' and 'tenor'.
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
# 4 rows x 5 columns of standard-normal samples under the hierarchical columns.
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df



In [3]:

    
# For every row, count the non-null values under each 'cty' column label.
# groupby(..., axis=1) is deprecated in recent pandas, so group the transposed
# frame by its 'cty' index level and transpose the counts back to rows x cty.
hier_df.T.groupby(level='cty').count().T



In [29]:

    
# Load the tips data set from the chapter's CSV file.
tips = pd.read_csv('ch08/tips.csv')
# Add a column with the tip as a share of the total bill
# (stored as a fraction, i.e. not multiplied by 100).
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
# Show the first six rows.
tips.iloc[:6]









    Out[29]:






  
    
      
      total_bill
      tip
      sex
      smoker
      day
      time
      size
      tip_pct
    
  
  
    
      0
      16.99
      1.01
      Female
      No
      Sun
      Dinner
      2
      0.059447
    
    
      1
      10.34
      1.66
      Male
      No
      Sun
      Dinner
      3
      0.160542
    
    
      2
      21.01
      3.50
      Male
      No
      Sun
      Dinner
      3
      0.166587
    
    
      3
      23.68
      3.31
      Male
      No
      Sun
      Dinner
      2
      0.139780
    
    
      4
      24.59
      3.61
      Female
      No
      Sun
      Dinner
      4
      0.146808
    
    
      5
      25.29
      4.71
      Male
      No
      Sun
      Dinner
      4
      0.186240



In [9]:

    
# Split the rows by every (sex, smoker) combination.
grouped = tips.groupby(by=['sex', 'smoker'])
# Restrict the grouped object to the tip_pct column only.
grouped_pct = grouped['tip_pct']
# Built-in aggregations can be requested by name.
grouped_pct.aggregate('mean')









    Out[9]:





sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64



In [10]:

    
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` must provide ``.max()`` and ``.min()`` methods; the function is
    passed to ``agg`` as a custom aggregation in this notebook.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Mix built-in aggregations (given by name) with the custom peak_to_peak function;
# each list entry becomes one column of the result, labelled with the function name.
grouped_pct.agg(['mean', 'std', peak_to_peak])









    Out[10]:






  
    
      
      
      mean
      std
      peak_to_peak
    
    
      sex
      smoker
      
      
      
    
  
  
    
      Female
      No
      0.156921
      0.036421
      0.195876
    
    
      Yes
      0.182150
      0.071595
      0.360233
    
    
      Male
      No
      0.160669
      0.041849
      0.220186
    
    
      Yes
      0.152771
      0.090588
      0.674707



In [11]:

    
# (label, function) pairs choose the result column names explicitly.
# The aggregation is given as the string 'std' instead of np.std: recent pandas
# warns when NumPy reducers are passed to agg, and the string keeps the
# groupby (sample, ddof=1) standard deviation used elsewhere in this notebook.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])



In [12]:

    
# Apply the same list of aggregations to two columns at once; the result has
# hierarchical columns (column name on top, aggregation name below).
functions = ['count', 'mean', 'max']
# Several columns must be selected with a list: the tuple form
# grouped['tip_pct', 'total_bill'] is no longer accepted by current pandas.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result



In [13]:

    
# Pick the sub-table of aggregates that belong to the 'tip_pct' column.
result['tip_pct']



In [14]:

    
# (label, function) pairs applied to each selected column: 'Durchschnitt' holds
# the mean and 'Abweichung' the variance. The variance is requested by name
# ('var') rather than np.var, which recent pandas warns about inside agg; the
# string keeps the groupby (sample) variance shown in Out[14].
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# Use a list to select several columns; tuple-style selection is no longer accepted.
grouped[['tip_pct', 'total_bill']].agg(ftuples)









    Out[14]:






  
    
      
      
      tip_pct
      total_bill
    
    
      
      
      Durchschnitt
      Abweichung
      Durchschnitt
      Abweichung
    
    
      sex
      smoker
      
      
      
      
    
  
  
    
      Female
      No
      0.156921
      0.001327
      18.105185
      53.092422
    
    
      Yes
      0.182150
      0.005126
      17.977879
      84.451517
    
    
      Male
      No
      0.160669
      0.001751
      19.791237
      76.152961
    
    
      Yes
      0.152771
      0.008206
      22.284500
      98.244673



In [15]:

    
# A dict maps each column to its own aggregation: largest tip and summed size
# per (sex, smoker) group. 'max' is given by name instead of np.max because
# recent pandas warns when NumPy reducers are passed to agg.
grouped.agg({'tip': 'max', 'size': 'sum'})



In [16]:

    
# Per-column aggregation spec: a list of aggregations for 'tip_pct' and a
# single aggregation for 'size'.
grouped.agg({'tip_pct':['min', 'max', 'mean', 'std'],
            'size' : 'sum'})



In [17]:

    
# as_index=False returns the group keys as ordinary columns instead of an index.
# numeric_only=True restricts the mean to the numeric columns; without it,
# pandas 2.x raises on the string columns ('day', 'time') instead of dropping them.
tips.groupby(['sex', 'smoker'], as_index=False).mean(numeric_only=True)



In [18]:

    
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : name of the column to rank by (default ``'tip_pct'``).

    Returns
    -------
    DataFrame with the selected rows in ascending order of ``column``, so the
    largest value is in the last row.
    """
    # .tail(n) matches the slice [-n:] for every non-zero n, and also returns an
    # empty frame for n == 0, where [-0:] would return the whole frame.
    return df.sort_values(by=column).tail(n)



In [19]:

    
# The six rows of the whole data set with the highest tip_pct (the default column).
top(tips, n=6)









    Out[19]:






  
    
      
      total_bill
      tip
      sex
      smoker
      day
      time
      size
      tip_pct
    
  
  
    
      109
      14.31
      4.00
      Female
      Yes
      Sat
      Dinner
      2
      0.279525
    
    
      183
      23.17
      6.50
      Male
      Yes
      Sun
      Dinner
      4
      0.280535
    
    
      232
      11.61
      3.39
      Male
      No
      Sat
      Dinner
      2
      0.291990
    
    
      67
      3.07
      1.00
      Female
      Yes
      Sat
      Dinner
      1
      0.325733
    
    
      178
      9.60
      4.00
      Female
      Yes
      Sun
      Dinner
      2
      0.416667
    
    
      172
      7.25
      5.15
      Male
      Yes
      Sun
      Dinner
      2
      0.710345



In [20]:

    
# Call top() on each smoker group separately and combine the pieces into one result.
tips.groupby('smoker').apply(top)



In [21]:

    
# Extra arguments after the function are forwarded to top(): keep the single
# row with the largest total_bill in every (smoker, day) group.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')



In [22]:

    
# Summary statistics of tip_pct for each smoker group.
# NOTE(review): Out[22] shows a Series indexed by (smoker, statistic); newer pandas
# releases appear to return a smoker-indexed DataFrame here instead. Confirm
# against the installed version before relying on the shape of `result`.
result = tips.groupby('smoker')['tip_pct'].describe()
result









    Out[22]:





smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64



In [23]:

    
# Show the statistics as rows and the smoker groups as columns.
# Depending on the pandas version, `result` is either a Series indexed by
# (smoker, statistic), where unstacking 'smoker' gives that table, or a
# smoker-indexed DataFrame, where a plain transpose gives the same table
# (unstack would instead collapse it into a long Series).
result.T if isinstance(result, pd.DataFrame) else result.unstack('smoker')



In [24]:

    
def f(x):
    """Return the describe() summary of one group's values."""
    return x.describe()

# Apply f to the tip_pct values of each smoker group, then move the 'smoker'
# index level into the columns.
tips.groupby('smoker')['tip_pct'].apply(f).unstack('smoker')



In [25]:

    
# Same per-group top() as before, but group_keys=False asks pandas not to add
# the group labels as an extra index level on the combined result.
tips.groupby('smoker', group_keys=False).apply(top)



In [30]:

    
# Group means (pivot_table's default aggregation) by sex and smoker.
# The numeric columns are listed explicitly: with no `values`, the default mean
# is also attempted on the string columns ('day', 'time'), which current pandas
# rejects instead of silently dropping.
tips.pivot_table(values=['size', 'tip', 'tip_pct', 'total_bill'],
                 index=['sex', 'smoker'])



In [32]:

    
# Mean tip_pct and size: (sex, day) on the rows, smoker across the columns.
tips.pivot_table(
    values=['tip_pct', 'size'],
    index=['sex', 'day'],
    columns='smoker',
)



In [33]:

    
# Same table as before, plus 'All' row and column subtotals (margins=True).
tips.pivot_table(
    values=['tip_pct', 'size'],
    index=['sex', 'day'],
    columns='smoker',
    margins=True,
)



In [34]:

    
# Using len as the aggregation turns the pivot into a table of group sizes,
# with 'All' totals on both axes.
tips.pivot_table(
    values='tip_pct',
    index=['sex', 'smoker'],
    columns='day',
    aggfunc=len,
    margins=True,
)



In [36]:

    
# Total party size per (time, sex, smoker) row and day column; combinations
# with no data are filled with 0 instead of NaN.
tips.pivot_table(
    values='size',
    index=['time', 'sex', 'smoker'],
    columns='day',
    aggfunc='sum',
    fill_value=0,
    margins=True,
)



In [37]:

    
# Frequency table: (time, day) on the rows, smoker on the columns, with totals.
pd.crosstab(
    index=[tips['time'], tips['day']],
    columns=tips['smoker'],
    margins=True,
)



In [26]:

    
import seaborn as sns

# NOTE(review): this rebinds `tips` to seaborn's bundled copy of the data set,
# which does not carry the tip_pct column added to the CSV-loaded frame.
tips = sns.load_dataset("tips")

# Scatter plot of tip against total bill with a regression fit and marginal
# distributions. data/x/y are passed by keyword: current seaborn releases no
# longer accept them positionally.
sns.jointplot(data=tips, x="total_bill", y="tip", kind="reg");



In [ ]:



In [27]:

    
# Regression of tip on total bill, drawn in one panel per smoker value.
# data/x/y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.lmplot(data=tips, x="total_bill", y="tip", col="smoker");



In [ ]:

cty	US			JP
tenor	1	3	5	1	3
0	0.026181	-0.343651	-1.839707	-0.239827	0.554371
1	-0.342152	1.696171	0.911705	-0.610586	0.795040
2	-0.229819	0.623547	-0.721125	1.040916	0.937521
3	0.023564	-0.023763	-0.024780	-0.815674	0.752994

		tip_pct		size
	smoker	No	Yes	No	Yes
sex	day
Female	Fri	0.165296	0.209129	2.500000	2.000000
	Sat	0.147993	0.163817	2.307692	2.200000
	Sun	0.165710	0.237075	3.071429	2.500000
	Thur	0.155971	0.163073	2.480000	2.428571
Male	Fri	0.138005	0.144730	2.000000	2.125000
	Sat	0.162132	0.139067	2.656250	2.629630
	Sun	0.158291	0.173964	2.883721	2.600000
	Thur	0.165706	0.164417	2.500000	2.300000

		tip_pct			size
	smoker	No	Yes	All	No	Yes	All
sex	day
Female	Fri	0.165296	0.209129	0.199388	2.500000	2.000000	2.111111
	Sat	0.147993	0.163817	0.156470	2.307692	2.200000	2.250000
	Sun	0.165710	0.237075	0.181569	3.071429	2.500000	2.944444
	Thur	0.155971	0.163073	0.157525	2.480000	2.428571	2.468750
Male	Fri	0.138005	0.144730	0.143385	2.000000	2.125000	2.100000
	Sat	0.162132	0.139067	0.151577	2.656250	2.629630	2.644068
	Sun	0.158291	0.173964	0.162344	2.883721	2.600000	2.810345
	Thur	0.165706	0.164417	0.165276	2.500000	2.300000	2.433333
All		0.159328	0.163196	0.160803	2.668874	2.408602	2.569672

	day	Fri	Sat	Sun	Thur	All
sex	smoker
Female	No	2.0	13.0	14.0	25.0	54.0
Female	Yes	7.0	15.0	4.0	7.0	33.0
Male	No	2.0	32.0	43.0	20.0	97.0
Male	Yes	8.0	27.0	15.0	10.0	60.0
All		19.0	87.0	76.0	62.0	244.0

	total_bill	tip	sex	smoker	day	time	size	tip_pct
0	16.99	1.01	Female	No	Sun	Dinner	2	0.059447
1	10.34	1.66	Male	No	Sun	Dinner	3	0.160542
2	21.01	3.50	Male	No	Sun	Dinner	3	0.166587
3	23.68	3.31	Male	No	Sun	Dinner	2	0.139780
4	24.59	3.61	Female	No	Sun	Dinner	4	0.146808
5	25.29	4.71	Male	No	Sun	Dinner	4	0.186240

		mean	std	peak_to_peak
sex	smoker
Female	No	0.156921	0.036421	0.195876
Female	Yes	0.182150	0.071595	0.360233
Male	No	0.160669	0.041849	0.220186
Male	Yes	0.152771	0.090588	0.674707

		tip_pct			total_bill
		count	mean	max	count	mean	max
sex	smoker
Female	No	54	0.156921	0.252672	54	18.105185	35.83
Female	Yes	33	0.182150	0.416667	33	17.977879	44.30
Male	No	97	0.160669	0.291990	97	19.791237	48.33
Male	Yes	60	0.152771	0.710345	60	22.284500	50.81

		tip_pct		total_bill
		Durchschnitt	Abweichung	Durchschnitt	Abweichung
sex	smoker
Female	No	0.156921	0.001327	18.105185	53.092422
Female	Yes	0.182150	0.005126	17.977879	84.451517
Male	No	0.160669	0.001751	19.791237	76.152961
Male	Yes	0.152771	0.008206	22.284500	98.244673

	sex	smoker	total_bill	tip	size	tip_pct
0	Female	No	18.105185	2.773519	2.592593	0.156921
1	Female	Yes	17.977879	2.931515	2.242424	0.182150
2	Male	No	19.791237	3.113402	2.711340	0.160669
3	Male	Yes	22.284500	3.051167	2.500000	0.152771

	total_bill	tip	sex	smoker	day	time	size	tip_pct
109	14.31	4.00	Female	Yes	Sat	Dinner	2	0.279525
183	23.17	6.50	Male	Yes	Sun	Dinner	4	0.280535
232	11.61	3.39	Male	No	Sat	Dinner	2	0.291990
67	3.07	1.00	Female	Yes	Sat	Dinner	1	0.325733
178	9.60	4.00	Female	Yes	Sun	Dinner	2	0.416667
172	7.25	5.15	Male	Yes	Sun	Dinner	2	0.710345

		total_bill	tip	sex	smoker	day	time	size	tip_pct
smoker
No	88	24.71	5.85	Male	No	Thur	Lunch	2	0.236746
	185	20.69	5.00	Male	No	Sun	Dinner	5	0.241663
	51	10.29	2.60	Female	No	Sun	Dinner	2	0.252672
	149	7.51	2.00	Male	No	Thur	Lunch	2	0.266312
	232	11.61	3.39	Male	No	Sat	Dinner	2	0.291990
Yes	109	14.31	4.00	Female	Yes	Sat	Dinner	2	0.279525
	183	23.17	6.50	Male	Yes	Sun	Dinner	4	0.280535
	67	3.07	1.00	Female	Yes	Sat	Dinner	1	0.325733
	178	9.60	4.00	Female	Yes	Sun	Dinner	2	0.416667
	172	7.25	5.15	Male	Yes	Sun	Dinner	2	0.710345

			total_bill	tip	sex	smoker	day	time	size	tip_pct
smoker	day
No	Fri	94	22.75	3.25	Female	No	Fri	Dinner	2	0.142857
	Sat	212	48.33	9.00	Male	No	Sat	Dinner	4	0.186220
	Sun	156	48.17	5.00	Male	No	Sun	Dinner	6	0.103799
	Thur	142	41.19	5.00	Male	No	Thur	Lunch	5	0.121389
Yes	Fri	95	40.17	4.73	Male	Yes	Fri	Dinner	4	0.117750
	Sat	170	50.81	10.00	Male	Yes	Sat	Dinner	3	0.196812
	Sun	182	45.35	3.50	Male	Yes	Sun	Dinner	3	0.077178
	Thur	197	43.11	5.00	Female	Yes	Thur	Lunch	4	0.115982

smoker	No	Yes
count	151.000000	93.000000
mean	0.159328	0.163196
std	0.039910	0.085119
min	0.056797	0.035638
25%	0.136906	0.106771
50%	0.155625	0.153846
75%	0.185014	0.195059
max	0.291990	0.710345

		day	Fri	Sat	Sun	Thur	All
time	sex	smoker
Dinner	Female	No	2.0	30.0	43.0	2.0	77.0
	Female	Yes	8.0	33.0	10.0	0.0	51.0
	Male	No	4.0	85.0	124.0	0.0	213.0
	Male	Yes	12.0	71.0	39.0	0.0	122.0
Lunch	Female	No	3.0	0.0	0.0	60.0	63.0
	Female	Yes	6.0	0.0	0.0	17.0	23.0
	Male	No	0.0	0.0	0.0	50.0	50.0
	Male	Yes	5.0	0.0	0.0	23.0	28.0
All			40.0	219.0	216.0	152.0	627.0