In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import DataFrame, Series
In [2]:
# Two-level column index: country code ('cty') on top, tenor underneath.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
# Four rows of random normal draws, one column per (cty, tenor) pair.
hier_df = DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df
Out[2]:
In [3]:
# Count the entries in each row per country by grouping the columns on their
# 'cty' level. `groupby(..., axis=1)` is deprecated since pandas 2.1, so group
# the transposed frame on its index level instead and transpose the result back.
hier_df.T.groupby(level='cty').count().T
Out[3]:
In [29]:
# Load the tipping data and append a column holding the tip as a fraction of
# the total bill.
tips = (
    pd.read_csv('ch08/tips.csv')
    .assign(tip_pct=lambda bills: bills['tip'] / bills['total_bill'])
)
tips.head(6)
Out[29]:
In [9]:
# Group the bills by sex and smoking status, then narrow to the tip_pct column.
grouped = tips.groupby(by=['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
# Mean tip_pct within each (sex, smoker) group.
grouped_pct.aggregate('mean')
Out[9]:
In [10]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` can be any object that provides ``max()`` and ``min()`` methods,
    such as a pandas Series or a NumPy array.
    """
    return arr.max() - arr.min()
# Apply two built-in statistics and the custom peak_to_peak function in one
# pass; each function becomes a column of the result.
grouped_pct.aggregate([
    'mean',
    'std',
    peak_to_peak,
])
Out[10]:
In [11]:
# Each (name, function) pair labels its output column with `name`.
# The string alias 'std' replaces np.std: pandas 2.x warns that NumPy callables
# passed to agg will stop being mapped to the groupby implementation.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])
Out[11]:
In [12]:
functions = ['count', 'mean', 'max']
# Select the two columns with a list: indexing a groupby with a bare tuple
# (grouped['a', 'b']) was deprecated in pandas 1.0 and removed in 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
Out[12]:
In [13]:
# Pull the statistics computed for tip_pct out of the hierarchical columns of `result`.
result['tip_pct']
Out[13]:
In [14]:
# (column label, function) pairs; the labels replace the function names in the
# output. The string alias 'var' is used instead of np.var, which pandas 2.x
# warns about when it is passed to agg.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# A list (not a bare tuple) is required to select several columns in pandas 2.0+.
grouped[['tip_pct', 'total_bill']].agg(ftuples)
Out[14]:
In [15]:
# A dict maps each column to its own aggregation. The string alias 'max' is
# used instead of np.max, which pandas 2.x warns about when it is passed to agg.
grouped.agg({'tip': 'max', 'size': 'sum'})
Out[15]:
In [16]:
# Per-column aggregations: several statistics for tip_pct, a single sum for size.
grouped.aggregate({
    'tip_pct': ['min', 'max', 'mean', 'std'],
    'size': 'sum',
})
Out[16]:
In [17]:
# as_index=False keeps the group keys as ordinary columns instead of an index.
# numeric_only=True is required on pandas 2.0+, where mean() no longer drops
# non-numeric columns silently and raises TypeError on text columns instead.
tips.groupby(['sex', 'smoker'], as_index=False).mean(numeric_only=True)
Out[17]:
In [18]:
def top(df, n=5, column='tip_pct'):
    """Return the last ``n`` rows of ``df`` after sorting ascending by ``column``.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5). ``n=0`` yields an empty frame;
        the slice ``[-0:]`` would have returned every row.
    column : name of the column to sort by (default ``'tip_pct'``).

    Returns
    -------
    DataFrame holding the selected rows, still in ascending order of ``column``.
    """
    # tail(n) matches the slice [-n:] for every non-zero n and handles n == 0.
    return df.sort_values(by=column).tail(n)
In [19]:
# Six rows selected by the default sort column of top().
top(df=tips, n=6)
Out[19]:
In [20]:
# Run top() on each smoker group separately and stack the pieces together.
tips.groupby(by='smoker').apply(top)
Out[20]:
In [21]:
# Extra keyword arguments after the function are forwarded to top() for every
# (smoker, day) group.
tips.groupby(by=['smoker', 'day']).apply(
    top,
    n=1,
    column='total_bill',
)
Out[21]:
In [22]:
# Summary statistics of tip_pct for each smoker group.
smoker_groups = tips.groupby(by='smoker')
result = smoker_groups['tip_pct'].describe()
result
Out[22]:
In [23]:
# Pivot the 'smoker' index level of `result` out of the row index.
result.unstack(level='smoker')
Out[23]:
In [24]:
def f(x):
    """Return the summary statistics of ``x`` as produced by its describe() method."""
    return x.describe()


# Apply f to tip_pct within each smoker group, then pivot 'smoker' out of the index.
tips.groupby(by='smoker')['tip_pct'].apply(f).unstack(level='smoker')
Out[24]:
In [25]:
# group_keys=False stops the group labels from being added to the result index.
tips.groupby(by='smoker', group_keys=False).apply(top)
Out[25]:
In [30]:
# Group means by sex and smoker. The value columns are listed explicitly: when
# `values` is omitted, pivot_table tries to average every remaining column,
# which fails on the text columns (day, time) under pandas 2.x.
# NOTE(review): assumes these four are all the numeric columns of tips — confirm.
tips.pivot_table(values=['size', 'tip', 'tip_pct', 'total_bill'],
                 index=['sex', 'smoker'])
Out[30]:
In [32]:
# Default aggregation of tip_pct and size: rows keyed by (sex, day), one column
# group per smoker value.
tips.pivot_table(
    values=['tip_pct', 'size'],
    index=['sex', 'day'],
    columns='smoker',
)
Out[32]:
In [33]:
# Same table as before, with margins=True adding the 'All' row and column totals.
tips.pivot_table(
    values=['tip_pct', 'size'],
    index=['sex', 'day'],
    columns='smoker',
    margins=True,
)
Out[33]:
In [34]:
# Using len as the aggregation turns the table into group sizes per
# (sex, smoker) row and day column, with margins.
tips.pivot_table(
    values='tip_pct',
    index=['sex', 'smoker'],
    columns='day',
    aggfunc=len,
    margins=True,
)
Out[34]:
In [36]:
# Sum of size per (time, sex, smoker) row and day column; fill_value=0 replaces
# the empty combinations.
tips.pivot_table(
    values='size',
    index=['time', 'sex', 'smoker'],
    columns='day',
    aggfunc='sum',
    fill_value=0,
    margins=True,
)
Out[36]:
In [37]:
# Frequency table: rows keyed by (time, day), columns by smoker, with margins.
pd.crosstab(
    index=[tips['time'], tips['day']],
    columns=tips['smoker'],
    margins=True,
)
Out[37]:
In [26]:
import seaborn as sns

# NOTE: this rebinds `tips` to seaborn's copy of the dataset, so the tip_pct
# column added to the CSV-loaded frame is not carried over.
tips = sns.load_dataset("tips")
# x, y and data are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally in this order. The trailing ';' suppresses the object repr.
sns.jointplot(data=tips, x="total_bill", y="tip", kind="reg");
In [ ]:
In [27]:
# One regression panel per smoker value. x, y and data are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally in this order.
sns.lmplot(data=tips, x="total_bill", y="tip", col="smoker");
In [ ]: