In [1]:
    
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
    
In [7]:
    
# Demo frame: two label columns used as grouping keys plus two columns of
# random draws from np.random.randn (no seed is set, so values vary per run).
df = DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df
    
    Out[7]:
In [3]:
    
# Group the `data1` values by the matching `key1` labels and show the
# resulting grouped object.
grouped = df['data1'].groupby(df['key1'])
grouped
    
    Out[3]:
In [4]:
    
# Mean of each group.
grouped.mean()
    
    Out[4]:
In [6]:
    
# Group `data1` by two keys at once and take the mean per (key1, key2) pair.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
    
    Out[6]:
In [7]:
    
# Reshape the two-key result with unstack() so it displays as a table.
means.unstack()
    
    Out[7]:
In [8]:
    
# Grouping keys do not have to be columns of the frame: here two standalone
# arrays with one entry per row of `df` are used instead.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
    
    Out[8]:
In [9]:
    
# Per-`key1` mean of the numeric columns. `key2` holds strings; pandas 2.x no
# longer drops such columns silently and raises instead, so restrict the
# aggregation to numeric data explicitly.
df.groupby('key1').mean(numeric_only=True)
    
    Out[9]:
In [10]:
    
# Mean of the remaining columns for every (key1, key2) combination.
df.groupby(['key1', 'key2']).mean()
    
    Out[10]:
In [11]:
    
# Size of each (key1, key2) group.
df.groupby(['key1', 'key2']).size()
    
    Out[11]:
In [12]:
    
# Iterating over the groupby yields (group key, sub-frame) pairs; print the
# key and its rows on separate lines.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')
    
    
In [13]:
    
# With two grouping keys, each group key is a tuple that is unpacked here
# into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')
    
    
In [14]:
    
# Materialise the groups as a {key1 value: sub-frame} dict and look one up.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces['b']
    
    Out[14]:
In [15]:
    
# Column dtypes of `df` (used as the grouping key in the next cell).
df.dtypes
    
    Out[15]:
In [16]:
    
# Group the columns (axis=1) of `df` by their dtype and collect the pieces
# into a dict.
# NOTE(review): the `axis` argument of groupby is deprecated in recent pandas
# releases — confirm against the pandas version this notebook targets.
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))
    
    Out[16]:
In [17]:
    
# Index the grouped object with a list of column names so only `data2` is
# aggregated.
df.groupby(['key1', 'key2'])[['data2']].mean()
    
    Out[17]:
In [18]:
    
# Index the grouped object with a single column name to select just `data2`.
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
    
    Out[18]:
In [19]:
    
# Mean of `data2` per (key1, key2) group.
s_grouped.mean()
    
    Out[19]:
In [2]:
    
# 5x5 frame of np.random.randn draws, indexed by first name.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values: blank out columns 'b' and 'c' in the third row.
# `.ix` has been removed from pandas, so the row is picked positionally via
# the index and the columns by label through `.loc`.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people
    
    Out[2]:
In [21]:
    
# Column label -> colour group. 'f' is not one of the columns created for
# `people` (a..e).
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')
    
In [22]:
    
# Group the columns of `people` (axis=1) by the label -> colour mapping and
# sum within each colour.
# NOTE(review): the `axis` argument of groupby is deprecated in recent pandas
# releases — confirm against the pandas version this notebook targets.
by_column = people.groupby(mapping, axis=1)
by_column.sum()
    
    Out[22]:
In [23]:
    
# The same label -> colour mapping, held in a Series instead of a dict.
map_series = Series(mapping)
map_series
    
    Out[23]:
In [24]:
    
# Group the columns by the Series form of the mapping and call count() on
# each colour group.
# NOTE(review): `axis=1` in groupby is deprecated in recent pandas — confirm.
people.groupby(map_series, axis=1).count()
    
    Out[24]:
In [3]:
    
# A function can serve as the grouping key: `len` is passed to groupby, so
# rows are grouped by the length of their index label (the person's name).
people.groupby(len).sum()
    
    Out[3]:
In [4]:
    
# Combine a function key (`len`) with an explicit list of labels, one per row.
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
    
    Out[4]:
In [5]:
    
# Two-level column index: country code ('cty') on top, tenor underneath.
columns = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2, [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
# 4 rows of np.random.randn draws under the hierarchical columns.
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
    
    Out[5]:
In [6]:
    
# Group the columns by the 'cty' level of the column MultiIndex and call
# count() per country.
# NOTE(review): `axis=1` in groupby is deprecated in recent pandas — confirm.
hier_df.groupby(level='cty', axis=1).count()
    
    Out[6]:
In [8]:
    
# Show the demo frame again before the aggregation examples.
df
    
    Out[8]:
In [9]:
    
# Rebind `grouped` to the whole frame grouped by `key1`, then take the 0.9
# quantile of `data1` within each group.
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
    
    Out[9]:
In [10]:
    
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest, lowest = arr.max(), arr.min()
    return highest - lowest
# Apply the custom aggregation to the numeric columns only: `key2` holds
# strings, for which `max() - min()` is undefined, and recent pandas raises
# on such columns instead of silently dropping them.
grouped[['data1', 'data2']].agg(peak_to_peak)
    
    Out[10]:
In [11]:
    
# Summary statistics for each `key1` group.
grouped.describe()
    
    Out[11]:
In [13]:
    
# Load the tipping data from the working directory.
tips = pd.read_csv('tips.csv')
# Tip expressed as a fraction of the total bill.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
# Preview the first six rows.
tips.head(6)
    
    Out[13]:
In [14]:
    
# Rebind `grouped` to the tips data grouped by sex and smoker status, pick the
# `tip_pct` column, and aggregate it by function name.
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
    
    Out[14]:
In [15]:
    
# Pass a list to apply several aggregations at once; entries may be function
# names or callables such as `peak_to_peak`.
grouped_pct.agg(['mean', 'std', peak_to_peak])
    
    Out[15]:
In [16]:
    
# (name, function) tuples choose the output column names. The aggregation is
# given by name ('std') instead of `np.std`: pandas has been substituting its
# own groupby std for the NumPy callable, and that substitution is deprecated,
# so naming it explicitly keeps the result stable.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])
    
    Out[16]:
In [17]:
    
# Apply the same list of aggregations to two columns at once.
functions = ['count', 'mean', 'max']
# Several columns must be selected with a list; indexing the grouped object
# with a bare tuple of names is no longer accepted by pandas.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
    
    Out[17]:
In [18]:
    
# Pick out the `tip_pct` part of the aggregated result.
result['tip_pct']
    
    Out[18]:
In [19]:
    
# Custom (German) output names paired with the aggregation to run. The
# variance is requested by name ('var') rather than via `np.var`, because
# pandas deprecates its substitution of NumPy callables with groupby methods.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# Select the two columns with a list; tuple-style selection is no longer
# accepted by pandas.
grouped[['tip_pct', 'total_bill']].agg(ftuples)
    
    Out[19]:
In [20]:
    
# A dict maps each column to its own aggregation. 'max' is given by name
# rather than as `np.max`, whose substitution with the groupby method is
# deprecated in pandas.
grouped.agg({'tip' : 'max', 'size' : 'sum'})
    
    Out[20]:
In [21]:
    
# Dict values may also be lists, giving several aggregations for one column
# and a single one for another.
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
             'size' : 'sum'})
    
    Out[21]:
In [22]:
    
# as_index=False keeps the group keys as ordinary columns in the result.
# numeric_only=True limits the mean to numeric columns; pandas 2.x raises on
# non-numeric ones (presumably e.g. `day` — confirm against tips.csv) instead
# of dropping them.
tips.groupby(['sex', 'smoker'], as_index=False).mean(numeric_only=True)
    
    Out[22]:
In [23]:
    
# Show the demo frame again before the transform/merge examples.
df
    
    Out[23]:
In [24]:
    
# Per-`key1` means of the numeric columns, renamed with a 'mean_' prefix.
# numeric_only=True skips the string column `key2`, which pandas 2.x would
# otherwise try to average and fail on.
k1_means = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')
k1_means
    
    Out[24]:
In [25]:
    
# Attach the group means to every original row: match `key1` in `df` against
# the index of `k1_means`.
pd.merge(df, k1_means, left_on='key1', right_index=True)
    
    Out[25]:
In [26]:
    
# One grouping label per row of `people`; mean of each column per label.
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
    
    Out[26]:
In [27]:
    
# transform applies the function per group and returns a result aligned with
# the original rows. The mean is requested by name rather than via `np.mean`,
# whose substitution with the groupby method is deprecated in pandas.
people.groupby(key).transform('mean')
    
    Out[27]:
In [28]:
    
def demean(arr):
    """Centre ``arr`` by subtracting its own mean from every element."""
    centre = arr.mean()
    return arr - centre
# Subtract each group's own mean from its rows, group by group.
demeaned = people.groupby(key).transform(demean)
demeaned
    
    Out[28]:
In [29]:
    
# Sanity check: group means of the demeaned data.
demeaned.groupby(key).mean()
    
    Out[29]:
In [31]:
    
def top(df, n=5, column='tip_pct'):
    """Return the last ``n`` rows of ``df`` after an ascending sort on ``column``.

    The rows come back in ascending order of ``column``, so the largest value
    is last.
    """
    ordered = df.sort_values(by=column)
    return ordered[-n:]
# The six rows of `tips` with the largest `tip_pct`.
top(tips, n=6)
    
    Out[31]:
In [32]:
    
# Run `top` separately on each smoker group via apply.
tips.groupby('smoker').apply(top)
    
    Out[32]:
In [33]:
    
# Arguments after the function are passed along to `top`: here the single
# largest `total_bill` row per (smoker, day) group.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
    
    Out[33]:
In [34]:
    
# Summary statistics of `tip_pct` per smoker group (rebinds `result`).
result = tips.groupby('smoker')['tip_pct'].describe()
result
    
    Out[34]:
In [35]:
    
# Reshape the summary by moving the 'smoker' index level.
# NOTE(review): the shape of the output depends on whether describe() returned
# a Series or a DataFrame in the pandas version in use — confirm.
result.unstack('smoker')
    
    Out[35]:
In [36]:
    
# Same per-group `top` call, with group_keys=False passed to groupby.
tips.groupby('smoker', group_keys=False).apply(top)
    
    Out[36]:
In [ ]: