In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
In [7]:
# Small demo frame: two label columns (key1, key2) and two columns of
# standard-normal noise. No seed is set, so the numbers differ on each run.
df = pd.DataFrame(
    {
        'key1': list('aabba'),
        'key2': ['one', 'two'] * 2 + ['one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df
Out[7]:
In [3]:
# Split the data1 values using the key1 labels of the same rows as the key.
grouped = df['data1'].groupby(by=df['key1'])
grouped
Out[3]:
In [4]:
# Per-group average of the values held by `grouped`.
grouped.mean()
Out[4]:
In [6]:
# Two grouping keys: the averages come back keyed by (key1, key2).
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means
Out[6]:
In [7]:
# Move the inner index level of `means` out to the columns.
means.unstack()
Out[7]:
In [8]:
# The grouping keys do not have to live in the frame: here two standalone
# arrays, one entry per row of df, are used as the keys.
states = np.array(
    ['Ohio', 'California', 'California', 'Ohio', 'Ohio']
)
years = np.array(
    [2005, 2005, 2006, 2005, 2006]
)
df['data1'].groupby(by=[states, years]).mean()
Out[8]:
In [9]:
# Average the numeric columns within each key1 group.
# numeric_only=True leaves out the string column key2: recent pandas no
# longer drops such columns silently and would raise while averaging it.
df.groupby('key1').mean(numeric_only=True)
Out[9]:
In [10]:
# Group means over every (key1, key2) combination.
df.groupby(['key1', 'key2']).mean()
Out[10]:
In [11]:
# Number of rows falling in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()
Out[11]:
In [12]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
# The loop body must be indented; without it the cell is a syntax error.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
In [13]:
# With two grouping keys each group key is a tuple, unpacked here as (k1, k2).
# The loop body must be indented; without it the cell is a syntax error.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
In [14]:
# Materialise the groups as a plain dict: key1 value -> sub-frame.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces['b']
Out[14]:
In [15]:
# Show the dtype of each column of `df`.
df.dtypes
Out[15]:
In [16]:
# Group the columns (not the rows) of df, using each column's dtype as key,
# then collect the groups into a dict: dtype -> sub-frame of those columns.
# NOTE(review): recent pandas releases deprecate groupby(..., axis=1) --
# confirm the target pandas version before relying on this cell.
grouped = df.groupby(by=df.dtypes, axis='columns')
{dtype: block for dtype, block in grouped}
Out[16]:
In [17]:
# Mean of data2 per (key1, key2) group; the column is selected with a
# list ([['data2']]) rather than a bare name.
df.groupby(['key1', 'key2'])[['data2']].mean()
Out[17]:
In [18]:
# Selecting with a bare column name (not a list) gives the grouped form of
# that single column.
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
Out[18]:
In [19]:
# Per-group average of `s_grouped`.
s_grouped.mean()
Out[19]:
In [2]:
# 5x5 frame of standard-normal noise: one row per person, columns a-e.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values: blank out columns b and c for the row at position 2.
# The `.ix` indexer has been removed from pandas, so the positional row
# slice is converted to labels and passed to `.loc` together with the
# column labels.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people
Out[2]:
In [21]:
# Column label -> group name. 'f' has no matching column in `people`.
mapping = dict(
    a='red',
    b='red',
    c='blue',
    d='blue',
    e='red',
    f='orange',
)
In [22]:
# Group the columns of `people` through the label -> group-name dict and
# add up the columns that share a group name.
# NOTE(review): recent pandas releases deprecate groupby(..., axis=1) --
# confirm the target pandas version before relying on this cell.
by_column = people.groupby(by=mapping, axis='columns')
by_column.sum()
Out[22]:
In [23]:
# The same mapping held as a Series built from the dict.
map_series = Series(mapping)
map_series
Out[23]:
In [24]:
# Count the non-NA values per row within each column group given by
# map_series. groupby(..., axis=1) is deprecated in pandas, so the frame is
# transposed, grouped along its rows, and the result transposed back; the
# output has the same orientation as before (people as rows, groups as
# columns).
people.T.groupby(map_series).count().T
Out[24]:
In [3]:
# The built-in len is passed as the grouping key (the row labels of
# `people` are name strings), and each group is summed.
people.groupby(len).sum()
Out[3]:
In [4]:
# Mix a function key (len) with an explicit list of labels, one per row,
# and take the minimum within each combined group.
key_list = ['one'] * 3 + ['two'] * 2
people.groupby(by=[len, key_list]).min()
Out[4]:
In [5]:
# Two-level column index: country code (cty) over tenor.
columns = pd.MultiIndex.from_arrays(
    [
        ['US'] * 3 + ['JP'] * 2,
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
# 4x5 frame of standard-normal noise carrying those hierarchical columns.
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
Out[5]:
In [6]:
# Count the non-NA values per row for each value of the 'cty' column level.
# groupby(..., axis=1) is deprecated in pandas, so the frame is transposed,
# grouped on the 'cty' level of what is now the row index, and the result
# transposed back to the original orientation.
hier_df.T.groupby(level='cty').count().T
Out[6]:
In [8]:
# Redisplay the demo frame before the next example.
df
Out[8]:
In [9]:
# Rebind `grouped` to df split by key1, then take the 0.9 quantile of
# data1 inside each group.
grouped = df.groupby(by='key1')
grouped['data1'].quantile(q=0.9)
Out[9]:
In [10]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods whose
    results can be subtracted.
    """
    return arr.max() - arr.min()
# Apply the custom aggregation to the numeric columns only: peak_to_peak
# subtracts values, which is not defined for the string column key2, and
# recent pandas raises instead of silently skipping such columns.
# Assumes `grouped` is still df.groupby('key1') from the preceding cell.
grouped[['data1', 'data2']].agg(peak_to_peak)
Out[10]:
In [11]:
# Summary statistics for each group in `grouped`.
grouped.describe()
Out[11]:
In [13]:
# Load the tipping data; the path is relative to the working directory.
tips = pd.read_csv('tips.csv')
# Tip as a fraction of the total bill, stored as a new column.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
# Show the first six rows.
tips[:6]
Out[13]:
In [14]:
# Rebind `grouped` to tips split by the sex and smoker columns, then
# narrow it to the tip_pct column.
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
# The aggregation is requested by name, as a string.
grouped_pct.agg('mean')
Out[14]:
In [15]:
# Several aggregations at once: two requested by name plus the
# user-defined peak_to_peak function.
grouped_pct.agg(['mean', 'std', peak_to_peak])
Out[15]:
In [16]:
# (name, aggregation) pairs choose the labels of the output columns.
# The aggregation is named by the string 'std' rather than passed as
# np.std: pandas currently maps that callable to its own groupby 'std'
# and has deprecated the mapping, so the string keeps the current result.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])
Out[16]:
In [17]:
# Apply the same three aggregations to two columns at once.
functions = ['count', 'mean', 'max']
# The columns are selected with a list: indexing a GroupBy with a bare
# tuple of column names ("grouped['a', 'b']") is no longer supported.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
Out[17]:
In [18]:
# Pick out the tip_pct part of the aggregated result.
result['tip_pct']
Out[18]:
In [19]:
# (label, aggregation) pairs: custom output labels for mean and variance.
# 'var' is given by name instead of np.var: pandas currently maps that
# callable to its own groupby 'var' and has deprecated the mapping.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# Columns are selected with a list; tuple-style "grouped['a', 'b']"
# indexing on a GroupBy is no longer supported.
grouped[['tip_pct', 'total_bill']].agg(ftuples)
Out[19]:
In [20]:
# A dict picks a different aggregation per column: largest tip, total size.
# 'max' is given by name instead of np.max: pandas currently maps that
# callable to its own groupby 'max' and has deprecated the mapping.
grouped.agg({'tip': 'max', 'size': 'sum'})
Out[20]:
In [21]:
# Per-column aggregation spec: four statistics for tip_pct, a single sum
# for size.
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
'size' : 'sum'})
Out[21]:
In [22]:
# as_index=False is passed so the group keys are not used as the result's
# index. numeric_only=True restricts the average to numeric columns;
# recent pandas raises on non-numeric columns instead of dropping them
# (presumably tips has further text columns -- verify against tips.csv).
tips.groupby(['sex', 'smoker'], as_index=False).mean(numeric_only=True)
Out[22]:
In [23]:
# Redisplay the demo frame before the next example.
df
Out[23]:
In [24]:
# Mean of the numeric columns per key1 group, with every column name
# prefixed by 'mean_'. numeric_only=True leaves out the string column
# key2, which recent pandas would otherwise try (and fail) to average.
k1_means = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')
k1_means
Out[24]:
In [25]:
# Attach the per-key1 means to the rows of df: df's key1 column is matched
# against the index of k1_means.
pd.merge(df, k1_means, left_on='key1', right_index=True)
Out[25]:
In [26]:
# One group label per row of `people`, in row order.
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
Out[26]:
In [27]:
# Compute each group's mean with transform() rather than mean().
# The aggregation is named by the string 'mean' rather than passed as
# np.mean: pandas currently maps that callable to its own groupby 'mean'
# and has deprecated the mapping, so the string keeps the current result.
people.groupby(key).transform('mean')
Out[27]:
In [28]:
def demean(arr):
    """Return ``arr`` with its own mean subtracted from every element."""
    return arr - arr.mean()
# Apply demean group by group.
demeaned = people.groupby(key).transform(demean)
demeaned
Out[28]:
In [29]:
# Sanity check: the group means of the demeaned data should be zero up to
# floating-point error.
demeaned.groupby(key).mean()
Out[29]:
In [31]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows come back in ascending order of ``column``, so the largest value
    is last. ``n=0`` yields an empty frame with the same columns.
    """
    ordered = df.sort_values(by=column)
    if n == 0:
        # ordered[-0:] is ordered[0:], i.e. every row; asking for zero rows
        # must give zero rows.
        return ordered[:0]
    return ordered[-n:]
# The six rows of tips with the largest tip_pct (top's default column).
top(tips, n=6)
Out[31]:
In [32]:
# Run top() separately on each smoker group, with its defaults
# (n=5, column='tip_pct').
tips.groupby('smoker').apply(top)
Out[32]:
In [33]:
# Extra keyword arguments given to apply() are forwarded to top(): the
# single largest total_bill within each (smoker, day) group.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')
Out[33]:
In [34]:
# describe() summary of tip_pct for each smoker group; rebinds `result`.
result = tips.groupby('smoker')['tip_pct'].describe()
result
Out[34]:
In [35]:
# Pivot the 'smoker' index level of `result`.
result.unstack('smoker')
Out[35]:
In [36]:
# Same per-group top() as before, but group_keys=False keeps the group
# labels out of the result's index.
tips.groupby('smoker', group_keys=False).apply(top)
Out[36]:
In [ ]: