In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import DataFrame, Series

In [26]:
# Small demo frame for the groupby examples: two label columns (key1, key2)
# and two columns of standard-normal draws. No seed is set, so the numeric
# values differ on every run.
df = pd.DataFrame(
    dict(
        key1=['a', 'a', 'b', 'b', 'a'],
        key2=['one', 'two', 'one', 'two', 'one'],
        data1=np.random.randn(5),
        data2=np.random.randn(5),
    )
)
df


Out[26]:
data1 data2 key1 key2
0 0.697461 -1.141562 a one
1 -0.546280 -0.516677 a two
2 0.375060 0.741650 b one
3 0.389968 -0.138009 b two
4 -0.628502 0.039023 a one

In [27]:
# Group the whole frame by key1, then report the 90th percentile of data1
# inside each group.
grouped = df.groupby(by='key1')
grouped.data1.quantile(0.9)


Out[27]:
key1
a    0.448712
b    0.388477
Name: data1, dtype: float64

In [29]:
def peak_to_peak(arr):
    """Return the range of ``arr``: its maximum minus its minimum.

    ``arr`` is any object exposing ``max()`` and ``min()`` methods whose
    results support subtraction.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# Apply the custom aggregation to the numeric columns of every group.
# The columns are selected explicitly: peak_to_peak subtracts values, which is
# undefined for the string column key2, and current pandas raises instead of
# silently dropping such columns.
grouped[['data1', 'data2']].agg(peak_to_peak)


Out[29]:
data1 data2
key1
a 1.325962 1.180585
b 0.014909 0.879659

In [31]:
# Per-key1 means of the numeric columns, renamed to mean_<column> so they can
# be merged back onto df without clashing with the original column names.
# data1/data2 are selected explicitly because key2 holds strings and current
# pandas no longer drops non-numeric columns from .mean() automatically.
k1_means = (
    df.groupby('key1')[['data1', 'data2']]
    .mean()
    .add_prefix('mean_')
)
k1_means


Out[31]:
mean_data1 mean_data2
key1
a -0.159107 -0.539738
b 0.382514 0.301820

In [32]:
# Attach the group means to every row by matching df's key1 column against
# the index of k1_means.
df.merge(k1_means, how='inner', left_on='key1', right_index=True)


Out[32]:
data1 data2 key1 key2 mean_data1 mean_data2
0 0.697461 -1.141562 a one -0.159107 -0.539738
1 -0.546280 -0.516677 a two -0.159107 -0.539738
4 -0.628502 0.039023 a one -0.159107 -0.539738
2 0.375060 0.741650 b one 0.382514 0.301820
3 0.389968 -0.138009 b two 0.382514 0.301820

In [ ]:


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of each data
# column, computed separately for every group.
# NOTE(review): `grouped` is rebound by several cells in this notebook, so the
# result depends on which of them ran last.
grouped.describe()


Out[30]:
data1 data2
key1
a count 3.000000 3.000000
mean -0.159107 -0.539738
std 0.742948 0.590630
min -0.628502 -1.141562
25% -0.587391 -0.829119
50% -0.546280 -0.516677
75% 0.075590 -0.238827
max 0.697461 0.039023
b count 2.000000 2.000000
mean 0.382514 0.301820
std 0.010542 0.622013
min 0.375060 -0.138009
25% 0.378787 0.081906
50% 0.382514 0.301820
75% 0.386241 0.521735
max 0.389968 0.741650

In [5]:
# NOTE(review): only the value of the last expression in a cell is displayed,
# so the mean computed on the next line is discarded and only the sum of data2
# is shown.
df['data1'].mean()
df['data2'].sum()


Out[5]:
4.638270548168735

In [6]:
# Split the data1 column into groups using the matching key1 values as labels.
# Displaying the result only shows the SeriesGroupBy object itself.
grouped = df.data1.groupby(by=df.key1)
grouped


Out[6]:
<pandas.core.groupby.SeriesGroupBy object at 0x0000000008FDF470>

In [7]:
# Average of data1 within each key1 group (uses the SeriesGroupBy assigned to
# `grouped` in the preceding cell).
grouped.mean()


Out[7]:
key1
a    0.062949
b    0.117550
Name: data1, dtype: float64

In [8]:
# Mean of data1 for every (key1, key2) pair; passing two key Series yields a
# result indexed by both levels.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means


Out[8]:
key1  key2
a     one     0.021389
      two     0.146069
b     one    -0.167094
      two     0.402195
Name: data1, dtype: float64

In [9]:
# Pivot the innermost index level (key2) into columns, giving a key1 x key2
# table of means.
means.unstack(level=-1)


Out[9]:
key2 one two
key1
a 0.021389 0.146069
b -0.167094 0.402195

In [10]:
# Group keys do not have to be columns of df: here two standalone arrays, one
# label per row of df, are used as the keys.
states = np.array([
    'Ohio', 'California', 'California', 'Ohio', 'Ohio',
])
years = np.array([
    2005, 2005, 2006, 2005, 2006,
])

(
    df['data1']
    .groupby(by=[states, years])
    .mean()
)


Out[10]:
California  2005    0.146069
            2006   -0.167094
Ohio        2005    0.161579
            2006    0.121814
Name: data1, dtype: float64

In [11]:
# Mean of each numeric column per key1 group.
# data1/data2 are selected explicitly because key2 holds strings and current
# pandas raises on non-numeric columns instead of dropping them from .mean().
df.groupby('key1')[['data1', 'data2']].mean()


Out[11]:
data1 data2
key1
a 0.062949 1.16219
b 0.117550 0.57585

In [12]:
# Column means for every (key1, key2) combination, indexed by both keys.
(
    df.groupby(by=['key1', 'key2'])
    .mean()
)


Out[12]:
data1 data2
key1 key2
a one 0.021389 1.368126
two 0.146069 0.750319
b one -0.167094 -0.080059
two 0.402195 1.231759

In [13]:
# Number of rows that fall into each (key1, key2) group.
(
    df.groupby(by=['key1', 'key2'])
    .size()
)


Out[13]:
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [14]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    # print(...) with a single argument produces the same output on Python 2
    # and Python 3, whereas the bare print statement is a syntax error on 3.
    print(name)
    print(group)


a
      data1     data2 key1 key2
0 -0.079036  1.583633    a  one
1  0.146069  0.750319    a  two
4  0.121814  1.152618    a  one
b
      data1     data2 key1 key2
2 -0.167094 -0.080059    b  one
3  0.402195  1.231759    b  two

In [15]:
# With several grouping keys, each group key is a tuple that can be unpacked
# directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # Format the pair into one string so the line reads "a one" on both
    # Python 2 and Python 3 (the bare print statement only parses on 2).
    print('%s %s' % (k1, k2))
    print(group)


a one
      data1     data2 key1 key2
0 -0.079036  1.583633    a  one
4  0.121814  1.152618    a  one
a two
      data1     data2 key1 key2
1  0.146069  0.750319    a  two
b one
      data1     data2 key1 key2
2 -0.167094 -0.080059    b  one
b two
      data1     data2 key1 key2
3  0.402195  1.231759    b  two

In [16]:
# Materialise the groups as a plain dict mapping each key1 value to its
# sub-frame, then look up the rows belonging to 'b'.
pieces = {label: subframe for label, subframe in df.groupby('key1')}
pieces['b']


Out[16]:
data1 data2 key1 key2
2 -0.167094 -0.080059 b one
3 0.402195 1.231759 b two

In [17]:
# Data type of every column; the next cell uses these dtypes as the grouping
# keys for the columns.
df.dtypes


Out[17]:
data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [18]:
# Group the columns (axis=1) rather than the rows, using each column's dtype
# as its group label, and collect the pieces into a dict keyed by dtype.
# NOTE(review): newer pandas releases appear to deprecate axis=1 in groupby -
# confirm against the pandas version this notebook targets.
grouped = df.groupby(by=df.dtypes, axis=1)
{dtype: block for dtype, block in grouped}


Out[18]:
{dtype('float64'):       data1     data2
 0 -0.079036  1.583633
 1  0.146069  0.750319
 2 -0.167094 -0.080059
 3  0.402195  1.231759
 4  0.121814  1.152618, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [22]:
# Indexing the GroupBy with a list of column names limits the aggregation to
# those columns, in the order given (data2 before data1).
(
    df.groupby(by=['key1', 'key2'])
    [['data2', 'data1']]
    .mean()
)


Out[22]:
data2 data1
key1 key2
a one 1.368126 0.021389
two 0.750319 0.146069
b one -0.080059 -0.167094
two 1.231759 0.402195

In [23]:
# Indexing the GroupBy with a single column name gives a SeriesGroupBy over
# that column only.
pair_groups = df.groupby(by=['key1', 'key2'])
s_grouped = pair_groups['data2']
s_grouped


Out[23]:
<pandas.core.groupby.SeriesGroupBy object at 0x00000000090FBB70>

In [24]:
# Mean of data2 for each (key1, key2) combination, returned as a Series
# indexed by both keys.
s_grouped.mean()


Out[24]:
key1  key2
a     one     1.368126
      two     0.750319
b     one    -0.080059
      two     1.231759
Name: data2, dtype: float64

In [ ]: