In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import DataFrame, Series

In [9]:
# Build a 5x5 frame of standard-normal draws, labelled by person (rows)
# and by a single-letter column name.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values: blank out columns 'b' and 'c' for the row at
# position 2. `.ix` was removed from pandas, so select the row label
# positionally via the index and assign through `.loc`.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people


Out[9]:
a b c d e
Joe 0.164279 -0.288185 0.860354 -2.456273 -1.154645
Steve -0.529059 -0.787231 -1.175355 0.405180 -1.227246
Wes -0.583101 NaN NaN 0.451797 -0.445886
Jim -2.297013 0.741557 0.531242 -0.561197 -0.358583
Travis 0.570748 0.366625 -0.298591 -0.699932 0.180338

In [12]:
# One group label per row of `people`, alternating between the two groups.
key = ['one', 'two'] * 2 + ['one']
# Average every column within each label.
people.groupby(by=key).mean()


Out[12]:
a b c d e
one 0.050642 0.039220 0.280882 -0.901469 -0.473398
two -1.413036 -0.022837 -0.322057 -0.078009 -0.792914

In [13]:
# Replace every row with the mean of its group. The string alias 'mean' is
# used instead of the NumPy callable: recent pandas warns when np.mean is
# passed here because its dispatch behavior is scheduled to change.
people.groupby(key).transform('mean')


Out[13]:
a b c d e
Joe 0.050642 0.039220 0.280882 -0.901469 -0.473398
Steve -1.413036 -0.022837 -0.322057 -0.078009 -0.792914
Wes 0.050642 0.039220 0.280882 -0.901469 -0.473398
Jim -1.413036 -0.022837 -0.322057 -0.078009 -0.792914
Travis 0.050642 0.039220 0.280882 -0.901469 -0.473398

In [14]:
def demean(arr):
    """Return `arr` shifted so that its mean is subtracted from every element.

    `arr` must provide a `.mean()` method and support subtraction of the
    value that method returns.
    """
    center = arr.mean()
    return arr - center

# Apply `demean` to each group's columns; the result keeps the row labels
# of `people`.
demeaned = people.groupby(by=key).transform(func=demean)
demeaned


Out[14]:
a b c d e
Joe 0.113637 -0.327405 0.579472 -1.554804 -0.681247
Steve 0.883977 -0.764394 -0.853299 0.483189 -0.434331
Wes -0.633743 NaN NaN 1.353266 0.027512
Jim -0.883977 0.764394 0.853299 -0.483189 0.434331
Travis 0.520106 0.327405 -0.579472 0.201538 0.653735

In [15]:
# Sanity check: after demeaning, each group's mean should be zero up to
# floating-point rounding.
demeaned.groupby(by=key).mean()


Out[15]:
a b c d e
one 3.700743e-17 0.0 -5.551115e-17 -7.401487e-17 0.000000e+00
two 0.000000e+00 0.0 0.000000e+00 0.000000e+00 5.551115e-17

In [ ]:


In [3]:
# Map each column name to a colour group. 'f' is not a column of `people`;
# the output shows that unused keys are simply ignored.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
          'd': 'blue', 'e': 'red', 'f': 'orange'}

# `groupby(..., axis=1)` is deprecated in pandas; group the transposed frame
# by its row labels (the original column names) instead, then transpose the
# aggregate back so people are rows and colour groups are columns.
by_column = people.T.groupby(mapping)
by_column.sum().T


Out[3]:
blue red
Joe -1.072349 -3.244332
Steve 0.715276 2.127817
Wes 0.810232 0.514027
Jim -1.583890 3.318706
Travis 1.451777 -0.789477

In [ ]:


In [4]:
# The same column-to-colour mapping, held in a Series rather than a dict.
map_series = pd.Series(mapping)
map_series


Out[4]:
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [5]:
# Count non-NA values per colour group for each person. `axis=1` grouping is
# deprecated in pandas, so group the transposed frame by the Series and
# transpose the counts back to the people-by-colour layout.
people.T.groupby(map_series).count().T


Out[5]:
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3

In [6]:
# A function passed as the key is called on each index label, so this
# groups the rows by the length of each person's name.
people.groupby(by=len).sum()


Out[6]:
a b c d e
3 1.342105 -0.870905 0.360060 -2.206066 0.117201
5 0.013084 1.266295 -0.053526 0.768802 0.848437
6 0.252838 0.727597 0.092327 1.359451 -1.769912

In [7]:
# Explicit per-row labels: the first three rows are 'one', the last two 'two'.
key_list = ['one'] * 3 + ['two'] * 2
# Functions and lists can be mixed as keys: group by name length and by the
# explicit label, then take the column-wise minimum of each group.
people.groupby(by=[len, key_list]).min()


Out[7]:
a b c d e
3 one -0.025513 -2.077310 0.452118 -1.524466 -1.206672
two 1.327967 1.206405 -0.092058 -1.491832 0.784333
5 one 0.013084 1.266295 -0.053526 0.768802 0.848437
6 two 0.252838 0.727597 0.092327 1.359451 -1.769912

In [ ]: