In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pandas import DataFrame, Series

In [2]:
# Build a 1000-row frame whose two columns are separate standard-normal draws.
frame = DataFrame({column: np.random.randn(1000)
                   for column in ('data1', 'data2')})
# Cut data1's observed range into 4 equal-width bins; each row gets its bin label.
factor = pd.cut(frame['data1'], bins=4)
factor.head(10)


Out[2]:
0    (-1.711, -0.142]
1    (-1.711, -0.142]
2    (-1.711, -0.142]
3     (-0.142, 1.427]
4      (1.427, 2.996]
5    (-1.711, -0.142]
6    (-1.711, -0.142]
7    (-1.711, -0.142]
8     (-0.142, 1.427]
9    (-1.711, -0.142]
Name: data1, dtype: category
Categories (4, object): [(-3.286, -1.711] < (-1.711, -0.142] < (-0.142, 1.427] < (1.427, 2.996]]

In [5]:
def get_stats(group):
    """Summarise one group of values.

    Returns a dict holding the group's smallest value ('min'), largest value
    ('max'), the result of ``group.count()`` ('count') and its mean ('mean').
    """
    summary = {}
    summary['min'] = group.min()
    summary['max'] = group.max()
    summary['count'] = group.count()
    summary['mean'] = group.mean()
    return summary

# Bucket data2 by the data1 bins, then pivot the per-bin stats into columns.
grouped = frame['data2'].groupby(factor)
grouped.apply(get_stats).unstack()


Out[5]:
count max mean min
data1
(-3.286, -1.711] 36.0 1.429755 0.133470 -1.841987
(-1.711, -0.142] 413.0 2.706651 -0.029715 -3.052640
(-0.142, 1.427] 454.0 3.396982 -0.055334 -2.835381
(1.427, 2.996] 97.0 2.247663 -0.085518 -2.458182

In [6]:
# labels=False makes qcut return the integer number of each quantile bucket
# instead of an interval label.
grouping = pd.qcut(frame['data1'], q=10, labels=False)
grouped = frame['data2'].groupby(grouping)
grouped.apply(get_stats).unstack()


Out[6]:
count max mean min
data1
0 100.0 2.281205 -0.014777 -1.841987
1 100.0 2.044891 -0.081777 -3.052640
2 100.0 2.029033 -0.069251 -1.902880
3 100.0 2.706651 0.017314 -1.686695
4 100.0 3.139268 0.178949 -1.903161
5 100.0 3.142929 -0.267838 -2.821386
6 100.0 2.158538 -0.065632 -1.945404
7 100.0 2.321973 -0.095375 -2.125057
8 100.0 3.396982 0.057176 -2.835381
9 100.0 2.247663 -0.067630 -2.458182

In [8]:
# Six random values with every other entry (positions 0, 2, 4) set to NaN.
s = Series(np.random.randn(6))
s.iloc[::2] = np.nan
s


Out[8]:
0         NaN
1   -0.913053
2         NaN
3    0.717628
4         NaN
5    0.511689
dtype: float64

In [9]:
# Replace each missing entry with the mean of the non-missing ones.
s.fillna(value=s.mean())


Out[9]:
0    0.105421
1   -0.913053
2    0.105421
3    0.717628
4    0.105421
5    0.511689
dtype: float64

In [10]:
# The first four states are tagged East and the last four West (see group_key).
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(len(states)), index=states)
# Mark three of the states as missing.
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan
data


Out[10]:
Ohio         -0.033187
New York      2.496403
Vermont            NaN
Florida      -1.350902
Oregon        1.325042
Nevada             NaN
California    1.603818
Idaho              NaN
dtype: float64

In [11]:
# Per-region mean; missing values are left out of each average.
data.groupby(by=group_key).mean()


Out[11]:
East    0.370771
West    1.464430
dtype: float64

In [12]:
def fill_mean(g):
    """Return ``g`` with its missing values replaced by ``g``'s own mean."""
    return g.fillna(g.mean())
# Fill each region's gaps with that region's mean.
# group_keys=False keeps the state labels as the only index level. Without it,
# pandas >= 2.0 prepends the East/West key to the index of this same-shaped
# result, so the output would no longer be a flat Series.
data.groupby(group_key, group_keys=False).apply(fill_mean)


Out[12]:
Ohio         -0.033187
New York      2.496403
Vermont       0.370771
Florida      -1.350902
Oregon        1.325042
Nevada        1.464430
California    1.603818
Idaho         1.464430
dtype: float64

In [13]:
# Fixed replacement value for each region.
fill_values = {'East': 0.5, 'West': -1}

def fill_func(g):
    """Fill the gaps in group ``g`` with the value stored under ``g.name``."""
    return g.fillna(fill_values[g.name])
# Fill each region's gaps with the fixed value configured for that region.
# group_keys=False keeps the state labels as the only index level. Without it,
# pandas >= 2.0 prepends the East/West key to the index of this same-shaped
# result, so the output would no longer be a flat Series.
data.groupby(group_key, group_keys=False).apply(fill_func)


Out[13]:
Ohio         -0.033187
New York      2.496403
Vermont       0.500000
Florida      -1.350902
Oregon        1.325042
Nevada       -1.000000
California    1.603818
Idaho        -1.000000
dtype: float64

In [14]:
# Suits: Hearts (H), Spades (S), Clubs (C), Diamonds (D)
suits = ['H', 'S', 'C', 'D']
# Per suit: ace through ten are worth 1-10 and the three face cards 10 each.
# list(...) is needed because range() is not a list on Python 3 and cannot be
# concatenated with one.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in suits:
    # Card label is the rank followed by the suit letter, e.g. '10H'.
    cards.extend(str(num) + suit for num in base_names)
# Map every card label to its point value.
deck = Series(card_val, index=cards)
deck[:13]


Out[14]:
AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [15]:
def draw(deck, n=5):
    """Pick ``n`` entries of ``deck`` at random, without repeats.

    A random permutation of the positions ``0 .. len(deck) - 1`` is generated
    with numpy's global random state, and the entries at the first ``n`` of
    those positions are returned via ``deck.take``.
    """
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(chosen)
# Deal one five-card hand (5 is draw's default for n).
draw(deck, n=5)


Out[15]:
AD    1
2H    2
7H    7
4D    4
8S    8
dtype: int64

In [16]:
def get_suit(card):
    """Return the suit letter of a card label, i.e. its last character."""
    return card[-1]

# Draw two cards from every suit; the suit letter becomes the outer index level.
deck.groupby(get_suit, group_keys=True).apply(draw, n=2)


Out[16]:
C  QC    10
   2C     2
D  4D     4
   KD    10
H  KH    10
   QH    10
S  JS    10
   5S     5
dtype: int64

In [18]:
# Alternative: group_keys=False leaves the suit level out of the result's index.
deck.groupby(by=get_suit, group_keys=False).apply(draw, n=2)


Out[18]:
7C      7
JC     10
JD     10
10D    10
10H    10
9H      9
3S      3
10S    10
dtype: int64

In [ ]: