学习Pandas之透视表(pivotTab)和交叉表(crossTab)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

%matplotlib inline

一. groupby , pivot_table and crosstab

参考


In [2]:
df1 = pd.DataFrame({
    'key1':['a','a','b','b','a'],
    'key2':['one','two','one','two','one'],
    'data1':np.random.randn(5),
    'data2':np.random.randn(5)})
df1


Out[2]:
data1 data2 key1 key2
0 1.245835 0.101619 a one
1 -0.785056 -0.661818 a two
2 -1.769749 -0.597338 b one
3 -1.212648 -2.601815 b two
4 0.035891 -0.869209 a one

1. groupby


In [3]:
# (1) groupby one column
df1_grouped_by_key1 = df1.groupby(by='key1')
print(type(df1_grouped_by_key1))
# list(df1_grouped_by_key1)[0][1]
dict(list(df1_grouped_by_key1))['a']['data1']


<class 'pandas.core.groupby.DataFrameGroupBy'>
Out[3]:
0    1.245835
1   -0.785056
4    0.035891
Name: data1, dtype: float64

In [4]:
df1_grouped_by_key1.describe()


Out[4]:
data1 data2
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
key1
a 3.0 0.165556 1.021636 -0.785056 -0.374583 0.035891 0.640863 1.245835 3.0 -0.476469 0.511265 -0.869209 -0.765513 -0.661818 -0.280099 0.101619
b 2.0 -1.491199 0.393930 -1.769749 -1.630474 -1.491199 -1.351923 -1.212648 2.0 -1.599576 1.417379 -2.601815 -2.100695 -1.599576 -1.098457 -0.597338

In [5]:
print([x for x in df1_grouped_by_key1])
df1_grouped_by_key1.count()


[('a',       data1     data2 key1 key2
0  1.245835  0.101619    a  one
1 -0.785056 -0.661818    a  two
4  0.035891 -0.869209    a  one), ('b',       data1     data2 key1 key2
2 -1.769749 -0.597338    b  one
3 -1.212648 -2.601815    b  two)]
Out[5]:
data1 data2 key2
key1
a 3 3 3
b 2 2 2

In [6]:
# (2) groupby multiple columns
df1_grouped_by_key1_key2 = df1.groupby(by=['key1', 'key2'])
df1_grouped_by_key1_key2


Out[6]:
<pandas.core.groupby.DataFrameGroupBy object at 0x7fcc87498f60>

In [7]:
print([x for x in df1_grouped_by_key1_key2])
df1_grouped_by_key1_key2.count()


[(('a', 'one'),       data1     data2 key1 key2
0  1.245835  0.101619    a  one
4  0.035891 -0.869209    a  one), (('a', 'two'),       data1     data2 key1 key2
1 -0.785056 -0.661818    a  two), (('b', 'one'),       data1     data2 key1 key2
2 -1.769749 -0.597338    b  one), (('b', 'two'),       data1     data2 key1 key2
3 -1.212648 -2.601815    b  two)]
Out[7]:
data1 data2
key1 key2
a one 2 2
two 1 1
b one 1 1
two 1 1

In [8]:
# (3) agg on grouped
df1_grouped_by_key1['data1'].agg('sum')


Out[8]:
key1
a    0.496669
b   -2.982397
Name: data1, dtype: float64

In [9]:
df1_grouped_by_key1_key2['data1', 'data2'].agg(['mean', 'sum'])


Out[9]:
data1 data2
mean sum mean sum
key1 key2
a one 0.640863 1.281726 -0.383795 -0.767590
two -0.785056 -0.785056 -0.661818 -0.661818
b one -1.769749 -1.769749 -0.597338 -0.597338
two -1.212648 -1.212648 -2.601815 -2.601815

In [10]:
# (4) apply on grouped
# the distinguish between agg and apply is 'apply' can action on all 'func'
# agg: mean, sum, count and so on, apply can work with user define func
df1_grouped_by_key1.agg('mean')


Out[10]:
data1 data2
key1
a 0.165556 -0.476469
b -1.491199 -1.599576

In [11]:
# sample above 'agg'
df1_grouped_by_key1.apply(np.mean)


Out[11]:
data1 data2
key1
a 0.165556 -0.476469
b -1.491199 -1.599576

In [12]:
df1_grouped_by_key1_key2.agg('mean')


Out[12]:
data1 data2
key1 key2
a one 0.640863 -0.383795
two -0.785056 -0.661818
b one -1.769749 -0.597338
two -1.212648 -2.601815

In [13]:
# sample above 'agg'
df1_grouped_by_key1_key2.apply(np.mean)


Out[13]:
data1 data2
key1 key2
a one 0.640863 -0.383795
two -0.785056 -0.661818
b one -1.769749 -0.597338
two -1.212648 -2.601815

2. pivot_table & pivot


In [14]:
df1


Out[14]:
data1 data2 key1 key2
0 1.245835 0.101619 a one
1 -0.785056 -0.661818 a two
2 -1.769749 -0.597338 b one
3 -1.212648 -2.601815 b two
4 0.035891 -0.869209 a one

In [15]:
# (1) Stat. by group row and column
df1.pivot_table(index='key1', columns='key2')
# df1.pivot_table(index='key1', columns='key2', values='data1')


Out[15]:
data1 data2
key2 one two one two
key1
a 0.640863 -0.785056 -0.383795 -0.661818
b -1.769749 -1.212648 -0.597338 -2.601815

In [16]:
# (2) Stat. sumary
df1.pivot_table(index='key1', columns='key2', margins=True)


Out[16]:
data1 data2
key2 one two All one two All
key1
a 0.640863 -0.785056 0.165556 -0.383795 -0.661818 -0.476469
b -1.769749 -1.212648 -1.491199 -0.597338 -2.601815 -1.599576
All -0.162674 -0.998852 -0.497146 -0.454976 -1.631816 -0.925712

In [17]:
# (3) distinguish: pivot for reshape, row <-> column
# https://nikgrozev.com/2015/07/01/reshaping-in-pandas-pivot-pivot-table-stack-and-unstack-explained-with-pictures/
# Index contains duplicate entries, cannot reshape
# df1.pivot(index='key1', columns='key2')

# if change Item[2]: Item1 --> Item0, will error: Index contains duplicate entries, cannot reshape
# so pivot_table can solve this problem

from collections import OrderedDict
table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item1', 'Item1']),
    ('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD',  ['1$', '2$', '3$', '4$']),
    ('EU',   ['1€', '2€', '3€', '4€'])
))
df2 = pd.DataFrame(table)
df2


Out[17]:
Item CType USD EU
0 Item0 Gold 1$ 1€
1 Item0 Bronze 2$ 2€
2 Item1 Gold 3$ 3€
3 Item1 Silver 4$ 4€

In [18]:
df2.groupby(by=['Item', 'CType']).count()


Out[18]:
USD EU
Item CType
Item0 Bronze 1 1
Gold 1 1
Item1 Gold 1 1
Silver 1 1

In [19]:
df2_piv1_item = df2.pivot(index='Item', columns='CType', values='USD')
df2_piv1_item


Out[19]:
CType Bronze Gold Silver
Item
Item0 2$ 1$ None
Item1 None 3$ 4$

In [20]:
df2_piv2_item = df2.pivot(index='Item', columns='CType')
df2_piv2_item


Out[20]:
USD EU
CType Bronze Gold Silver Bronze Gold Silver
Item
Item0 2$ 1$ None 2€ 1€ None
Item1 None 3$ 4$ None 3€ 4€

In [21]:
# make error test, change Item1 -> Item0
table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item0', 'Item1']),
    ('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD',  ['1', '2', '3', '4']),
    ('EU',   ['1', '2', '3', '4'])
))

df2_2 = pd.DataFrame(table)
df2_2


Out[21]:
Item CType USD EU
0 Item0 Gold 1 1
1 Item0 Bronze 2 2
2 Item0 Gold 3 3
3 Item1 Silver 4 4

In [22]:
# Item0 and Gold --> 2, 2
df2_2.groupby(by=['Item', 'CType']).count()


Out[22]:
USD EU
Item CType
Item0 Bronze 1 1
Gold 2 2
Item1 Silver 1 1

In [23]:
# Error: Index contains duplicate entries, cannot reshape
# df2_2_piv2_item = df2_2.pivot(index='Item', columns='CType')
# df2_2_piv2_item

# using pivot_table instead, aggregate these duplicate entries, must assign the aggfunc
# Error: No numeric types to aggregate if aggfunc=np.mean, why ?
df2_2_piv_item = df2_2.pivot_table(index='Item', columns='CType', aggfunc=np.max)
df2_2_piv_item


Out[23]:
EU USD
CType Bronze Gold Silver Bronze Gold Silver
Item
Item0 2 3 None 2 3 None
Item1 None None 4 None None 4

3. crosstab


In [24]:
# (1) cross tab is-a special pivot table that calculate the frequences of grouped 
pd.crosstab(index=df1.key1, columns=df1.key2, margins=True)


Out[24]:
key2 one two All
key1
a 2 1 3
b 1 1 2
All 3 2 5

In [25]:
df1.pivot_table(index=['key1'], columns=['key2'], margins=True, aggfunc=len)


Out[25]:
data1 data2
key2 one two All one two All
key1
a 2.0 1.0 3.0 2.0 1.0 3.0
b 1.0 1.0 2.0 1.0 1.0 2.0
All 3.0 2.0 5.0 3.0 2.0 5.0

In [26]:
# (2)
pd.crosstab(index=df1.key1, columns=df1.key2, values=df1.data1, aggfunc=np.min)


Out[26]:
key2 one two
key1
a 0.035891 -0.785056
b -1.769749 -1.212648

二. 过滤

参考

1. loc


In [27]:
df1


Out[27]:
data1 data2 key1 key2
0 1.245835 0.101619 a one
1 -0.785056 -0.661818 a two
2 -1.769749 -0.597338 b one
3 -1.212648 -2.601815 b two
4 0.035891 -0.869209 a one

In [28]:
df1.loc[df1.key1=='a']


Out[28]:
data1 data2 key1 key2
0 1.245835 0.101619 a one
1 -0.785056 -0.661818 a two
4 0.035891 -0.869209 a one

2.query


In [29]:
df1.query('data1 > -0.4 & data1 < -0.1')


/usr/local/lib/python3.4/dist-packages/pandas/core/computation/check.py:17: UserWarning: The installed version of numexpr 2.2.2 is not supported in pandas and will be not be used
The minimum supported version is 2.4.6

  ver=ver, min_ver=_MIN_NUMEXPR_VERSION), UserWarning)
Out[29]:
data1 data2 key1 key2