学习Pandas之透视表(pivotTab)和交叉表(crossTab)



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

%matplotlib inline

一. groupby , pivot_table and crosstab

参考



In [2]:

    
df1 = pd.DataFrame({
    'key1':['a','a','b','b','a'],
    'key2':['one','two','one','two','one'],
    'data1':np.random.randn(5),
    'data2':np.random.randn(5)})
df1

1. groupby



In [3]:

    
# (1) groupby one column
df1_grouped_by_key1 = df1.groupby(by='key1')
print(type(df1_grouped_by_key1))
# list(df1_grouped_by_key1)[0][1]
dict(list(df1_grouped_by_key1))['a']['data1']









    



<class 'pandas.core.groupby.DataFrameGroupBy'>






    Out[3]:





0    1.245835
1   -0.785056
4    0.035891
Name: data1, dtype: float64



In [4]:

    
df1_grouped_by_key1.describe()



In [5]:

    
print([x for x in df1_grouped_by_key1])
df1_grouped_by_key1.count()









    



[('a',       data1     data2 key1 key2
0  1.245835  0.101619    a  one
1 -0.785056 -0.661818    a  two
4  0.035891 -0.869209    a  one), ('b',       data1     data2 key1 key2
2 -1.769749 -0.597338    b  one
3 -1.212648 -2.601815    b  two)]






    Out[5]:







  
    
      
      data1
      data2
      key2
    
    
      key1
      
      
      
    
  
  
    
      a
      3
      3
      3
    
    
      b
      2
      2
      2



In [6]:

    
# (2) groupby multiple columns
df1_grouped_by_key1_key2 = df1.groupby(by=['key1', 'key2'])
df1_grouped_by_key1_key2









    Out[6]:





<pandas.core.groupby.DataFrameGroupBy object at 0x7fcc87498f60>



In [7]:

    
print([x for x in df1_grouped_by_key1_key2])
df1_grouped_by_key1_key2.count()









    



[(('a', 'one'),       data1     data2 key1 key2
0  1.245835  0.101619    a  one
4  0.035891 -0.869209    a  one), (('a', 'two'),       data1     data2 key1 key2
1 -0.785056 -0.661818    a  two), (('b', 'one'),       data1     data2 key1 key2
2 -1.769749 -0.597338    b  one), (('b', 'two'),       data1     data2 key1 key2
3 -1.212648 -2.601815    b  two)]






    Out[7]:







  
    
      
      
      data1
      data2
    
    
      key1
      key2
      
      
    
  
  
    
      a
      one
      2
      2
    
    
      two
      1
      1
    
    
      b
      one
      1
      1
    
    
      two
      1
      1



In [8]:

    
# (3) agg on grouped
df1_grouped_by_key1['data1'].agg('sum')









    Out[8]:





key1
a    0.496669
b   -2.982397
Name: data1, dtype: float64



In [9]:

    
df1_grouped_by_key1_key2['data1', 'data2'].agg(['mean', 'sum'])



In [10]:

    
# (4) apply on grouped
# the distinguish between agg and apply is 'apply' can action on all 'func'
# agg: mean, sum, count and so on, apply can work with user define func
df1_grouped_by_key1.agg('mean')



In [11]:

    
# sample above 'agg'
df1_grouped_by_key1.apply(np.mean)



In [12]:

    
df1_grouped_by_key1_key2.agg('mean')



In [13]:

    
# sample above 'agg'
df1_grouped_by_key1_key2.apply(np.mean)

2. pivot_table & pivot



In [14]:

    
df1



In [15]:

    
# (1) Stat. by group row and column
df1.pivot_table(index='key1', columns='key2')
# df1.pivot_table(index='key1', columns='key2', values='data1')



In [16]:

    
# (2) Stat. sumary
df1.pivot_table(index='key1', columns='key2', margins=True)



In [17]:

    
# (3) distinguish: pivot for reshape, row <-> column
# https://nikgrozev.com/2015/07/01/reshaping-in-pandas-pivot-pivot-table-stack-and-unstack-explained-with-pictures/
# Index contains duplicate entries, cannot reshape
# df1.pivot(index='key1', columns='key2')

# if change Item[2]: Item1 --> Item0, will error: Index contains duplicate entries, cannot reshape
# so pivot_table can solve this problem

from collections import OrderedDict
table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item1', 'Item1']),
    ('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD',  ['1$', '2$', '3$', '4$']),
    ('EU',   ['1€', '2€', '3€', '4€'])
))
df2 = pd.DataFrame(table)
df2



In [18]:

    
df2.groupby(by=['Item', 'CType']).count()



In [19]:

    
df2_piv1_item = df2.pivot(index='Item', columns='CType', values='USD')
df2_piv1_item









    Out[19]:







  
    
      CType
      Bronze
      Gold
      Silver
    
    
      Item
      
      
      
    
  
  
    
      Item0
      2$
      1$
      None
    
    
      Item1
      None
      3$
      4$



In [20]:

    
df2_piv2_item = df2.pivot(index='Item', columns='CType')
df2_piv2_item









    Out[20]:







  
    
      
      USD
      EU
    
    
      CType
      Bronze
      Gold
      Silver
      Bronze
      Gold
      Silver
    
    
      Item
      
      
      
      
      
      
    
  
  
    
      Item0
      2$
      1$
      None
      2€
      1€
      None
    
    
      Item1
      None
      3$
      4$
      None
      3€
      4€



In [21]:

    
# make error test, change Item1 -> Item0
table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item0', 'Item1']),
    ('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD',  ['1', '2', '3', '4']),
    ('EU',   ['1', '2', '3', '4'])
))

df2_2 = pd.DataFrame(table)
df2_2



In [22]:

    
# Item0 and Gold --> 2, 2
df2_2.groupby(by=['Item', 'CType']).count()



In [23]:

    
# Error: Index contains duplicate entries, cannot reshape
# df2_2_piv2_item = df2_2.pivot(index='Item', columns='CType')
# df2_2_piv2_item

# using pivot_table instead, aggregate these duplicate entries, must assign the aggfunc
# Error: No numeric types to aggregate if aggfunc=np.mean, why ?
df2_2_piv_item = df2_2.pivot_table(index='Item', columns='CType', aggfunc=np.max)
df2_2_piv_item









    Out[23]:







  
    
      
      EU
      USD
    
    
      CType
      Bronze
      Gold
      Silver
      Bronze
      Gold
      Silver
    
    
      Item
      
      
      
      
      
      
    
  
  
    
      Item0
      2
      3
      None
      2
      3
      None
    
    
      Item1
      None
      None
      4
      None
      None
      4

3. crosstab



In [24]:

    
# (1) cross tab is-a special pivot table that calculate the frequences of grouped 
pd.crosstab(index=df1.key1, columns=df1.key2, margins=True)



In [25]:

    
df1.pivot_table(index=['key1'], columns=['key2'], margins=True, aggfunc=len)



In [26]:

    
# (2)
pd.crosstab(index=df1.key1, columns=df1.key2, values=df1.data1, aggfunc=np.min)

二. 过滤

参考

1. loc



In [27]:

    
df1



In [28]:

    
df1.loc[df1.key1=='a']

2.query



In [29]:

    
df1.query('data1 > -0.4 & data1 < -0.1')









    



/usr/local/lib/python3.4/dist-packages/pandas/core/computation/check.py:17: UserWarning: The installed version of numexpr 2.2.2 is not supported in pandas and will be not be used
The minimum supported version is 2.4.6

  ver=ver, min_ver=_MIN_NUMEXPR_VERSION), UserWarning)






    Out[29]:







  
    
      
      data1
      data2
      key1
      key2

	data1								data2
	count	mean	std	min	25%	50%	75%	max	count	mean	std	min	25%	50%	75%	max
key1
a	3.0	0.165556	1.021636	-0.785056	-0.374583	0.035891	0.640863	1.245835	3.0	-0.476469	0.511265	-0.869209	-0.765513	-0.661818	-0.280099	0.101619
b	2.0	-1.491199	0.393930	-1.769749	-1.630474	-1.491199	-1.351923	-1.212648	2.0	-1.599576	1.417379	-2.601815	-2.100695	-1.599576	-1.098457	-0.597338

	data1	data2
key1
a	0.165556	-0.476469
b	-1.491199	-1.599576

	data1	data2
key1
a	0.165556	-0.476469
b	-1.491199	-1.599576

	data1			data2
key2	one	two	All	one	two	All
key1
a	0.640863	-0.785056	0.165556	-0.383795	-0.661818	-0.476469
b	-1.769749	-1.212648	-1.491199	-0.597338	-2.601815	-1.599576
All	-0.162674	-0.998852	-0.497146	-0.454976	-1.631816	-0.925712

	data1			data2
key2	one	two	All	one	two	All
key1
a	2.0	1.0	3.0	2.0	1.0	3.0
b	1.0	1.0	2.0	1.0	1.0	2.0
All	3.0	2.0	5.0	3.0	2.0	5.0

	Item	CType	USD	EU
0	Item0	Gold	1$	1€
1	Item0	Bronze	2$	2€
2	Item1	Gold	3$	3€
3	Item1	Silver	4$	4€

	USD			EU
CType	Bronze	Gold	Silver	Bronze	Gold	Silver
Item
Item0	2$	1$	None	2€	1€	None
Item1	None	3$	4$	None	3€	4€