In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline
In [2]:
df1 = pd.DataFrame({
'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df1
Out[2]:
In [3]:
# (1) groupby one column
df1_grouped_by_key1 = df1.groupby(by='key1')
print(type(df1_grouped_by_key1))
# list(df1_grouped_by_key1)[0][1]
dict(list(df1_grouped_by_key1))['a']['data1']
Out[3]:
In [4]:
df1_grouped_by_key1.describe()
Out[4]:
In [5]:
print([x for x in df1_grouped_by_key1])
df1_grouped_by_key1.count()
Out[5]:
In [6]:
# (2) groupby multiple columns
df1_grouped_by_key1_key2 = df1.groupby(by=['key1', 'key2'])
df1_grouped_by_key1_key2
Out[6]:
In [7]:
print([x for x in df1_grouped_by_key1_key2])
df1_grouped_by_key1_key2.count()
Out[7]:
In [8]:
# (3) agg on grouped
df1_grouped_by_key1['data1'].agg('sum')
Out[8]:
In [9]:
df1_grouped_by_key1_key2['data1', 'data2'].agg(['mean', 'sum'])
Out[9]:
In [10]:
# (4) apply on grouped
# the distinguish between agg and apply is 'apply' can action on all 'func'
# agg: mean, sum, count and so on, apply can work with user define func
df1_grouped_by_key1.agg('mean')
Out[10]:
In [11]:
# sample above 'agg'
df1_grouped_by_key1.apply(np.mean)
Out[11]:
In [12]:
df1_grouped_by_key1_key2.agg('mean')
Out[12]:
In [13]:
# sample above 'agg'
df1_grouped_by_key1_key2.apply(np.mean)
Out[13]:
In [14]:
df1
Out[14]:
In [15]:
# (1) Stat. by group row and column
df1.pivot_table(index='key1', columns='key2')
# df1.pivot_table(index='key1', columns='key2', values='data1')
Out[15]:
In [16]:
# (2) Stat. sumary
df1.pivot_table(index='key1', columns='key2', margins=True)
Out[16]:
In [17]:
# (3) distinguish: pivot for reshape, row <-> column
# https://nikgrozev.com/2015/07/01/reshaping-in-pandas-pivot-pivot-table-stack-and-unstack-explained-with-pictures/
# Index contains duplicate entries, cannot reshape
# df1.pivot(index='key1', columns='key2')
# if change Item[2]: Item1 --> Item0, will error: Index contains duplicate entries, cannot reshape
# so pivot_table can solve this problem
from collections import OrderedDict
table = OrderedDict((
("Item", ['Item0', 'Item0', 'Item1', 'Item1']),
('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
('USD', ['1$', '2$', '3$', '4$']),
('EU', ['1€', '2€', '3€', '4€'])
))
df2 = pd.DataFrame(table)
df2
Out[17]:
In [18]:
df2.groupby(by=['Item', 'CType']).count()
Out[18]:
In [19]:
df2_piv1_item = df2.pivot(index='Item', columns='CType', values='USD')
df2_piv1_item
Out[19]:
In [20]:
df2_piv2_item = df2.pivot(index='Item', columns='CType')
df2_piv2_item
Out[20]:
In [21]:
# make error test, change Item1 -> Item0
table = OrderedDict((
("Item", ['Item0', 'Item0', 'Item0', 'Item1']),
('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
('USD', ['1', '2', '3', '4']),
('EU', ['1', '2', '3', '4'])
))
df2_2 = pd.DataFrame(table)
df2_2
Out[21]:
In [22]:
# Item0 and Gold --> 2, 2
df2_2.groupby(by=['Item', 'CType']).count()
Out[22]:
In [23]:
# Error: Index contains duplicate entries, cannot reshape
# df2_2_piv2_item = df2_2.pivot(index='Item', columns='CType')
# df2_2_piv2_item
# using pivot_table instead, aggregate these duplicate entries, must assign the aggfunc
# Error: No numeric types to aggregate if aggfunc=np.mean, why ?
df2_2_piv_item = df2_2.pivot_table(index='Item', columns='CType', aggfunc=np.max)
df2_2_piv_item
Out[23]:
In [24]:
# (1) cross tab is-a special pivot table that calculate the frequences of grouped
pd.crosstab(index=df1.key1, columns=df1.key2, margins=True)
Out[24]:
In [25]:
df1.pivot_table(index=['key1'], columns=['key2'], margins=True, aggfunc=len)
Out[25]:
In [26]:
# (2)
pd.crosstab(index=df1.key1, columns=df1.key2, values=df1.data1, aggfunc=np.min)
Out[26]:
In [27]:
df1
Out[27]:
In [28]:
df1.loc[df1.key1=='a']
Out[28]:
In [29]:
df1.query('data1 > -0.4 & data1 < -0.1')
Out[29]: