In [ ]:
%run setup_env.py
%matplotlib inline
label - group name
mappingfor name, group in grouped:
print(name)
print(group)
grouped = df.groupby('PK')
grouped.agg({
'A': np.sum,
'B': 'first',
'C': 'last',
'D': 'mean',
})
In [ ]:
df = pd.DataFrame(np.random.randn(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
df['A'] = [np.random.choice(['T', 'A', 'N']) for _ in range(10)]
df['B'] = [np.random.choice(['T', 'A', 'N']) for _ in range(10)]
display_html(df)
df.groupby('A').agg({
'B': 'first',
'C': np.sum,
'D': 'last',
})
In [ ]:
df = pd.DataFrame(np.random.randn(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
df['A'] = [np.random.choice(['T', 'A', 'N']) for _ in range(10)]
df['B'] = [np.random.choice(['T', 'A', 'N']) for _ in range(10)]
display_html(df)
zscore = lambda x: (x - x.mean()) / x.std()
grouped = df.groupby('A')
grouped.transform(zscore)
In [ ]:
log.info('compare transformed with original data')
ts = pd.Series(np.random.normal(0.5, 2, 1100), pd.date_range('2000/1/1', periods=1100))
transformed = ts.groupby(lambda x: x.year).transform(lambda x: (x - x.mean()) / x.std())
pd.DataFrame({'original': ts, 'transformed': transformed}).plot()
In [ ]:
df = pd.DataFrame({'A': np.arange(10), 'B': list('abbbccdddd'), 'C': np.random.randint(0, 10, 10)})
df2 = df.groupby('B').filter(lambda x: x['C'].sum() > 10)
df3 = df.groupby('B').head(2)
display_html(df, df2, df3)
In [ ]:
s = pd.Series(np.random.randn(100))
s.groupby(pd.qcut(s, [0, .25, .5, .75, 1.])).mean()
In [ ]:
from datetime import datetime
df = pd.DataFrame({
'Branch' : 'A A A A A A A B'.split(),
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
'Quantity': [1,3,5,1,8,1,9,3],
'Date' : [
datetime(2013,1,1,13,0),
datetime(2013,1,1,13,5),
datetime(2013,10,1,20,0),
datetime(2013,10,2,10,0),
datetime(2013,10,1,20,0),
datetime(2013,10,2,10,0),
datetime(2013,12,2,12,0),
datetime(2013,12,2,14,0),
]})
df2 = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()
display_html(df2)
df = df.set_index('Date')
df['Date'] = df.index + pd.offsets.MonthBegin(3)
df3 = df.groupby([pd.Grouper(freq='2M',key='Date'),'Buyer']).sum()
display_html(df3)