このノートブックは、pandasのドキュメントのCookbookを写経したものです。
In [1]:
import functools
import itertools
import numpy as np
import pandas as pd
In [2]:
df = pd.DataFrame({
'AAA': [4, 5, 6, 7],
'BBB': [10, 20, 30, 40],
'CCC': [100, 50, -30, -50]
})
df
Out[2]:
In [3]:
df.loc[df.AAA >= 5, 'BBB'] = -1
df
Out[3]:
In [4]:
df.loc[df.AAA >= 5, ['BBB', 'CCC']] = 555
df
Out[4]:
In [5]:
df.loc[df.AAA < 5, ['BBB', 'CCC']] = 2000
df
Out[5]:
In [6]:
df_mask = pd.DataFrame({
'AAA': [True] * 4,
'BBB': [False] * 4,
'CCC': [True,False] * 2
})
df.where(df_mask, -1000)
Out[6]:
In [7]:
df = pd.DataFrame({
'AAA': [4, 5, 6, 7],
'BBB': [10, 20, 30, 40],
'CCC': [100, 50, -30, -50]
})
df
Out[7]:
In [8]:
df['logic'] = np.where(df['AAA'] > 5, 'high', 'low')
df
Out[8]:
In [9]:
df = pd.DataFrame({
'AAA': [4, 5, 6, 7],
'BBB': [10, 20, 30, 40],
'CCC': [100, 50, -30, -50]
})
df
Out[9]:
In [10]:
dflow = df[df.AAA <= 5]
dflow
Out[10]:
In [11]:
dfhigh = df[df.AAA > 5]
dfhigh
Out[11]:
In [12]:
df = pd.DataFrame({
'AAA': [4, 5, 6, 7],
'BBB': [10, 20, 30, 40],
'CCC': [100, 50, -30, -50]
})
df
Out[12]:
In [13]:
newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']
newseries
Out[13]:
In [14]:
newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']
newseries
Out[14]:
In [15]:
df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1
df
Out[15]:
In [16]:
df = pd.DataFrame({
'AAA': [4,5,6,7],
'BBB': [10,20,30,40],
'CCC': [100,50,-30,-50]
})
df
Out[16]:
In [17]:
aValue = 43.0
df.loc[(df.CCC - aValue).abs().argsort()]
Out[17]:
In [18]:
df = pd.DataFrame({
'AAA': [4,5,6,7],
'BBB': [10,20,30,40],
'CCC': [100,50,-30,-50]
})
Crit1 = df.AAA <= 5.5
Crit2 = df.BBB == 10.0
Crit3 = df.CCC > -40.0
AllCrit = Crit1 & Crit2 & Crit3
CritList = [Crit1,Crit2,Crit3]
AllCrit = functools.reduce(lambda x,y: x & y, CritList)
df[AllCrit]
Out[18]:
In [19]:
df = pd.DataFrame({
'AAA': [4,5,6,7],
'BBB': [10,20,30,40],
'CCC': [100,50,-30,-50]
})
df
Out[19]:
In [20]:
df[(df.AAA <= 6) & (df.index.isin([0,2,4]))]
Out[20]:
In [21]:
data = {
'AAA': [4,5,6,7],
'BBB': [10,20,30,40],
'CCC': [100,50,-30,-50]
}
df = pd.DataFrame(data=data, index=['foo','bar','boo','kar'])
df
Out[21]:
In [22]:
df.loc['bar': 'kar']
df.iloc[0:3]
df.loc['bar': 'kar']
Out[22]:
In [23]:
df2 = pd.DataFrame(data=data,index=[1,2,3,4])
df2.iloc[1:3]
df2.loc[1:3]
Out[23]:
In [24]:
df = pd.DataFrame({
'AAA': [4,5,6,7],
'BBB': [10,20,30,40],
'CCC': [100,50,-30,-50]
})
df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))]
Out[24]:
In [25]:
rng = pd.date_range('1/1/2013', periods=100, freq='D')
data = np.random.randn(100, 4)
cols = ['A','B','C','D']
df1 = pd.DataFrame(data, rng, cols)
df2 = pd.DataFrame(data, rng, cols)
df3 = pd.DataFrame(data, rng, cols)
pf = pd.Panel({'df1': df1, 'df2': df2, 'df3': df3})
pf
Out[25]:
In [26]:
pf = pf.transpose(2, 0, 1)
pf['E'] = pd.DataFrame(data, rng, cols)
pf = pf.transpose(1, 2, 0)
pf
Out[26]:
In [27]:
pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols)
pf
Out[27]:
In [28]:
df = pd.DataFrame({
'AAA': [1,2,1,3],
'BBB': [1,1,2,2],
'CCC': [2,1,3,1]
})
source_cols = df.columns
new_cols = [str(x) + "_cat" for x in source_cols]
categories = { 1: 'Alpha', 2: 'Beta', 3: 'Charlie' }
df[new_cols] = df[source_cols].applymap(categories.get)
df
Out[28]:
In [29]:
df = pd.DataFrame({
'AAA': [1,1,1,2,2,2,3,3],
'BBB': [2,1,3,4,5,1,2,3]
})
df
Out[29]:
In [30]:
df.loc[df.groupby("AAA")["BBB"].idxmin()]
Out[30]:
In [31]:
df.sort_values(by="BBB").groupby("AAA", as_index=False).first()
Out[31]:
In [32]:
df = pd.DataFrame({
'row': [0,1,2],
'One_X': [1.1,1.1,1.1],
'One_Y': [1.2,1.2,1.2],
'Two_X': [1.11,1.11,1.11],
'Two_Y': [1.22,1.22,1.22]
})
df
Out[32]:
In [33]:
df = df.set_index('row')
df
Out[33]:
In [34]:
df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in df.columns])
df
Out[34]:
In [35]:
df = df.stack(0).reset_index(1)
df
Out[35]:
In [36]:
df.columns = ['Sample', 'All_X', 'All_Y']
In [37]:
cols = pd.MultiIndex.from_tuples([ (x,y) for x in ['A','B','C'] for y in ['O','I']])
df = pd.DataFrame(np.random.randn(2,6), index=['n','m'], columns=cols)
df
Out[37]:
In [38]:
df = df.div(df['C'], level=1)
df
Out[38]:
In [39]:
coords = [('AA','one'), ('AA','six'), ('BB','one'), ('BB','two'), ('BB','six')]
index = pd.MultiIndex.from_tuples(coords)
df = pd.DataFrame([11,22,33,44,55], index, ['MyData'])
df
Out[39]:
In [40]:
df.xs('BB', level=0, axis=0)
Out[40]:
In [41]:
df.xs('six', level=1, axis=0)
Out[41]:
In [42]:
index = list(itertools.product(['Ada','Quinn','Violet'],['Comp','Math','Sci']))
headr = list(itertools.product(['Exams','Labs'],['I','II']))
indx = pd.MultiIndex.from_tuples(index,names=['Student','Course'])
cols = pd.MultiIndex.from_tuples(headr)
data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)]
df = pd.DataFrame(data,indx,cols)
df
Out[42]:
In [43]:
All = slice(None)
df.loc['Violet']
Out[43]:
In [44]:
df.loc[(All, 'Math'), All]
Out[44]:
In [45]:
df.loc[(slice('Ada','Quinn'), 'Math'), All]
Out[45]:
In [46]:
df.loc[(All,'Math'), ('Exams')]
Out[46]:
In [47]:
df.loc[(All,'Math'),(All,'II')]
Out[47]:
In [48]:
df.sort_values(by=('Labs', 'II'), ascending=False)
Out[48]:
In [49]:
df = pd.DataFrame(np.random.randn(6,1), index=pd.date_range('2013-08-01', periods=6, freq='B'), columns=list('A'))
df.loc[df.index[3], 'A'] = np.nan
df
Out[49]:
In [50]:
df.reindex(df.index[::-1]).ffill()
Out[50]:
In [51]:
df = pd.DataFrame({
'animal': 'cat dog cat fish dog cat cat'.split(),
'size': list('SSMMMLL'),
'weight': [8, 10, 11, 1, 20, 12, 12],
'adult' : [False] * 5 + [True] * 2
})
df
Out[51]:
In [52]:
df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()])
Out[52]:
In [53]:
gb = df.groupby(['animal'])
gb.get_group('cat')
Out[53]:
In [54]:
def GrowUp(x):
avg_weight = sum(x[x['size'] == 'S'].weight * 1.5)
avg_weight += sum(x[x['size'] == 'M'].weight * 1.25)
avg_weight += sum(x[x['size'] == 'L'].weight)
avg_weight /= len(x)
return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult'])
expected_df = gb.apply(GrowUp)
expected_df
Out[54]:
In [55]:
S = pd.Series([i / 100.0 for i in range(1, 11)])
def CumRet(x,y):
return x * (1 + y)
def Red(x):
return functools.reduce(CumRet, x, 1.0)
S.expanding().apply(Red)
Out[55]:
In [56]:
df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]})
gb = df.groupby('A')
def replace(g):
mask = g < 0
g.loc[mask] = g[~mask].mean()
return g
gb.transform(replace)
Out[56]:
In [57]:
df = pd.DataFrame({
'code': ['foo', 'bar', 'baz'] * 2,
'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],
'flag': [False, True] * 3
})
code_groups = df.groupby('code')
agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data')
sorted_df = df.loc[agg_n_sort_order.index]
sorted_df
Out[57]:
In [58]:
rng = pd.date_range(start="2014-10-07",periods=10,freq='2min')
ts = pd.Series(data = list(range(10)), index = rng)
def MyCust(x):
if len(x) > 2:
return x[1] * 1.234
return pd.NaT
mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust}
ts.resample("5min").apply(mhc)
ts
Out[58]:
In [59]:
df = pd.DataFrame({
'Color': 'Red Red Red Blue'.split(),
'Value': [100, 150, 50, 50]
})
df['Counts'] = df.groupby(['Color']).transform(len)
df
Out[59]:
In [60]:
df = pd.DataFrame({
'line_race': [10, 10, 8, 10, 10, 8],
'beyer': [99, 102, 103, 103, 88, 100]
}, index=[
'Last Gunfighter',
'Last Gunfighter',
'Last Gunfighter',
'Paynter',
'Paynter',
'Paynter'
])
df
Out[60]:
In [61]:
df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1)
df
Out[61]:
In [62]:
df = pd.DataFrame({
'host': ['other', 'other', 'that', 'this', 'this'],
'service': ['mail', 'web', 'mail', 'mail', 'web'],
'no': [1, 2, 1, 2, 1]
}).set_index(['host', 'service'])
mask = df.groupby(level=0).agg('idxmax')
df_count = df.loc[mask['no']].reset_index()
df_count
Out[62]:
In [63]:
df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A'])
df.A.groupby((df.A != df.A.shift()).cumsum()).groups
df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum()
Out[63]:
In [64]:
df = pd.DataFrame(data={
'Case': ['A','A','A','B','A','A','B','A','A'],
'Data': np.random.randn(9)
})
dfs = list(zip(*df.groupby((1*(df['Case']=='B')).cumsum().rolling(window=3,min_periods=1).median())))[-1]
dfs[0]
Out[64]:
In [65]:
dfs[1]
Out[65]:
In [66]:
dfs[2]
Out[66]:
In [67]:
df = pd.DataFrame(data={
'Province': ['ON','QC','BC','AL','AL','MN','ON'],
'City': ['Toronto','Montreal','Vancouver','Calgary','Edmonton','Winnipeg','Windsor'],
'Sales': [13,6,16,8,4,3,1]
})
table = pd.pivot_table(df,values=['Sales'], index=['Province'], columns=['City'], aggfunc=np.sum, margins=True)
table.stack('City')
Out[67]:
In [68]:
grades = [48,99,75,80,42,80,72,68,36,78]
df = pd.DataFrame({
'ID': ["x%d" % r for r in range(10)],
'Gender': ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'],
'ExamYear': ['2007','2007','2007','2008','2008','2008','2008','2009','2009','2009'],
'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'],
'Participated': ['yes','yes','yes','yes','no','yes','yes','yes','yes','yes'],
'Passed': ['yes' if x > 50 else 'no' for x in grades],
'Employed': [True,True,True,False,False,False,False,True,True,False],
'Grade': grades
})
df.groupby('ExamYear').agg({
'Participated': lambda x: x.value_counts()['yes'],
'Passed': lambda x: sum(x == 'yes'),
'Employed' : lambda x : sum(x),
'Grade' : lambda x : sum(x) / len(x)
})
Out[68]:
In [69]:
df = pd.DataFrame({
'value': np.random.randn(36)
}, index=pd.date_range('2011-01-01', freq='M', periods=36))
pd.pivot_table(df, index=df.index.month, columns=df.index.year, values='value', aggfunc='sum')
Out[69]:
In [70]:
df = pd.DataFrame(data={
'A': [[2,4,8,16],[100,200],[10,20,30]],
'B': [['a','b','c'],['jj','kk'],['ccc']]
}, index=['I','II','III'])
def SeriesFromSubList(aList):
return pd.Series(aList)
df_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in df.iterrows() ]))
In [71]:
df = pd.DataFrame(
data=np.random.randn(2000,2)/10000,
index=pd.date_range('2001-01-01',periods=2000),
columns=['A','B']
)
df
Out[71]:
In [72]:
def gm(aDF,Const):
v = ((((aDF.A + aDF.B) + 1).cumprod()) - 1) * Const
return (aDF.index[0], v.iloc[-1])
S = pd.Series(dict([ gm(df.iloc[i:min(i+51,len(df)-1)],5) for i in range(len(df)-50) ]))
S
Out[72]:
In [73]:
rng = pd.date_range(start='2014-01-01', periods=100)
df = pd.DataFrame({
'Open': np.random.randn(len(rng)),
'Close': np.random.randn(len(rng)),
'Volume': np.random.randint(100,2000,len(rng))
}, index=rng)
df
Out[73]:
In [74]:
def vwap(bars):
return ((bars.Close*bars.Volume).sum()/bars.Volume.sum())
window = 5
s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(len(df)-window) ])
s.round(2)
Out[74]:
In [75]:
rng = pd.date_range('2000-01-01', periods=6)
df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=['A', 'B', 'C'])
df2 = df1.copy()
In [76]:
df = df1.append(df2, ignore_index=True)
df
Out[76]:
In [77]:
df = pd.DataFrame(data={
'Area': ['A'] * 5 + ['C'] * 2,
'Bins': [110] * 2 + [160] * 3 + [40] * 2,
'Test_0': [0, 1, 0, 1, 2, 0, 1],
'Data' : np.random.randn(7)
})
df
Out[77]:
In [78]:
df['Test_1'] = df['Test_0'] - 1
pd.merge(df, df, left_on=['Bins', 'Area','Test_0'], right_on=['Bins', 'Area','Test_1'],suffixes=('_L','_R'))
Out[78]:
In [79]:
df = pd.DataFrame({
'stratifying_var': np.random.uniform(0, 100, 20),
'price': np.random.normal(100, 5, 20)
})
df['quartiles'] = pd.qcut(
df['stratifying_var'],
4,
labels=['0-25%', '25-50%', '50-75%', '75-100%']
)
df.boxplot(column='price', by='quartiles')
Out[79]:
In [ ]:
In [80]:
def expand_grid(data_dict):
rows = itertools.product(*data_dict.values())
return pd.DataFrame.from_records(rows, columns=data_dict.keys())
df = expand_grid({
'height': [60, 70],
'weight': [100, 140, 180],
'sex': ['Male', 'Female']
})
df
Out[80]: