このノートブックは、pandasのドキュメントのCookbookを写経したものです。


In [1]:
import functools
import itertools

import numpy as np
import pandas as pd

Idioms


In [2]:
# Construct the example frame used throughout the idioms section.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[2]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

if-then...


In [3]:
# if-then: set BBB to -1 on every row where AAA >= 5.
df.loc[df['AAA'] >= 5, 'BBB'] = -1

df


Out[3]:
AAA BBB CCC
0 4 10 100
1 5 -1 50
2 6 -1 -30
3 7 -1 -50

In [4]:
# Same if-then, but assigning to two columns at once.
df.loc[df['AAA'] >= 5, ['BBB', 'CCC']] = 555

df


Out[4]:
AAA BBB CCC
0 4 10 100
1 5 555 555
2 6 555 555
3 7 555 555

In [5]:
# Add another line with a different (complementary) condition.
df.loc[df['AAA'] < 5, ['BBB', 'CCC']] = 2000

df


Out[5]:
AAA BBB CCC
0 4 2000 2000
1 5 555 555
2 6 555 555
3 7 555 555

In [6]:
# Build a boolean mask frame, then apply it with where():
# entries where the mask is False are replaced by -1000.
df_mask = pd.DataFrame(
    {
        'AAA': [True, True, True, True],
        'BBB': [False, False, False, False],
        'CCC': [True, False, True, False],
    }
)

df.where(df_mask, -1000)


Out[6]:
AAA BBB CCC
0 4 -1000 2000
1 5 -1000 -1000
2 6 -1000 555
3 7 -1000 -1000

In [7]:
# Reset the example frame to its original values.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[7]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [8]:
# np.where is the vectorized if-then-else: label each row by AAA.
condition = df['AAA'] > 5
df['logic'] = np.where(condition, 'high', 'low')

df


Out[8]:
AAA BBB CCC logic
0 4 10 100 low
1 5 20 50 low
2 6 30 -30 high
3 7 40 -50 high

Splitting


In [9]:
# Fresh copy of the example frame for the splitting examples.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[9]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [10]:
# Split off the rows satisfying the "low" criterion.
dflow = df[df['AAA'] <= 5]

dflow


Out[10]:
AAA BBB CCC
0 4 10 100
1 5 20 50

In [11]:
# The complementary "high" rows.
dfhigh = df[df['AAA'] > 5]

dfhigh


Out[11]:
AAA BBB CCC
2 6 30 -30
3 7 40 -50

Building Criteria


In [12]:
# Fresh copy of the example frame for the criteria examples.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[12]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

And


In [13]:
# Two criteria combined with & (element-wise AND).
crit = (df['BBB'] < 25) & (df['CCC'] >= -40)
newseries = df.loc[crit, 'AAA']

newseries


Out[13]:
0    4
1    5
Name: AAA, dtype: int64

Or


In [14]:
# Same criteria combined with | (element-wise OR).
crit = (df['BBB'] > 25) | (df['CCC'] >= -40)
newseries = df.loc[crit, 'AAA']

newseries


Out[14]:
0    4
1    5
2    6
3    7
Name: AAA, dtype: int64

In [15]:
# An OR criterion used for assignment rather than selection.
crit = (df['BBB'] > 25) | (df['CCC'] >= 75)
df.loc[crit, 'AAA'] = 0.1

df


Out[15]:
AAA BBB CCC
0 0.1 10 100
1 5.0 20 50
2 0.1 30 -30
3 0.1 40 -50

In [16]:
# Fresh copy of the example frame.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[16]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [17]:
# Rows ordered by how close CCC is to a target value.
aValue = 43.0
df.loc[(df['CCC'] - aValue).abs().argsort()]


Out[17]:
AAA BBB CCC
1 5 20 50
0 4 10 100
2 6 30 -30
3 7 40 -50

In [18]:
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

# Three independent boolean criteria.
Crit1 = df['AAA'] <= 5.5
Crit2 = df['BBB'] == 10.0
Crit3 = df['CCC'] > -40.0

# Combined explicitly...
AllCrit = Crit1 & Crit2 & Crit3

# ...or programmatically, by folding & over a list of criteria.
CritList = [Crit1, Crit2, Crit3]
AllCrit = functools.reduce(lambda a, b: a & b, CritList)

df[AllCrit]


Out[18]:
AAA BBB CCC
0 4 10 100

Selection

DataFrames


In [19]:
# Fresh copy of the example frame for the selection examples.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[19]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [20]:
# Rows where AAA <= 6 AND the index label is one of {0, 2, 4}.
df[(df.AAA <= 6) & (df.index.isin([0,2,4]))]


Out[20]:
AAA BBB CCC
0 4 10 100
2 6 30 -30

In [21]:
data = {
    'AAA': [4, 5, 6, 7],
    'BBB': [10, 20, 30, 40],
    'CCC': [100, 50, -30, -50],
}

# Same data, but indexed by string labels instead of integers.
df = pd.DataFrame(data=data, index=['foo', 'bar', 'boo', 'kar'])

df


Out[21]:
AAA BBB CCC
foo 4 10 100
bar 5 20 50
boo 6 30 -30
kar 7 40 -50

In [22]:
# Label slicing with .loc is inclusive of BOTH endpoints...
df.loc['bar': 'kar']

# ...while positional slicing with .iloc excludes the stop position.
df.iloc[0:3]

df.loc['bar': 'kar']


Out[22]:
AAA BBB CCC
bar 5 20 50
boo 6 30 -30
kar 7 40 -50

In [23]:
# With a non-zero-based integer index the distinction matters:
df2 = pd.DataFrame(data=data,index=[1,2,3,4])

# .iloc is purely positional (rows 1 and 2)...
df2.iloc[1:3]

# ...while .loc is label-based and inclusive (labels 1 through 3).
df2.loc[1:3]


Out[23]:
AAA BBB CCC
1 4 10 100
2 5 20 50
3 6 30 -30

In [24]:
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

# Invert the earlier mask with ~ to take the complement rows.
df[~((df['AAA'] <= 6) & df.index.isin([0, 2, 4]))]


Out[24]:
AAA BBB CCC
1 5 20 50
3 7 40 -50

Panels


In [25]:
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in
# pandas 0.25 -- this cell only runs on an old pandas.  The modern
# replacement is a MultiIndex DataFrame (or xarray).
rng = pd.date_range('1/1/2013', periods=100, freq='D')

# One shared random block reused for all three frames.
data = np.random.randn(100, 4)

cols = ['A','B','C','D']

df1 = pd.DataFrame(data, rng, cols)
df2 = pd.DataFrame(data, rng, cols)
df3 = pd.DataFrame(data, rng, cols)

# A Panel is a 3-D container: items x major_axis x minor_axis.
pf = pd.Panel({'df1': df1, 'df2': df2, 'df3': df3})

pf


Out[25]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 4 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to D

In [26]:
# Rotate the axes so the minor axis becomes the item axis,
# append frame 'E' as a new item, then rotate back.
# (Requires old pandas; pd.Panel was removed in 0.25.)
pf = pf.transpose(2, 0, 1)

pf['E'] = pd.DataFrame(data, rng, cols)

pf = pf.transpose(1, 2, 0)

pf


Out[26]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 5 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to E

In [27]:
# Alternative to the double transpose: assign along the minor axis
# directly with .loc.  (Requires old pandas; Panel removed in 0.25.)
pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols)

pf


Out[27]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 6 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to F

New Columns


In [28]:
df = pd.DataFrame(
    {
        'AAA': [1, 2, 1, 3],
        'BBB': [1, 1, 2, 2],
        'CCC': [2, 1, 3, 1],
    }
)

source_cols = df.columns

# One "<col>_cat" label column per original column.
new_cols = [str(col) + "_cat" for col in source_cols]

categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'}

# Map every numeric code to its category label, element-wise.
# (applymap is deprecated in pandas >= 2.1 in favor of DataFrame.map.)
df[new_cols] = df[source_cols].applymap(categories.get)

df


Out[28]:
AAA BBB CCC AAA_cat BBB_cat CCC_cat
0 1 1 2 Alpha Alpha Beta
1 2 1 1 Beta Alpha Alpha
2 1 2 3 Alpha Beta Charlie
3 3 2 1 Charlie Beta Alpha

In [29]:
# Grouped-minimum demo data: three AAA groups, varying BBB.
df = pd.DataFrame(
    {
        'AAA': [1, 1, 1, 2, 2, 2, 3, 3],
        'BBB': [2, 1, 3, 4, 5, 1, 2, 3],
    }
)

df


Out[29]:
AAA BBB
0 1 2
1 1 1
2 1 3
3 2 4
4 2 5
5 2 1
6 3 2
7 3 3

In [30]:
# For each AAA group, the row where BBB attains its minimum
# (idxmin returns the label of the first minimum per group).
df.loc[df.groupby("AAA")["BBB"].idxmin()]


Out[30]:
AAA BBB
1 1 1
5 2 1
6 3 2

In [31]:
# Same result via sort-then-take-first; note the index differs from
# the idxmin version because as_index=False resets it.
df.sort_values(by="BBB").groupby("AAA", as_index=False).first()


Out[31]:
AAA BBB
0 1 1
1 2 1
2 3 2

MultiIndexing


In [32]:
# Columns encode a two-level structure in their "<level1>_<level2>" names.
df = pd.DataFrame(
    {
        'row': [0, 1, 2],
        'One_X': [1.1, 1.1, 1.1],
        'One_Y': [1.2, 1.2, 1.2],
        'Two_X': [1.11, 1.11, 1.11],
        'Two_Y': [1.22, 1.22, 1.22],
    }
)

df


Out[32]:
One_X One_Y Two_X Two_Y row
0 1.1 1.2 1.11 1.22 0
1 1.1 1.2 1.11 1.22 1
2 1.1 1.2 1.11 1.22 2

In [33]:
# Promote the 'row' column to be the index.
df = df.set_index(keys='row')

df


Out[33]:
One_X One_Y Two_X Two_Y
row
0 1.1 1.2 1.11 1.22
1 1.1 1.2 1.11 1.22
2 1.1 1.2 1.11 1.22

In [34]:
# Split each 'One_X'-style name into a ('One', 'X') tuple and build
# a MultiIndex over the columns from those tuples.
tuples = [tuple(name.split('_')) for name in df.columns]
df.columns = pd.MultiIndex.from_tuples(tuples)

df


Out[34]:
One Two
X Y X Y
row
0 1.1 1.2 1.11 1.22
1 1.1 1.2 1.11 1.22
2 1.1 1.2 1.11 1.22

In [35]:
# Stack level 0 of the columns (One/Two) into the rows, then move
# that new inner index level back out as a regular column.
df = df.stack(0).reset_index(1)

df


Out[35]:
level_1 X Y
row
0 One 1.10 1.20
0 Two 1.11 1.22
1 One 1.10 1.20
1 Two 1.11 1.22
2 One 1.10 1.20
2 Two 1.11 1.22

In [36]:
# Give the columns produced by the stack/reset_index step clean names.
df.columns = ['Sample', 'All_X', 'All_Y']

Arithmetic


In [37]:
# 2x6 random frame under a two-level column index:
# letters A..C on the outer level, O/I on the inner level.
pairs = [(letter, oi) for letter in ['A', 'B', 'C'] for oi in ['O', 'I']]
cols = pd.MultiIndex.from_tuples(pairs)

df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols)

df


Out[37]:
A B C
O I O I O I
n 1.691128 -1.218601 -1.430438 -0.188882 -2.510238 1.161971
m -0.862331 -1.001342 0.438665 1.289830 0.319945 -0.289764

In [38]:
# Divide every (letter, O/I) column by the matching 'C' column,
# aligning on the inner level (level=1); the 'C' columns become 1.0.
df = df.div(df['C'], level=1)

df


Out[38]:
A B C
O I O I O I
n -0.673692 -1.048736 0.569842 -0.162553 1.0 1.0
m -2.695246 3.455720 1.371061 -4.451318 1.0 1.0

Slicing


In [39]:
# A small frame indexed by a two-level MultiIndex built from tuples.
coords = [
    ('AA', 'one'),
    ('AA', 'six'),
    ('BB', 'one'),
    ('BB', 'two'),
    ('BB', 'six'),
]

index = pd.MultiIndex.from_tuples(coords)

df = pd.DataFrame([11, 22, 33, 44, 55], index=index, columns=['MyData'])

df


Out[39]:
MyData
AA one 11
six 22
BB one 33
two 44
six 55

In [40]:
# Cross-section on the first index level: all rows under 'BB'.
df.xs('BB', level=0, axis=0)


Out[40]:
MyData
one 33
two 44
six 55

In [41]:
# Cross-section on the second level: every group's 'six' row.
df.xs('six', level=1, axis=0)


Out[41]:
MyData
AA 22
BB 55

In [42]:
# Student x Course rows against (Exams/Labs) x (I/II) columns.
index = list(itertools.product(['Ada', 'Quinn', 'Violet'], ['Comp', 'Math', 'Sci']))
headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II']))

indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course'])
cols = pd.MultiIndex.from_tuples(headr)

# Deterministic fake grades: base 70 plus small row/column offsets.
data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)]

df = pd.DataFrame(data, indx, cols)

df


Out[42]:
Exams Labs
I II I II
Student Course
Ada Comp 70 71 72 73
Math 71 73 75 74
Sci 72 75 75 75
Quinn Comp 73 74 75 76
Math 74 76 78 77
Sci 75 78 78 78
Violet Comp 76 77 78 79
Math 77 79 81 80
Sci 78 81 81 81

In [43]:
# Name the everything-slice once; reads better than repeating slice(None).
All = slice(None)

# A single top-level label drops that index level from the result.
df.loc['Violet']


Out[43]:
Exams Labs
I II I II
Course
Comp 76 77 78 79
Math 77 79 81 80
Sci 78 81 81 81

In [44]:
# All students, only the 'Math' course, all columns.
df.loc[(All, 'Math'), All]


Out[44]:
Exams Labs
I II I II
Student Course
Ada Math 71 73 75 74
Quinn Math 74 76 78 77
Violet Math 77 79 81 80

In [45]:
# Label range on the Student level ('Ada' through 'Quinn', inclusive).
df.loc[(slice('Ada','Quinn'), 'Math'), All]


Out[45]:
Exams Labs
I II I II
Student Course
Ada Math 71 73 75 74
Quinn Math 74 76 78 77

In [46]:
# Restrict the columns to the 'Exams' top level.
df.loc[(All,'Math'), ('Exams')]


Out[46]:
I II
Student Course
Ada Math 71 73
Quinn Math 74 76
Violet Math 77 79

In [47]:
# 'Math' rows crossed with the second-sitting ('II') columns.
df.loc[(All,'Math'),(All,'II')]


Out[47]:
Exams Labs
II II
Student Course
Ada Math 73 74
Quinn Math 76 77
Violet Math 79 80

Sorting


In [48]:
# Sort rows by a single (level-0, level-1) column, descending.
df.sort_values(by=('Labs', 'II'), ascending=False)


Out[48]:
Exams Labs
I II I II
Student Course
Violet Sci 78 81 81 81
Math 77 79 81 80
Comp 76 77 78 79
Quinn Sci 75 78 78 78
Math 74 76 78 77
Comp 73 74 75 76
Ada Sci 72 75 75 75
Math 71 73 75 74
Comp 70 71 72 73

Levels

panelnd

Missing Data


In [49]:
# Business-day indexed random data with one value knocked out to NaN.
bdays = pd.date_range('2013-08-01', periods=6, freq='B')
df = pd.DataFrame(np.random.randn(6, 1), index=bdays, columns=['A'])

df.loc[df.index[3], 'A'] = np.nan

df


Out[49]:
A
2013-08-01 0.004894
2013-08-02 0.266347
2013-08-05 1.299997
2013-08-06 NaN
2013-08-07 0.856117
2013-08-08 1.966193

In [50]:
# Reverse the frame, then forward-fill: equivalent to back-filling the
# original (the NaN takes the value of the next later row).
df.reindex(df.index[::-1]).ffill()


Out[50]:
A
2013-08-08 1.966193
2013-08-07 0.856117
2013-08-06 0.856117
2013-08-05 1.299997
2013-08-02 0.266347
2013-08-01 0.004894

Replace

Grouping


In [51]:
# Grouping demo data: animals with a size class and a weight.
df = pd.DataFrame(
    {
        'animal': ['cat', 'dog', 'cat', 'fish', 'dog', 'cat', 'cat'],
        'size': ['S', 'S', 'M', 'M', 'M', 'L', 'L'],
        'weight': [8, 10, 11, 1, 20, 12, 12],
        'adult': [False, False, False, False, False, True, True],
    }
)

df


Out[51]:
adult animal size weight
0 False cat S 8
1 False dog S 10
2 False cat M 11
3 False fish M 1
4 False dog M 20
5 True cat L 12
6 True cat L 12

In [52]:
# For each animal, the size class of its heaviest individual
# (idxmax gives the row label of the maximum weight per group).
df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()])


Out[52]:
animal
cat     L
dog     M
fish    M
dtype: object

In [53]:
# Keep a reference to the grouped object, then pull out one group.
gb = df.groupby(['animal'])

gb.get_group('cat')


Out[53]:
adult animal size weight
0 False cat S 8
2 False cat M 11
5 True cat L 12
6 True cat L 12

In [54]:
def GrowUp(x):
    """Collapse one animal group to a single adult-equivalent row.

    Weights are scaled by size class (S: x1.5, M: x1.25, L: x1) and
    averaged over the group's rows; the result is always adult size L.
    """
    avg_weight = sum(x[x['size'] == 'S'].weight * 1.5)
    avg_weight += sum(x[x['size'] == 'M'].weight * 1.25)
    avg_weight += sum(x[x['size'] == 'L'].weight)
    avg_weight = avg_weight / len(x)
    return pd.Series(['L', avg_weight, True], index=['size', 'weight', 'adult'])

# Apply the group-collapsing function to each animal group.
expected_df = gb.apply(GrowUp)

expected_df


Out[54]:
size weight adult
animal
cat L 12.4375 True
dog L 20.0000 True
fish L 1.2500 True

In [55]:
# A series of period returns: 1%, 2%, ..., 10%.
S = pd.Series([i / 100.0 for i in range(1, 11)])

def CumRet(x, y):
    """One compounding step: grow x by return y."""
    return x * (1 + y)

def Red(x):
    """Compound every return in x, starting from 1.0."""
    return functools.reduce(CumRet, x, 1.0)

# Expanding window: cumulative compounded growth up to each point.
S.expanding().apply(Red)


Out[55]:
0    1.010000
1    1.030200
2    1.061106
3    1.103550
4    1.158728
5    1.228251
6    1.314229
7    1.419367
8    1.547110
9    1.701821
dtype: float64

In [56]:
df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, -1, 1, 2]})

gb = df.groupby('A')

def replace(g):
    """Replace negatives with the mean of the group's non-negative values."""
    negatives = g < 0
    g.loc[negatives] = g[~negatives].mean()
    return g

gb.transform(replace)


Out[56]:
B
0 1.0
1 1.0
2 1.0
3 2.0

In [57]:
df = pd.DataFrame(
    {
        'code': ['foo', 'bar', 'baz', 'foo', 'bar', 'baz'],
        'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],
        'flag': [False, True, False, True, False, True],
    }
)

# Order the rows by each code group's total, smallest total first.
code_groups = df.groupby('code')

agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data')

sorted_df = df.loc[agg_n_sort_order.index]

sorted_df


Out[57]:
code data flag
1 bar -0.21 True
4 bar -0.59 False
0 foo 0.16 False
3 foo 0.45 True
2 baz 0.33 False
5 baz 0.62 True

In [58]:
rng = pd.date_range(start="2014-10-07", periods=10, freq='2min')

ts = pd.Series(data=list(range(10)), index=rng)

def MyCust(x):
    """1.234 x the second value of the bin; NaT for bins of <= 2 points.

    Uses .iloc for positional access: plain integer [] on a
    datetime-indexed Series was deprecated and is label-based in
    modern pandas, so x[1] would raise a KeyError there.
    """
    if len(x) > 2:
        return x.iloc[1] * 1.234
    return pd.NaT

# Several named aggregations per 5-minute bin.
mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust}

ts.resample("5min").apply(mhc)

ts


Out[58]:
2014-10-07 00:00:00    0
2014-10-07 00:02:00    1
2014-10-07 00:04:00    2
2014-10-07 00:06:00    3
2014-10-07 00:08:00    4
2014-10-07 00:10:00    5
2014-10-07 00:12:00    6
2014-10-07 00:14:00    7
2014-10-07 00:16:00    8
2014-10-07 00:18:00    9
Freq: 2T, dtype: int64

In [59]:
df = pd.DataFrame(
    {
        'Color': ['Red', 'Red', 'Red', 'Blue'],
        'Value': [100, 150, 50, 50],
    }
)

# Attach each row's group size as a new column.
df['Counts'] = df.groupby(['Color']).transform(len)

df


Out[59]:
Color Value Counts
0 Red 100 3
1 Red 150 3
2 Red 50 3
3 Blue 50 1

In [60]:
# Horse-race results, indexed by horse name (repeated labels).
horses = ['Last Gunfighter'] * 3 + ['Paynter'] * 3

df = pd.DataFrame(
    {
        'line_race': [10, 10, 8, 10, 10, 8],
        'beyer': [99, 102, 103, 103, 88, 100],
    },
    index=horses,
)

df


Out[60]:
beyer line_race
Last Gunfighter 99 10
Last Gunfighter 102 10
Last Gunfighter 103 8
Paynter 103 10
Paynter 88 10
Paynter 100 8

In [61]:
# Shift beyer within each horse (index level 0), not across horses,
# so each horse's first row gets NaN rather than another horse's value.
df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift()

df


Out[61]:
beyer line_race beyer_shifted
Last Gunfighter 99 10 NaN
Last Gunfighter 102 10 99.0
Last Gunfighter 103 8 102.0
Paynter 103 10 NaN
Paynter 88 10 103.0
Paynter 100 8 88.0

In [62]:
df = pd.DataFrame(
    {
        'host': ['other', 'other', 'that', 'this', 'this'],
        'service': ['mail', 'web', 'mail', 'mail', 'web'],
        'no': [1, 2, 1, 2, 1],
    }
).set_index(['host', 'service'])

# For each host, the (host, service) label carrying the largest count.
mask = df.groupby(level=0).agg('idxmax')

df_count = df.loc[mask['no']].reset_index()

df_count


Out[62]:
host service no
0 other web 2
1 that mail 1
2 this mail 2

In [63]:
df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A'])

# Each run of equal consecutive values gets its own group id...
runs = (df.A != df.A.shift()).cumsum()
df.A.groupby(runs).groups

# ...so the cumulative sum restarts at every value change.
df.A.groupby(runs).cumsum()


Out[63]:
0    0
1    1
2    0
3    1
4    2
5    3
6    0
7    1
8    2
Name: A, dtype: int64

Expanding Data

Splitting


In [64]:
df = pd.DataFrame(
    data={
        'Case': ['A', 'A', 'A', 'B', 'A', 'A', 'B', 'A', 'A'],
        'Data': np.random.randn(9),
    }
)

# Split into sub-frames at each 'B' row: the rolling median of the
# cumulative B-count only changes one row after each B appears.
group_id = (1 * (df['Case'] == 'B')).cumsum().rolling(window=3, min_periods=1).median()
dfs = list(zip(*df.groupby(group_id)))[-1]

dfs[0]


Out[64]:
Case Data
0 A 0.360256
1 A -0.030799
2 A -0.473578
3 B -1.054966

In [65]:
# Second chunk: rows between the first and second 'B'.
dfs[1]


Out[65]:
Case Data
4 A 0.715025
5 A -1.595031
6 B 1.600706

In [66]:
# Final chunk: the rows after the last 'B'.
dfs[2]


Out[66]:
Case Data
7 A -0.612295
8 A -0.559878

Pivot


In [67]:
df = pd.DataFrame(
    data={
        'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'],
        'City': ['Toronto', 'Montreal', 'Vancouver', 'Calgary', 'Edmonton', 'Winnipeg', 'Windsor'],
        'Sales': [13, 6, 16, 8, 4, 3, 1],
    }
)

# Pivot with margins, then stack City back into the rows so each
# province shows its per-city sales plus an 'All' subtotal.
table = pd.pivot_table(
    df,
    values=['Sales'],
    index=['Province'],
    columns=['City'],
    aggfunc=np.sum,
    margins=True,
)

table.stack('City')


Out[67]:
Sales
Province City
AL All 12.0
Calgary 8.0
Edmonton 4.0
BC All 16.0
Vancouver 16.0
MN All 3.0
Winnipeg 3.0
ON All 14.0
Toronto 13.0
Windsor 1.0
QC All 6.0
Montreal 6.0
All All 51.0
Calgary 8.0
Edmonton 4.0
Montreal 6.0
Toronto 13.0
Vancouver 16.0
Windsor 1.0
Winnipeg 3.0

In [68]:
grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78]

df = pd.DataFrame(
    {
        'ID': ["x%d" % r for r in range(10)],
        'Gender': ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'],
        'ExamYear': ['2007', '2007', '2007', '2008', '2008', '2008', '2008', '2009', '2009', '2009'],
        'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'],
        'Participated': ['yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes'],
        'Passed': ['yes' if g > 50 else 'no' for g in grades],
        'Employed': [True, True, True, False, False, False, False, True, True, False],
        'Grade': grades,
    }
)

# One custom summary statistic per column, per exam year.
df.groupby('ExamYear').agg(
    {
        'Participated': lambda x: x.value_counts()['yes'],
        'Passed': lambda x: sum(x == 'yes'),
        'Employed': lambda x: sum(x),
        'Grade': lambda x: sum(x) / len(x),
    }
)


Out[68]:
Passed Participated Grade Employed
ExamYear
2007 2 3 74.000000 3
2008 3 3 68.500000 0
2009 2 3 60.666667 2

In [69]:
# Three years of monthly data; pivot as month rows x year columns.
df = pd.DataFrame(
    {'value': np.random.randn(36)},
    index=pd.date_range('2011-01-01', freq='M', periods=36),
)

pd.pivot_table(
    df, index=df.index.month, columns=df.index.year, values='value', aggfunc='sum'
)


Out[69]:
2011 2012 2013
1 -1.452684 -0.175611 2.568449
2 -0.665644 -0.594292 -0.049278
3 -0.250123 -0.343518 0.386894
4 -0.017976 -0.723268 1.407324
5 0.333130 -0.069268 1.229126
6 0.473193 0.265367 0.316684
7 0.479913 -1.657287 -0.715239
8 0.783101 -0.617078 1.745561
9 0.350418 0.883339 -1.838103
10 -0.869617 -0.399451 -0.273889
11 -1.377859 0.947415 0.546281
12 -1.116096 0.337732 -1.366948

Apply


In [70]:
# Rows hold ragged lists; expand each row's lists into a mini-frame.
df = pd.DataFrame(
    data={
        'A': [[2, 4, 8, 16], [100, 200], [10, 20, 30]],
        'B': [['a', 'b', 'c'], ['jj', 'kk'], ['ccc']],
    },
    index=['I', 'II', 'III'],
)

def SeriesFromSubList(aList):
    """Turn one list into a Series (shorter lists pad with NaN on concat)."""
    return pd.Series(aList)

df_orgz = pd.concat(
    {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
)

In [71]:
# 2000 days of small random "returns" in two columns.
df = pd.DataFrame(
    np.random.randn(2000, 2) / 10000,
    index=pd.date_range('2001-01-01', periods=2000),
    columns=['A', 'B'],
)

df


Out[71]:
A B
2001-01-01 -0.000177 -1.062619e-04
2001-01-02 -0.000118 2.352399e-05
2001-01-03 -0.000109 -1.908927e-05
2001-01-04 0.000014 6.876731e-06
2001-01-05 -0.000104 -1.139017e-05
2001-01-06 -0.000109 1.385021e-06
2001-01-07 -0.000134 -1.066389e-05
2001-01-08 -0.000080 -5.897414e-05
2001-01-09 -0.000031 4.268051e-05
2001-01-10 -0.000093 -5.778088e-05
2001-01-11 0.000167 4.121253e-05
2001-01-12 -0.000139 2.648512e-05
2001-01-13 -0.000012 -8.737938e-06
2001-01-14 0.000066 6.144602e-06
2001-01-15 0.000053 1.586173e-05
2001-01-16 0.000047 3.042602e-06
2001-01-17 -0.000076 1.623063e-04
2001-01-18 -0.000054 3.031323e-05
2001-01-19 0.000132 9.108038e-05
2001-01-20 -0.000031 -7.483177e-05
2001-01-21 -0.000115 1.853436e-05
2001-01-22 -0.000109 -7.035200e-05
2001-01-23 -0.000158 6.830860e-05
2001-01-24 -0.000103 -1.041656e-04
2001-01-25 0.000173 2.470797e-05
2001-01-26 -0.000054 -6.445272e-05
2001-01-27 0.000052 1.223023e-05
2001-01-28 -0.000080 5.750368e-05
2001-01-29 -0.000027 -8.341443e-05
2001-01-30 0.000006 5.571757e-05
... ... ...
2006-05-25 -0.000079 2.309412e-06
2006-05-26 0.000062 -7.831948e-05
2006-05-27 0.000018 -7.913012e-05
2006-05-28 -0.000009 1.250437e-04
2006-05-29 -0.000086 2.463508e-06
2006-05-30 0.000002 9.672057e-06
2006-05-31 -0.000018 -3.416930e-05
2006-06-01 -0.000022 -1.149135e-04
2006-06-02 -0.000025 1.038285e-04
2006-06-03 -0.000196 1.479233e-04
2006-06-04 0.000024 2.573207e-05
2006-06-05 -0.000045 7.918676e-05
2006-06-06 0.000023 8.566638e-05
2006-06-07 -0.000012 5.733845e-05
2006-06-08 -0.000020 -7.910414e-05
2006-06-09 0.000220 -1.277131e-05
2006-06-10 0.000049 1.633954e-04
2006-06-11 0.000014 1.404515e-04
2006-06-12 -0.000111 -1.370796e-04
2006-06-13 0.000094 -4.485599e-05
2006-06-14 -0.000074 -5.589514e-05
2006-06-15 -0.000065 -5.347763e-05
2006-06-16 0.000005 1.654913e-04
2006-06-17 0.000085 6.933283e-05
2006-06-18 0.000046 -8.092249e-05
2006-06-19 -0.000007 -1.654069e-07
2006-06-20 -0.000053 -3.263472e-05
2006-06-21 -0.000045 7.719624e-05
2006-06-22 0.000182 1.089065e-04
2006-06-23 -0.000059 -1.130946e-05

2000 rows × 2 columns


In [72]:
def gm(aDF, Const):
    """Compounded growth of A+B over a window, scaled by Const.

    Returns (window start date, final compounded value).
    """
    growth = (((aDF.A + aDF.B) + 1).cumprod() - 1) * Const
    return (aDF.index[0], growth.iloc[-1])

# Apply gm to each rolling 51-row window, collecting the results into
# a Series keyed by each window's start date.
S = pd.Series(dict(gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) for i in range(len(df) - 50)))

S


Out[72]:
2001-01-01   -0.007433
2001-01-02   -0.006756
2001-01-03   -0.006441
2001-01-04   -0.007375
2001-01-05   -0.007382
2001-01-06   -0.006306
2001-01-07   -0.005068
2001-01-08   -0.004309
2001-01-09   -0.003795
2001-01-10   -0.004472
2001-01-11   -0.003385
2001-01-12   -0.005628
2001-01-13   -0.004453
2001-01-14   -0.005013
2001-01-15   -0.005037
2001-01-16   -0.005973
2001-01-17   -0.006561
2001-01-18   -0.007421
2001-01-19   -0.007939
2001-01-20   -0.007620
2001-01-21   -0.007140
2001-01-22   -0.007240
2001-01-23   -0.007114
2001-01-24   -0.006706
2001-01-25   -0.006379
2001-01-26   -0.007299
2001-01-27   -0.007501
2001-01-28   -0.007955
2001-01-29   -0.008009
2001-01-30   -0.006504
                ...   
2006-04-05   -0.003596
2006-04-06   -0.003943
2006-04-07   -0.003039
2006-04-08   -0.002371
2006-04-09   -0.002975
2006-04-10   -0.002760
2006-04-11   -0.002100
2006-04-12   -0.003544
2006-04-13   -0.002847
2006-04-14   -0.003952
2006-04-15   -0.003997
2006-04-16   -0.004950
2006-04-17   -0.004201
2006-04-18   -0.003734
2006-04-19   -0.004964
2006-04-20   -0.003085
2006-04-21   -0.001683
2006-04-22   -0.000210
2006-04-23   -0.001432
2006-04-24    0.000441
2006-04-25    0.000286
2006-04-26   -0.002072
2006-04-27   -0.000909
2006-04-28    0.000166
2006-04-29   -0.000473
2006-04-30    0.000304
2006-05-01    0.000091
2006-05-02   -0.000364
2006-05-03    0.001267
2006-05-04    0.000241
Length: 1950, dtype: float64

In [73]:
rng = pd.date_range(start='2014-01-01', periods=100)

# Fake market data: random prices plus integer volumes.
df = pd.DataFrame(
    {
        'Open': np.random.randn(len(rng)),
        'Close': np.random.randn(len(rng)),
        'Volume': np.random.randint(100, 2000, len(rng)),
    },
    index=rng,
)

df


Out[73]:
Close Open Volume
2014-01-01 0.191597 -0.314054 1241
2014-01-02 -1.717382 0.639787 682
2014-01-03 0.271303 -0.506967 164
2014-01-04 1.197396 -0.716909 1476
2014-01-05 0.492053 0.318468 458
2014-01-06 0.098098 1.119484 119
2014-01-07 1.100037 0.516767 749
2014-01-08 0.478678 0.274834 146
2014-01-09 -0.651548 0.638060 424
2014-01-10 -0.624340 -0.358287 149
2014-01-11 0.058846 0.669787 191
2014-01-12 -0.015866 -0.456204 1016
2014-01-13 -0.490331 1.042980 301
2014-01-14 -0.681969 -1.848677 102
2014-01-15 -0.482265 0.105843 1001
2014-01-16 -0.455891 0.446332 988
2014-01-17 0.080858 0.037827 1824
2014-01-18 -0.487353 0.176800 1418
2014-01-19 0.274929 0.040340 1621
2014-01-20 1.265596 -0.231885 152
2014-01-21 0.573440 1.553562 1556
2014-01-22 -0.556765 1.868245 715
2014-01-23 -0.444104 0.992026 130
2014-01-24 -0.538320 0.453568 926
2014-01-25 1.782628 -0.845000 1531
2014-01-26 -1.022168 -1.516973 1226
2014-01-27 1.153171 0.617113 378
2014-01-28 -0.585982 1.121890 810
2014-01-29 1.522453 0.490162 1848
2014-01-30 -0.143591 -1.057452 151
... ... ... ...
2014-03-12 1.169210 0.330771 1010
2014-03-13 0.837246 0.932081 1634
2014-03-14 -1.246848 1.614713 1877
2014-03-15 1.715415 0.353876 168
2014-03-16 -1.725376 -0.259473 802
2014-03-17 -1.864538 0.438324 361
2014-03-18 -0.665222 2.389269 1331
2014-03-19 0.329992 1.083136 1166
2014-03-20 -0.940032 -0.665577 1756
2014-03-21 0.289460 -0.308027 551
2014-03-22 0.488300 -1.393148 690
2014-03-23 -1.252622 1.201307 1685
2014-03-24 0.021909 -0.157732 1216
2014-03-25 -0.026449 0.847406 702
2014-03-26 0.301914 1.634134 229
2014-03-27 -0.608746 -0.255602 736
2014-03-28 1.761464 1.952497 741
2014-03-29 0.194342 1.240527 769
2014-03-30 -0.753210 0.615735 1829
2014-03-31 0.467182 0.986644 1322
2014-04-01 0.573887 1.207595 1949
2014-04-02 0.252623 -0.701256 1856
2014-04-03 1.212486 0.825473 182
2014-04-04 -0.777999 0.721827 1765
2014-04-05 0.960076 -0.016064 559
2014-04-06 1.152554 -1.259634 862
2014-04-07 1.420554 1.199850 1115
2014-04-08 -1.144628 -1.370731 883
2014-04-09 0.496323 0.809941 1424
2014-04-10 1.101801 0.877663 1088

100 rows × 3 columns


In [74]:
def vwap(bars):
    """Volume-weighted average of the Close prices in `bars`."""
    return (bars.Close * bars.Volume).sum() / bars.Volume.sum()

window = 5

# One single-point Series per window, stamped with the first
# post-window date, concatenated into one result Series.
s = pd.concat(
    [
        pd.Series(vwap(df.iloc[i:i + window]), index=[df.index[i + window]])
        for i in range(len(df) - window)
    ]
)

s.round(2)


Out[74]:
2014-01-06    0.27
2014-01-07    0.30
2014-01-08    0.97
2014-01-09    0.98
2014-01-10    0.45
2014-01-11    0.34
2014-01-12    0.32
2014-01-13   -0.16
2014-01-14   -0.25
2014-01-15   -0.18
2014-01-16   -0.27
2014-01-17   -0.34
2014-01-18   -0.24
2014-01-19   -0.29
2014-01-20   -0.15
2014-01-21   -0.06
2014-01-22    0.15
2014-01-23    0.08
2014-01-24    0.26
2014-01-25    0.04
2014-01-26    0.55
2014-01-27    0.12
2014-01-28    0.32
2014-01-29    0.19
2014-01-30    0.73
2014-01-31    0.34
2014-02-01    0.79
2014-02-02    0.62
2014-02-03    0.88
2014-02-04   -0.43
              ... 
2014-03-12   -1.01
2014-03-13   -0.51
2014-03-14   -0.21
2014-03-15    0.08
2014-03-16   -0.12
2014-03-17   -0.16
2014-03-18   -0.57
2014-03-19   -1.10
2014-03-20   -0.59
2014-03-21   -0.78
2014-03-22   -0.52
2014-03-23   -0.30
2014-03-24   -0.49
2014-03-25   -0.55
2014-03-26   -0.33
2014-03-27   -0.38
2014-03-28   -0.54
2014-03-29    0.26
2014-03-30    0.33
2014-03-31   -0.07
2014-04-01    0.05
2014-04-02    0.27
2014-04-03    0.13
2014-04-04    0.15
2014-04-05    0.15
2014-04-06    0.15
2014-04-07    0.16
2014-04-08    0.44
2014-04-09    0.14
2014-04-10    0.58
Length: 95, dtype: float64

Merge


In [75]:
rng = pd.date_range('2000-01-01', periods=6)

df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=['A', 'B', 'C'])

# An identical copy to stack below.
df2 = df1.copy()

In [76]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.  With
# ignore_index=True the result gets a fresh 0..n-1 index, exactly as
# append(ignore_index=True) did.
df = pd.concat([df1, df2], ignore_index=True)

df


Out[76]:
A B C
0 0.806820 0.068926 -0.374480
1 1.520404 -0.536845 -1.084997
2 0.495336 0.717881 -0.597878
3 -0.956341 0.871390 0.564765
4 -1.121630 -1.217427 1.403706
5 -1.099288 0.046444 -0.615359
6 0.806820 0.068926 -0.374480
7 1.520404 -0.536845 -1.084997
8 0.495336 0.717881 -0.597878
9 -0.956341 0.871390 0.564765
10 -1.121630 -1.217427 1.403706
11 -1.099288 0.046444 -0.615359

In [77]:
# Self-join demo data: area/bin cells with a test sequence number.
df = pd.DataFrame(
    data={
        'Area': ['A', 'A', 'A', 'A', 'A', 'C', 'C'],
        'Bins': [110, 110, 160, 160, 160, 40, 40],
        'Test_0': [0, 1, 0, 1, 2, 0, 1],
        'Data': np.random.randn(7),
    }
)

df


Out[77]:
Area Bins Data Test_0
0 A 110 -0.805778 0
1 A 110 0.173171 1
2 A 160 0.433174 0
3 A 160 0.337607 1
4 A 160 -0.067628 2
5 C 40 0.063513 0
6 C 40 -1.090058 1

In [78]:
# Join each row to the row with the next Test_0 in the same Area/Bins
# cell, by matching Test_0 on the left to Test_0 - 1 on the right.
df['Test_1'] = df['Test_0'] - 1

pd.merge(
    df,
    df,
    left_on=['Bins', 'Area', 'Test_0'],
    right_on=['Bins', 'Area', 'Test_1'],
    suffixes=('_L', '_R'),
)


Out[78]:
Area Bins Data_L Test_0_L Test_1_L Data_R Test_0_R Test_1_R
0 A 110 -0.805778 0 -1 0.173171 1 0
1 A 160 0.433174 0 -1 0.337607 1 0
2 A 160 0.337607 1 0 -0.067628 2 1
3 C 40 0.063513 0 -1 -1.090058 1 0

Plotting


In [79]:
# Random stratifying variable and prices for a grouped box plot.
df = pd.DataFrame({
    'stratifying_var': np.random.uniform(0, 100, 20),
    'price': np.random.normal(100, 5, 20)
})

# Bucket the stratifying variable into labelled quartiles...
df['quartiles'] = pd.qcut(
    df['stratifying_var'],
    4,
    labels=['0-25%', '25-50%', '50-75%', '75-100%']
)

# ...and box-plot price within each bucket (requires matplotlib).
df.boxplot(column='price', by='quartiles')


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8af7b63a58>

Data In/Out

CSV

Reading multiple files to create a single DataFrame

Parsing date components in multi-columns

Skip row between header and data

Option 1: pass rows explicitly to skiprows

Option 2: read column names and then data

SQL

Excel

Computation

Timedeltas

Aliasing Axis Names


In [ ]:

Creating Example Data


In [80]:
def expand_grid(data_dict):
    """Cartesian product of the dict's value lists, one column per key."""
    combos = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(combos, columns=data_dict.keys())

df = expand_grid(
    {
        'height': [60, 70],
        'weight': [100, 140, 180],
        'sex': ['Male', 'Female'],
    }
)

df


Out[80]:
sex height weight
0 Male 60 100
1 Male 60 140
2 Male 60 180
3 Male 70 100
4 Male 70 140
5 Male 70 180
6 Female 60 100
7 Female 60 140
8 Female 60 180
9 Female 70 100
10 Female 70 140
11 Female 70 180