このノートブックは、pandasのドキュメントのCookbookを写経したものです。


In [1]:
import functools
import itertools

import numpy as np
import pandas as pd

Idioms


In [2]:
# Construct the example frame used throughout the idioms section.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[2]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

if-then...


In [3]:
# if-then: set BBB to -1 on every row where AAA >= 5.
df.loc[df['AAA'] >= 5, 'BBB'] = -1

df


Out[3]:
AAA BBB CCC
0 4 10 100
1 5 -1 50
2 6 -1 -30
3 7 -1 -50

In [4]:
# Same if-then, but assigning to two columns at once.
df.loc[df['AAA'] >= 5, ['BBB', 'CCC']] = 555

df


Out[4]:
AAA BBB CCC
0 4 10 100
1 5 555 555
2 6 555 555
3 7 555 555

In [5]:
# Add another line with a different (complementary) condition.
df.loc[df['AAA'] < 5, ['BBB', 'CCC']] = 2000

df


Out[5]:
AAA BBB CCC
0 4 2000 2000
1 5 555 555
2 6 555 555
3 7 555 555

In [6]:
# Build a boolean mask frame, then apply it with where():
# entries where the mask is False are replaced by -1000.
df_mask = pd.DataFrame(
    {
        'AAA': [True, True, True, True],
        'BBB': [False, False, False, False],
        'CCC': [True, False, True, False],
    }
)

df.where(df_mask, -1000)


Out[6]:
AAA BBB CCC
0 4 -1000 2000
1 5 -1000 -1000
2 6 -1000 555
3 7 -1000 -1000

In [7]:
# Reset the example frame to its original values.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[7]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [8]:
# np.where is the vectorized if-then-else: label each row by AAA.
condition = df['AAA'] > 5
df['logic'] = np.where(condition, 'high', 'low')

df


Out[8]:
AAA BBB CCC logic
0 4 10 100 low
1 5 20 50 low
2 6 30 -30 high
3 7 40 -50 high

Splitting


In [9]:
# Fresh copy of the example frame for the splitting examples.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[9]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [10]:
# Split off the rows satisfying the "low" criterion.
dflow = df[df['AAA'] <= 5]

dflow


Out[10]:
AAA BBB CCC
0 4 10 100
1 5 20 50

In [11]:
# The complementary "high" rows.
dfhigh = df[df['AAA'] > 5]

dfhigh


Out[11]:
AAA BBB CCC
2 6 30 -30
3 7 40 -50

Building Criteria


In [12]:
# Fresh copy of the example frame for the criteria examples.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[12]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

And


In [13]:
# Two criteria combined with & (element-wise AND).
crit = (df['BBB'] < 25) & (df['CCC'] >= -40)
newseries = df.loc[crit, 'AAA']

newseries


Out[13]:
0    4
1    5
Name: AAA, dtype: int64

Or


In [14]:
# Same criteria combined with | (element-wise OR).
crit = (df['BBB'] > 25) | (df['CCC'] >= -40)
newseries = df.loc[crit, 'AAA']

newseries


Out[14]:
0    4
1    5
2    6
3    7
Name: AAA, dtype: int64

In [15]:
# An OR criterion used for assignment rather than selection.
crit = (df['BBB'] > 25) | (df['CCC'] >= 75)
df.loc[crit, 'AAA'] = 0.1

df


Out[15]:
AAA BBB CCC
0 0.1 10 100
1 5.0 20 50
2 0.1 30 -30
3 0.1 40 -50

In [16]:
# Fresh copy of the example frame.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[16]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [17]:
# Rows ordered by how close CCC is to a target value.
aValue = 43.0
df.loc[(df['CCC'] - aValue).abs().argsort()]


Out[17]:
AAA BBB CCC
1 5 20 50
0 4 10 100
2 6 30 -30
3 7 40 -50

In [18]:
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

# Three independent boolean criteria.
Crit1 = df['AAA'] <= 5.5
Crit2 = df['BBB'] == 10.0
Crit3 = df['CCC'] > -40.0

# Combined explicitly...
AllCrit = Crit1 & Crit2 & Crit3

# ...or programmatically, by folding & over a list of criteria.
CritList = [Crit1, Crit2, Crit3]
AllCrit = functools.reduce(lambda a, b: a & b, CritList)

df[AllCrit]


Out[18]:
AAA BBB CCC
0 4 10 100

Selection

DataFrames


In [19]:
# Fresh copy of the example frame for the selection examples.
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

df


Out[19]:
AAA BBB CCC
0 4 10 100
1 5 20 50
2 6 30 -30
3 7 40 -50

In [20]:
# Rows where AAA <= 6 AND the index label is one of {0, 2, 4}.
df[(df.AAA <= 6) & (df.index.isin([0,2,4]))]


Out[20]:
AAA BBB CCC
0 4 10 100
2 6 30 -30

In [21]:
data = {
    'AAA': [4, 5, 6, 7],
    'BBB': [10, 20, 30, 40],
    'CCC': [100, 50, -30, -50],
}

# Same data, but indexed by string labels instead of integers.
df = pd.DataFrame(data=data, index=['foo', 'bar', 'boo', 'kar'])

df


Out[21]:
AAA BBB CCC
foo 4 10 100
bar 5 20 50
boo 6 30 -30
kar 7 40 -50

In [22]:
# Label slicing with .loc is inclusive of BOTH endpoints...
df.loc['bar': 'kar']

# ...while positional slicing with .iloc excludes the stop position.
df.iloc[0:3]

df.loc['bar': 'kar']


Out[22]:
AAA BBB CCC
bar 5 20 50
boo 6 30 -30
kar 7 40 -50

In [23]:
# With a non-zero-based integer index the distinction matters:
df2 = pd.DataFrame(data=data,index=[1,2,3,4])

# .iloc is purely positional (rows 1 and 2)...
df2.iloc[1:3]

# ...while .loc is label-based and inclusive (labels 1 through 3).
df2.loc[1:3]


Out[23]:
AAA BBB CCC
1 4 10 100
2 5 20 50
3 6 30 -30

In [24]:
df = pd.DataFrame(
    {
        'AAA': [4, 5, 6, 7],
        'BBB': [10, 20, 30, 40],
        'CCC': [100, 50, -30, -50],
    }
)

# Invert the earlier mask with ~ to take the complement rows.
df[~((df['AAA'] <= 6) & df.index.isin([0, 2, 4]))]


Out[24]:
AAA BBB CCC
1 5 20 50
3 7 40 -50

Panels


In [25]:
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in
# pandas 0.25 -- this cell only runs on an old pandas.  The modern
# replacement is a MultiIndex DataFrame (or xarray).
rng = pd.date_range('1/1/2013', periods=100, freq='D')

# One shared random block reused for all three frames.
data = np.random.randn(100, 4)

cols = ['A','B','C','D']

df1 = pd.DataFrame(data, rng, cols)
df2 = pd.DataFrame(data, rng, cols)
df3 = pd.DataFrame(data, rng, cols)

# A Panel is a 3-D container: items x major_axis x minor_axis.
pf = pd.Panel({'df1': df1, 'df2': df2, 'df3': df3})

pf


Out[25]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 4 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to D

In [26]:
# Rotate the axes so the minor axis becomes the item axis,
# append frame 'E' as a new item, then rotate back.
# (Requires old pandas; pd.Panel was removed in 0.25.)
pf = pf.transpose(2, 0, 1)

pf['E'] = pd.DataFrame(data, rng, cols)

pf = pf.transpose(1, 2, 0)

pf


Out[26]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 5 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to E

In [27]:
# Alternative to the double transpose: assign along the minor axis
# directly with .loc.  (Requires old pandas; Panel removed in 0.25.)
pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols)

pf


Out[27]:
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 6 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to F

New Columns


In [28]:
df = pd.DataFrame(
    {
        'AAA': [1, 2, 1, 3],
        'BBB': [1, 1, 2, 2],
        'CCC': [2, 1, 3, 1],
    }
)

source_cols = df.columns

# One "<col>_cat" label column per original column.
new_cols = [str(col) + "_cat" for col in source_cols]

categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'}

# Map every numeric code to its category label, element-wise.
# (applymap is deprecated in pandas >= 2.1 in favor of DataFrame.map.)
df[new_cols] = df[source_cols].applymap(categories.get)

df


Out[28]:
AAA BBB CCC AAA_cat BBB_cat CCC_cat
0 1 1 2 Alpha Alpha Beta
1 2 1 1 Beta Alpha Alpha
2 1 2 3 Alpha Beta Charlie
3 3 2 1 Charlie Beta Alpha

In [29]:
# Grouped-minimum demo data: three AAA groups, varying BBB.
df = pd.DataFrame(
    {
        'AAA': [1, 1, 1, 2, 2, 2, 3, 3],
        'BBB': [2, 1, 3, 4, 5, 1, 2, 3],
    }
)

df


Out[29]:
AAA BBB
0 1 2
1 1 1
2 1 3
3 2 4
4 2 5
5 2 1
6 3 2
7 3 3

In [30]:
# For each AAA group, the row where BBB attains its minimum
# (idxmin returns the label of the first minimum per group).
df.loc[df.groupby("AAA")["BBB"].idxmin()]


Out[30]:
AAA BBB
1 1 1
5 2 1
6 3 2

In [31]:
# Same result via sort-then-take-first; note the index differs from
# the idxmin version because as_index=False resets it.
df.sort_values(by="BBB").groupby("AAA", as_index=False).first()


Out[31]:
AAA BBB
0 1 1
1 2 1
2 3 2

MultiIndexing


In [32]:
# Columns encode a two-level structure in their "<level1>_<level2>" names.
df = pd.DataFrame(
    {
        'row': [0, 1, 2],
        'One_X': [1.1, 1.1, 1.1],
        'One_Y': [1.2, 1.2, 1.2],
        'Two_X': [1.11, 1.11, 1.11],
        'Two_Y': [1.22, 1.22, 1.22],
    }
)

df


Out[32]:
One_X One_Y Two_X Two_Y row
0 1.1 1.2 1.11 1.22 0
1 1.1 1.2 1.11 1.22 1
2 1.1 1.2 1.11 1.22 2

In [33]:
# Promote the 'row' column to be the index.
df = df.set_index(keys='row')

df


Out[33]:
One_X One_Y Two_X Two_Y
row
0 1.1 1.2 1.11 1.22
1 1.1 1.2 1.11 1.22
2 1.1 1.2 1.11 1.22

In [34]:
# Split each 'One_X'-style name into a ('One', 'X') tuple and build
# a MultiIndex over the columns from those tuples.
tuples = [tuple(name.split('_')) for name in df.columns]
df.columns = pd.MultiIndex.from_tuples(tuples)

df


Out[34]:
One Two
X Y X Y
row
0 1.1 1.2 1.11 1.22
1 1.1 1.2 1.11 1.22
2 1.1 1.2 1.11 1.22

In [35]:
# Stack level 0 of the columns (One/Two) into the rows, then move
# that new inner index level back out as a regular column.
df = df.stack(0).reset_index(1)

df


Out[35]:
level_1 X Y
row
0 One 1.10 1.20
0 Two 1.11 1.22
1 One 1.10 1.20
1 Two 1.11 1.22
2 One 1.10 1.20
2 Two 1.11 1.22

In [36]:
# Give the columns produced by the stack/reset_index step clean names.
df.columns = ['Sample', 'All_X', 'All_Y']

Arithmetic


In [37]:
# 2x6 random frame under a two-level column index:
# letters A..C on the outer level, O/I on the inner level.
pairs = [(letter, oi) for letter in ['A', 'B', 'C'] for oi in ['O', 'I']]
cols = pd.MultiIndex.from_tuples(pairs)

df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols)

df


Out[37]:
A B C
O I O I O I
n 1.691128 -1.218601 -1.430438 -0.188882 -2.510238 1.161971
m -0.862331 -1.001342 0.438665 1.289830 0.319945 -0.289764

In [38]:
# Divide every (letter, O/I) column by the matching 'C' column,
# aligning on the inner level (level=1); the 'C' columns become 1.0.
df = df.div(df['C'], level=1)

df


Out[38]:
A B C
O I O I O I
n -0.673692 -1.048736 0.569842 -0.162553 1.0 1.0
m -2.695246 3.455720 1.371061 -4.451318 1.0 1.0

Slicing


In [39]:
# A small frame indexed by a two-level MultiIndex built from tuples.
coords = [
    ('AA', 'one'),
    ('AA', 'six'),
    ('BB', 'one'),
    ('BB', 'two'),
    ('BB', 'six'),
]

index = pd.MultiIndex.from_tuples(coords)

df = pd.DataFrame([11, 22, 33, 44, 55], index=index, columns=['MyData'])

df


Out[39]:
MyData
AA one 11
six 22
BB one 33
two 44
six 55

In [40]:
# Cross-section on the first index level: all rows under 'BB'.
df.xs('BB', level=0, axis=0)


Out[40]:
MyData
one 33
two 44
six 55

In [41]:
# Cross-section on the second level: every group's 'six' row.
df.xs('six', level=1, axis=0)


Out[41]:
MyData
AA 22
BB 55

In [42]:
# Student x Course rows against (Exams/Labs) x (I/II) columns.
index = list(itertools.product(['Ada', 'Quinn', 'Violet'], ['Comp', 'Math', 'Sci']))
headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II']))

indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course'])
cols = pd.MultiIndex.from_tuples(headr)

# Deterministic fake grades: base 70 plus small row/column offsets.
data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)]

df = pd.DataFrame(data, indx, cols)

df


Out[42]:
Exams Labs
I II I II
Student Course
Ada Comp 70 71 72 73
Math 71 73 75 74
Sci 72 75 75 75
Quinn Comp 73 74 75 76
Math 74 76 78 77
Sci 75 78 78 78
Violet Comp 76 77 78 79
Math 77 79 81 80
Sci 78 81 81 81

In [43]:
# Name the everything-slice once; reads better than repeating slice(None).
All = slice(None)

# A single top-level label drops that index level from the result.
df.loc['Violet']


Out[43]:
Exams Labs
I II I II
Course
Comp 76 77 78 79
Math 77 79 81 80
Sci 78 81 81 81

In [44]:
# All students, only the 'Math' course, all columns.
df.loc[(All, 'Math'), All]


Out[44]:
Exams Labs
I II I II
Student Course
Ada Math 71 73 75 74
Quinn Math 74 76 78 77
Violet Math 77 79 81 80

In [45]:
# Label range on the Student level ('Ada' through 'Quinn', inclusive).
df.loc[(slice('Ada','Quinn'), 'Math'), All]


Out[45]:
Exams Labs
I II I II
Student Course
Ada Math 71 73 75 74
Quinn Math 74 76 78 77

In [46]:
# Restrict the columns to the 'Exams' top level.
df.loc[(All,'Math'), ('Exams')]


Out[46]:
I II
Student Course
Ada Math 71 73
Quinn Math 74 76
Violet Math 77 79

In [47]:
# 'Math' rows crossed with the second-sitting ('II') columns.
df.loc[(All,'Math'),(All,'II')]


Out[47]:
Exams Labs
II II
Student Course
Ada Math 73 74
Quinn Math 76 77
Violet Math 79 80

Sorting


In [48]:
# Sort rows by a single (level-0, level-1) column, descending.
df.sort_values(by=('Labs', 'II'), ascending=False)


Out[48]:
Exams Labs
I II I II
Student Course
Violet Sci 78 81 81 81
Math 77 79 81 80
Comp 76 77 78 79
Quinn Sci 75 78 78 78
Math 74 76 78 77
Comp 73 74 75 76
Ada Sci 72 75 75 75
Math 71 73 75 74
Comp 70 71 72 73

Levels

panelnd

Missing Data


In [49]:
# Business-day indexed random data with one value knocked out to NaN.
bdays = pd.date_range('2013-08-01', periods=6, freq='B')
df = pd.DataFrame(np.random.randn(6, 1), index=bdays, columns=['A'])

df.loc[df.index[3], 'A'] = np.nan

df


Out[49]:
A
2013-08-01 0.004894
2013-08-02 0.266347
2013-08-05 1.299997
2013-08-06 NaN
2013-08-07 0.856117
2013-08-08 1.966193

In [50]:
# Reverse the frame, then forward-fill: equivalent to back-filling the
# original (the NaN takes the value of the next later row).
df.reindex(df.index[::-1]).ffill()


Out[50]:
A
2013-08-08 1.966193
2013-08-07 0.856117
2013-08-06 0.856117
2013-08-05 1.299997
2013-08-02 0.266347
2013-08-01 0.004894

Replace

Grouping


In [51]:
# Grouping demo data: animals with a size class and a weight.
df = pd.DataFrame(
    {
        'animal': ['cat', 'dog', 'cat', 'fish', 'dog', 'cat', 'cat'],
        'size': ['S', 'S', 'M', 'M', 'M', 'L', 'L'],
        'weight': [8, 10, 11, 1, 20, 12, 12],
        'adult': [False, False, False, False, False, True, True],
    }
)

df


Out[51]:
adult animal size weight
0 False cat S 8
1 False dog S 10
2 False cat M 11
3 False fish M 1
4 False dog M 20
5 True cat L 12
6 True cat L 12

In [52]:
# For each animal, the size class of its heaviest individual
# (idxmax gives the row label of the maximum weight per group).
df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()])


Out[52]:
animal
cat     L
dog     M
fish    M
dtype: object

In [53]:
# Keep a reference to the grouped object, then pull out one group.
gb = df.groupby(['animal'])

gb.get_group('cat')


Out[53]:
adult animal size weight
0 False cat S 8
2 False cat M 11
5 True cat L 12
6 True cat L 12

In [54]:
def GrowUp(x):
    """Collapse one animal group to a single adult-equivalent row.

    Weights are scaled by size class (S: x1.5, M: x1.25, L: x1) and
    averaged over the group's rows; the result is always adult size L.
    """
    avg_weight = sum(x[x['size'] == 'S'].weight * 1.5)
    avg_weight += sum(x[x['size'] == 'M'].weight * 1.25)
    avg_weight += sum(x[x['size'] == 'L'].weight)
    avg_weight = avg_weight / len(x)
    return pd.Series(['L', avg_weight, True], index=['size', 'weight', 'adult'])

# Apply the group-collapsing function to each animal group.
expected_df = gb.apply(GrowUp)

expected_df


Out[54]:
size weight adult
animal
cat L 12.4375 True
dog L 20.0000 True
fish L 1.2500 True

In [55]:
# A series of period returns: 1%, 2%, ..., 10%.
S = pd.Series([i / 100.0 for i in range(1, 11)])

def CumRet(x, y):
    """One compounding step: grow x by return y."""
    return x * (1 + y)

def Red(x):
    """Compound every return in x, starting from 1.0."""
    return functools.reduce(CumRet, x, 1.0)

# Expanding window: cumulative compounded growth up to each point.
S.expanding().apply(Red)


Out[55]:
0    1.010000
1    1.030200
2    1.061106
3    1.103550
4    1.158728
5    1.228251
6    1.314229
7    1.419367
8    1.547110
9    1.701821
dtype: float64

In [56]:
df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, -1, 1, 2]})

gb = df.groupby('A')

def replace(g):
    """Replace negatives with the mean of the group's non-negative values."""
    negatives = g < 0
    g.loc[negatives] = g[~negatives].mean()
    return g

gb.transform(replace)


Out[56]:
B
0 1.0
1 1.0
2 1.0
3 2.0

In [57]:
df = pd.DataFrame(
    {
        'code': ['foo', 'bar', 'baz', 'foo', 'bar', 'baz'],
        'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],
        'flag': [False, True, False, True, False, True],
    }
)

# Order the rows by each code group's total, smallest total first.
code_groups = df.groupby('code')

agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data')

sorted_df = df.loc[agg_n_sort_order.index]

sorted_df


Out[57]:
code data flag
1 bar -0.21 True
4 bar -0.59 False
0 foo 0.16 False
3 foo 0.45 True
2 baz 0.33 False
5 baz 0.62 True

In [58]:
rng = pd.date_range(start="2014-10-07", periods=10, freq='2min')

ts = pd.Series(data=list(range(10)), index=rng)

def MyCust(x):
    """1.234 x the second value of the bin; NaT for bins of <= 2 points.

    Uses .iloc for positional access: plain integer [] on a
    datetime-indexed Series was deprecated and is label-based in
    modern pandas, so x[1] would raise a KeyError there.
    """
    if len(x) > 2:
        return x.iloc[1] * 1.234
    return pd.NaT

# Several named aggregations per 5-minute bin.
mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust}

ts.resample("5min").apply(mhc)

ts


Out[58]:
2014-10-07 00:00:00    0
2014-10-07 00:02:00    1
2014-10-07 00:04:00    2
2014-10-07 00:06:00    3
2014-10-07 00:08:00    4
2014-10-07 00:10:00    5
2014-10-07 00:12:00    6
2014-10-07 00:14:00    7
2014-10-07 00:16:00    8
2014-10-07 00:18:00    9
Freq: 2T, dtype: int64

In [59]:
df = pd.DataFrame(
    {
        'Color': ['Red', 'Red', 'Red', 'Blue'],
        'Value': [100, 150, 50, 50],
    }
)

# Attach each row's group size as a new column.
df['Counts'] = df.groupby(['Color']).transform(len)

df


Out[59]:
Color Value Counts
0 Red 100 3
1 Red 150 3
2 Red 50 3
3 Blue 50 1

In [60]:
# Horse-race results, indexed by horse name (repeated labels).
horses = ['Last Gunfighter'] * 3 + ['Paynter'] * 3

df = pd.DataFrame(
    {
        'line_race': [10, 10, 8, 10, 10, 8],
        'beyer': [99, 102, 103, 103, 88, 100],
    },
    index=horses,
)

df


Out[60]:
beyer line_race
Last Gunfighter 99 10
Last Gunfighter 102 10
Last Gunfighter 103 8
Paynter 103 10
Paynter 88 10
Paynter 100 8

In [61]:
# Shift beyer within each horse (index level 0), not across horses,
# so each horse's first row gets NaN rather than another horse's value.
df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift()

df


Out[61]:
beyer line_race beyer_shifted
Last Gunfighter 99 10 NaN
Last Gunfighter 102 10 99.0
Last Gunfighter 103 8 102.0
Paynter 103 10 NaN
Paynter 88 10 103.0
Paynter 100 8 88.0

In [62]:
df = pd.DataFrame(
    {
        'host': ['other', 'other', 'that', 'this', 'this'],
        'service': ['mail', 'web', 'mail', 'mail', 'web'],
        'no': [1, 2, 1, 2, 1],
    }
).set_index(['host', 'service'])

# For each host, the (host, service) label carrying the largest count.
mask = df.groupby(level=0).agg('idxmax')

df_count = df.loc[mask['no']].reset_index()

df_count


Out[62]:
host service no
0 other web 2
1 that mail 1
2 this mail 2

In [63]:
df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A'])

# Each run of equal consecutive values gets its own group id...
runs = (df.A != df.A.shift()).cumsum()
df.A.groupby(runs).groups

# ...so the cumulative sum restarts at every value change.
df.A.groupby(runs).cumsum()


Out[63]:
0    0
1    1
2    0
3    1
4    2
5    3
6    0
7    1
8    2
Name: A, dtype: int64

Expanding Data

Splitting


In [64]:
df = pd.DataFrame(
    data={
        'Case': ['A', 'A', 'A', 'B', 'A', 'A', 'B', 'A', 'A'],
        'Data': np.random.randn(9),
    }
)

# Split into sub-frames at each 'B' row: the rolling median of the
# cumulative B-count only changes one row after each B appears.
group_id = (1 * (df['Case'] == 'B')).cumsum().rolling(window=3, min_periods=1).median()
dfs = list(zip(*df.groupby(group_id)))[-1]

dfs[0]


Out[64]:
Case Data
0 A 0.360256
1 A -0.030799
2 A -0.473578
3 B -1.054966

In [65]:
# Second chunk: rows between the first and second 'B'.
dfs[1]


Out[65]:
Case Data
4 A 0.715025
5 A -1.595031
6 B 1.600706

In [66]:
# Final chunk: the rows after the last 'B'.
dfs[2]


Out[66]:
Case Data
7 A -0.612295
8 A -0.559878

Pivot


In [67]:
df = pd.DataFrame(
    data={
        'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'],
        'City': ['Toronto', 'Montreal', 'Vancouver', 'Calgary', 'Edmonton', 'Winnipeg', 'Windsor'],
        'Sales': [13, 6, 16, 8, 4, 3, 1],
    }
)

# Pivot with margins, then stack City back into the rows so each
# province shows its per-city sales plus an 'All' subtotal.
table = pd.pivot_table(
    df,
    values=['Sales'],
    index=['Province'],
    columns=['City'],
    aggfunc=np.sum,
    margins=True,
)

table.stack('City')


Out[67]:
Sales
Province City
AL All 12.0
Calgary 8.0
Edmonton 4.0
BC All 16.0
Vancouver 16.0
MN All 3.0
Winnipeg 3.0
ON All 14.0
Toronto 13.0
Windsor 1.0
QC All 6.0
Montreal 6.0
All All 51.0
Calgary 8.0
Edmonton 4.0
Montreal 6.0
Toronto 13.0
Vancouver 16.0
Windsor 1.0
Winnipeg 3.0

In [68]:
grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78]

df = pd.DataFrame(
    {
        'ID': ["x%d" % r for r in range(10)],
        'Gender': ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'],
        'ExamYear': ['2007', '2007', '2007', '2008', '2008', '2008', '2008', '2009', '2009', '2009'],
        'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'],
        'Participated': ['yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes'],
        'Passed': ['yes' if g > 50 else 'no' for g in grades],
        'Employed': [True, True, True, False, False, False, False, True, True, False],
        'Grade': grades,
    }
)

# One custom summary statistic per column, per exam year.
df.groupby('ExamYear').agg(
    {
        'Participated': lambda x: x.value_counts()['yes'],
        'Passed': lambda x: sum(x == 'yes'),
        'Employed': lambda x: sum(x),
        'Grade': lambda x: sum(x) / len(x),
    }
)


Out[68]:
Passed Participated Grade Employed
ExamYear
2007 2 3 74.000000 3
2008 3 3 68.500000 0
2009 2 3 60.666667 2

In [69]:
# Three years of monthly data; pivot as month rows x year columns.
df = pd.DataFrame(
    {'value': np.random.randn(36)},
    index=pd.date_range('2011-01-01', freq='M', periods=36),
)

pd.pivot_table(
    df, index=df.index.month, columns=df.index.year, values='value', aggfunc='sum'
)


Out[69]:
2011 2012 2013
1 -1.452684 -0.175611 2.568449
2 -0.665644 -0.594292 -0.049278
3 -0.250123 -0.343518 0.386894
4 -0.017976 -0.723268 1.407324
5 0.333130 -0.069268 1.229126
6 0.473193 0.265367 0.316684
7 0.479913 -1.657287 -0.715239
8 0.783101 -0.617078 1.745561
9 0.350418 0.883339 -1.838103
10 -0.869617 -0.399451 -0.273889
11 -1.377859 0.947415 0.546281
12 -1.116096 0.337732 -1.366948

Apply


In [70]:
# Rows hold ragged lists; expand each row's lists into a mini-frame.
df = pd.DataFrame(
    data={
        'A': [[2, 4, 8, 16], [100, 200], [10, 20, 30]],
        'B': [['a', 'b', 'c'], ['jj', 'kk'], ['ccc']],
    },
    index=['I', 'II', 'III'],
)

def SeriesFromSubList(aList):
    """Turn one list into a Series (shorter lists pad with NaN on concat)."""
    return pd.Series(aList)

df_orgz = pd.concat(
    {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
)

In [71]:
# 2000 days of small random "returns" in two columns.
df = pd.DataFrame(
    np.random.randn(2000, 2) / 10000,
    index=pd.date_range('2001-01-01', periods=2000),
    columns=['A', 'B'],
)

df


Out[71]:
A B
2001-01-01 -0.000177 -1.062619e-04
2001-01-02 -0.000118 2.352399e-05
2001-01-03 -0.000109 -1.908927e-05
2001-01-04 0.000014 6.876731e-06
2001-01-05 -0.000104 -1.139017e-05
2001-01-06 -0.000109 1.385021e-06
2001-01-07 -0.000134 -1.066389e-05
2001-01-08 -0.000080 -5.897414e-05
2001-01-09 -0.000031 4.268051e-05
2001-01-10 -0.000093 -5.778088e-05
2001-01-11 0.000167 4.121253e-05
2001-01-12 -0.000139 2.648512e-05
2001-01-13 -0.000012 -8.737938e-06
2001-01-14 0.000066 6.144602e-06
2001-01-15 0.000053 1.586173e-05
2001-01-16 0.000047 3.042602e-06
2001-01-17 -0.000076 1.623063e-04
2001-01-18 -0.000054 3.031323e-05
2001-01-19 0.000132 9.108038e-05
2001-01-20 -0.000031 -7.483177e-05
2001-01-21 -0.000115 1.853436e-05
2001-01-22 -0.000109 -7.035200e-05
2001-01-23 -0.000158 6.830860e-05
2001-01-24 -0.000103 -1.041656e-04
2001-01-25 0.000173 2.470797e-05
2001-01-26 -0.000054 -6.445272e-05
2001-01-27 0.000052 1.223023e-05
2001-01-28 -0.000080 5.750368e-05
2001-01-29 -0.000027 -8.341443e-05
2001-01-30 0.000006 5.571757e-05
... ... ...
2006-05-25 -0.000079 2.309412e-06
2006-05-26 0.000062 -7.831948e-05
2006-05-27 0.000018 -7.913012e-05
2006-05-28 -0.000009 1.250437e-04
2006-05-29 -0.000086 2.463508e-06
2006-05-30 0.000002 9.672057e-06
2006-05-31 -0.000018 -3.416930e-05
2006-06-01 -0.000022 -1.149135e-04
2006-06-02 -0.000025 1.038285e-04
2006-06-03 -0.000196 1.479233e-04
2006-06-04 0.000024 2.573207e-05
2006-06-05 -0.000045 7.918676e-05
2006-06-06 0.000023 8.566638e-05
2006-06-07 -0.000012 5.733845e-05
2006-06-08 -0.000020 -7.910414e-05
2006-06-09 0.000220 -1.277131e-05
2006-06-10 0.000049 1.633954e-04
2006-06-11 0.000014 1.404515e-04
2006-06-12 -0.000111 -1.370796e-04
2006-06-13 0.000094 -4.485599e-05
2006-06-14 -0.000074 -5.589514e-05
2006-06-15 -0.000065 -5.347763e-05
2006-06-16 0.000005 1.654913e-04
2006-06-17 0.000085 6.933283e-05
2006-06-18 0.000046 -8.092249e-05
2006-06-19 -0.000007 -1.654069e-07
2006-06-20 -0.000053 -3.263472e-05
2006-06-21 -0.000045 7.719624e-05
2006-06-22 0.000182 1.089065e-04
2006-06-23 -0.000059 -1.130946e-05

2000 rows × 2 columns


In [72]:
def gm(aDF, Const):
    """Compounded growth of A+B over a window, scaled by Const.

    Returns (window start date, final compounded value).
    """
    growth = (((aDF.A + aDF.B) + 1).cumprod() - 1) * Const
    return (aDF.index[0], growth.iloc[-1])

# Apply gm to each rolling 51-row window, collecting the results into
# a Series keyed by each window's start date.
S = pd.Series(dict(gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) for i in range(len(df) - 50)))

S


Out[72]:
2001-01-01   -0.007433
2001-01-02   -0.006756
2001-01-03   -0.006441
2001-01-04   -0.007375
2001-01-05   -0.007382
2001-01-06   -0.006306
2001-01-07   -0.005068
2001-01-08   -0.004309
2001-01-09   -0.003795
2001-01-10   -0.004472
2001-01-11   -0.003385
2001-01-12   -0.005628
2001-01-13   -0.004453
2001-01-14   -0.005013
2001-01-15   -0.005037
2001-01-16   -0.005973
2001-01-17   -0.006561
2001-01-18   -0.007421
2001-01-19   -0.007939
2001-01-20   -0.007620
2001-01-21   -0.007140
2001-01-22   -0.007240
2001-01-23   -0.007114
2001-01-24   -0.006706
2001-01-25   -0.006379
2001-01-26   -0.007299
2001-01-27   -0.007501
2001-01-28   -0.007955
2001-01-29   -0.008009
2001-01-30   -0.006504
                ...   
2006-04-05   -0.003596
2006-04-06   -0.003943
2006-04-07   -0.003039
2006-04-08   -0.002371
2006-04-09   -0.002975
2006-04-10   -0.002760
2006-04-11   -0.002100
2006-04-12   -0.003544
2006-04-13   -0.002847
2006-04-14   -0.003952
2006-04-15   -0.003997
2006-04-16   -0.004950
2006-04-17   -0.004201
2006-04-18   -0.003734
2006-04-19   -0.004964
2006-04-20   -0.003085
2006-04-21   -0.001683
2006-04-22   -0.000210
2006-04-23   -0.001432
2006-04-24    0.000441
2006-04-25    0.000286
2006-04-26   -0.002072
2006-04-27   -0.000909
2006-04-28    0.000166
2006-04-29   -0.000473
2006-04-30    0.000304
2006-05-01    0.000091
2006-05-02   -0.000364
2006-05-03    0.001267
2006-05-04    0.000241
Length: 1950, dtype: float64

In [73]:
rng = pd.date_range(start='2014-01-01', periods=100)

# Fake market data: random prices plus integer volumes.
df = pd.DataFrame(
    {
        'Open': np.random.randn(len(rng)),
        'Close': np.random.randn(len(rng)),
        'Volume': np.random.randint(100, 2000, len(rng)),
    },
    index=rng,
)

df


Out[73]:
Close Open Volume
2014-01-01 0.191597 -0.314054 1241
2014-01-02 -1.717382 0.639787 682
2014-01-03 0.271303 -0.506967 164
2014-01-04 1.197396 -0.716909 1476
2014-01-05 0.492053 0.318468 458
2014-01-06 0.098098 1.119484 119
2014-01-07 1.100037 0.516767 749
2014-01-08 0.478678 0.274834 146
2014-01-09 -0.651548 0.638060 424
2014-01-10 -0.624340 -0.358287 149
2014-01-11 0.058846 0.669787 191
2014-01-12 -0.015866 -0.456204 1016
2014-01-13 -0.490331 1.042980 301
2014-01-14 -0.681969 -1.848677 102
2014-01-15 -0.482265 0.105843 1001
2014-01-16 -0.455891 0.446332 988
2014-01-17 0.080858 0.037827 1824
2014-01-18 -0.487353 0.176800 1418
2014-01-19 0.274929 0.040340 1621
2014-01-20 1.265596 -0.231885 152
2014-01-21 0.573440 1.553562 1556
2014-01-22 -0.556765 1.868245 715
2014-01-23 -0.444104 0.992026 130
2014-01-24 -0.538320 0.453568 926
2014-01-25 1.782628 -0.845000 1531
2014-01-26 -1.022168 -1.516973 1226
2014-01-27 1.153171 0.617113 378
2014-01-28 -0.585982 1.121890 810
2014-01-29 1.522453 0.490162 1848
2014-01-30 -0.143591 -1.057452 151
... ... ... ...
2014-03-12 1.169210 0.330771 1010
2014-03-13 0.837246 0.932081 1634
2014-03-14 -1.246848 1.614713 1877
2014-03-15 1.715415 0.353876 168
2014-03-16 -1.725376 -0.259473 802
2014-03-17 -1.864538 0.438324 361
2014-03-18 -0.665222 2.389269 1331
2014-03-19 0.329992 1.083136 1166
2014-03-20 -0.940032 -0.665577 1756
2014-03-21 0.289460 -0.308027 551
2014-03-22 0.488300 -1.393148 690
2014-03-23 -1.252622 1.201307 1685
2014-03-24 0.021909 -0.157732 1216
2014-03-25 -0.026449 0.847406 702
2014-03-26 0.301914 1.634134 229
2014-03-27 -0.608746 -0.255602 736
2014-03-28 1.761464 1.952497 741
2014-03-29 0.194342 1.240527 769
2014-03-30 -0.753210 0.615735 1829
2014-03-31 0.467182 0.986644 1322
2014-04-01 0.573887 1.207595 1949
2014-04-02 0.252623 -0.701256 1856
2014-04-03 1.212486 0.825473 182
2014-04-04 -0.777999 0.721827 1765
2014-04-05 0.960076 -0.016064 559
2014-04-06 1.152554 -1.259634 862
2014-04-07 1.420554 1.199850 1115
2014-04-08 -1.144628 -1.370731 883
2014-04-09 0.496323 0.809941 1424
2014-04-10 1.101801 0.877663 1088

100 rows × 3 columns


In [74]:
def vwap(bars):
    """Volume-weighted average of the Close prices in `bars`."""
    return (bars.Close * bars.Volume).sum() / bars.Volume.sum()

window = 5

# One single-point Series per window, stamped with the first
# post-window date, concatenated into one result Series.
s = pd.concat(
    [
        pd.Series(vwap(df.iloc[i:i + window]), index=[df.index[i + window]])
        for i in range(len(df) - window)
    ]
)

s.round(2)


Out[74]:
2014-01-06    0.27
2014-01-07    0.30
2014-01-08    0.97
2014-01-09    0.98
2014-01-10    0.45
2014-01-11    0.34
2014-01-12    0.32
2014-01-13   -0.16
2014-01-14   -0.25
2014-01-15   -0.18
2014-01-16   -0.27
2014-01-17   -0.34
2014-01-18   -0.24
2014-01-19   -0.29
2014-01-20   -0.15
2014-01-21   -0.06
2014-01-22    0.15
2014-01-23    0.08
2014-01-24    0.26
2014-01-25    0.04
2014-01-26    0.55
2014-01-27    0.12
2014-01-28    0.32
2014-01-29    0.19
2014-01-30    0.73
2014-01-31    0.34
2014-02-01    0.79
2014-02-02    0.62
2014-02-03    0.88
2014-02-04   -0.43
              ... 
2014-03-12   -1.01
2014-03-13   -0.51
2014-03-14   -0.21
2014-03-15    0.08
2014-03-16   -0.12
2014-03-17   -0.16
2014-03-18   -0.57
2014-03-19   -1.10
2014-03-20   -0.59
2014-03-21   -0.78
2014-03-22   -0.52
2014-03-23   -0.30
2014-03-24   -0.49
2014-03-25   -0.55
2014-03-26   -0.33
2014-03-27   -0.38
2014-03-28   -0.54
2014-03-29    0.26
2014-03-30    0.33
2014-03-31   -0.07
2014-04-01    0.05
2014-04-02    0.27
2014-04-03    0.13
2014-04-04    0.15
2014-04-05    0.15
2014-04-06    0.15
2014-04-07    0.16
2014-04-08    0.44
2014-04-09    0.14
2014-04-10    0.58
Length: 95, dtype: float64

Merge


In [75]:
rng = pd.date_range('2000-01-01', periods=6)

df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=['A', 'B', 'C'])

# An identical copy to stack below.
df2 = df1.copy()

In [76]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.  With
# ignore_index=True the result gets a fresh 0..n-1 index, exactly as
# append(ignore_index=True) did.
df = pd.concat([df1, df2], ignore_index=True)

df


Out[76]:
A B C
0 0.806820 0.068926 -0.374480
1 1.520404 -0.536845 -1.084997
2 0.495336 0.717881 -0.597878
3 -0.956341 0.871390 0.564765
4 -1.121630 -1.217427 1.403706
5 -1.099288 0.046444 -0.615359
6 0.806820 0.068926 -0.374480
7 1.520404 -0.536845 -1.084997
8 0.495336 0.717881 -0.597878
9 -0.956341 0.871390 0.564765
10 -1.121630 -1.217427 1.403706
11 -1.099288 0.046444 -0.615359

In [77]:
# Self-join demo data: area/bin cells with a test sequence number.
df = pd.DataFrame(
    data={
        'Area': ['A', 'A', 'A', 'A', 'A', 'C', 'C'],
        'Bins': [110, 110, 160, 160, 160, 40, 40],
        'Test_0': [0, 1, 0, 1, 2, 0, 1],
        'Data': np.random.randn(7),
    }
)

df


Out[77]:
Area Bins Data Test_0
0 A 110 -0.805778 0
1 A 110 0.173171 1
2 A 160 0.433174 0
3 A 160 0.337607 1
4 A 160 -0.067628 2
5 C 40 0.063513 0
6 C 40 -1.090058 1

In [78]:
# Join each row to the row with the next Test_0 in the same Area/Bins
# cell, by matching Test_0 on the left to Test_0 - 1 on the right.
df['Test_1'] = df['Test_0'] - 1

pd.merge(
    df,
    df,
    left_on=['Bins', 'Area', 'Test_0'],
    right_on=['Bins', 'Area', 'Test_1'],
    suffixes=('_L', '_R'),
)


Out[78]:
Area Bins Data_L Test_0_L Test_1_L Data_R Test_0_R Test_1_R
0 A 110 -0.805778 0 -1 0.173171 1 0
1 A 160 0.433174 0 -1 0.337607 1 0
2 A 160 0.337607 1 0 -0.067628 2 1
3 C 40 0.063513 0 -1 -1.090058 1 0

Plotting


In [79]:
# Random stratifying variable and prices for a grouped box plot.
df = pd.DataFrame({
    'stratifying_var': np.random.uniform(0, 100, 20),
    'price': np.random.normal(100, 5, 20)
})

# Bucket the stratifying variable into labelled quartiles...
df['quartiles'] = pd.qcut(
    df['stratifying_var'],
    4,
    labels=['0-25%', '25-50%', '50-75%', '75-100%']
)

# ...and box-plot price within each bucket (requires matplotlib).
df.boxplot(column='price', by='quartiles')


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8af7b63a58>

Data In/Out

CSV

Reading multiple files to create a single DataFrame

Parsing date components in multi-columns

Skip row between header and data

Option 1: pass rows explicitly to skiprows

Option 2: read column names and then data

SQL

Excel

Computation

Timedeltas

Aliasing Axis Names


In [ ]:

Creating Example Data


In [80]:
def expand_grid(data_dict):
    """Cartesian product of the dict's value lists, one column per key."""
    combos = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(combos, columns=data_dict.keys())

df = expand_grid(
    {
        'height': [60, 70],
        'weight': [100, 140, 180],
        'sex': ['Male', 'Female'],
    }
)

df


Out[80]:
sex height weight
0 Male 60 100
1 Male 60 140
2 Male 60 180
3 Male 70 100
4 Male 70 140
5 Male 70 180
6 Female 60 100
7 Female 60 140
8 Female 60 180
9 Female 70 100
10 Female 70 140
11 Female 70 180