In [ ]:
%run setup_env.py
%matplotlib inline

10 Minutes to pandas

Object Creation


In [ ]:
log.info('creating Series')
a = pd.Series((1,2,4,np.nan,5,9))
display_pretty(a)

log.info('creating DataFrame using index and columns')
dates = pd.date_range('20140301', periods=6, freq='M')
df = pd.DataFrame(np.random.rand(6, 4), index=dates, columns=list('ABCD'))
display_pretty(df)

log.info('creating DataFrame using dict')
df2 = pd.DataFrame({
    'C1': 1,
    'C2': np.arange(4),
    'C3': pd.Series(1, index=range(4)),
    'C4': pd.Categorical(['A', 'B', 'C', 'D']),
    'C5': 'Ubuntu',
    'C6': 'OK',
})
display_pretty(df2)

Viewing data

  • df.head
  • df.tail
  • df.index
  • df.columns
  • df.values
  • df.describe
  • df.sort
  • df.sort_index

Selection

getting

df['A']
df[:3]
df['20140301':'20140501']

selection by Label

df.loc[dates[0]]
df.loc[:, ['A', 'B']]
df.loc['20140331':'20140531', ['A', 'B']]
df.loc['20140331', ['A', 'B']]
df.loc[dates[0], 'A']
df.at[dates[0], 'A']

selection by Position

df.iloc[3]
df.iloc[3:5, 0:2]
df.iloc[:, [0, 3]]
df.iloc[1,1]
df.iat[1,1]

Boolean Indexing

df[df.A > 0]
df[df > 0]
df2[df2['C4'].isin(['A'])]

Setting

Missing Data

df.dropna(how='any')
df.fillna(value=3)
df.isnull(df)

Operation

Stats

df.mean()
df.mean(1)
df.sub    #

Apply

df.apply(np.cumsum)
df.apply(lambda x: x.max() - x.min())

Histogramming

s = pd.Series(np.random.randint(0, 7, 10))
s.value_counts()

String Methods

df2['C4'].str.lower()

Merge

concat

df = pd.DataFrame(np.random.randn(10, 4))
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

join


In [ ]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
pd.merge(left, right, on='key')

append

df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df.append(df.iloc[3], ignore_index=True)

In [ ]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df.append(df.iloc[3], ignore_index=True)

Grouping


In [ ]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df.groupby('A').sum()

In [ ]:
df.groupby(['A', 'B']).sum()

Reshaping

Stack


In [ ]:
df2 = df.groupby(['A', 'B']).sum()
stacked = df2.stack()
log.info('stacke')
display_pretty(stacked)
log.info('unstack')
display_pretty(stacked.unstack())
log.info('unstack 1')
display_pretty(stacked.unstack(1))
log.info('unstack 0')
display_pretty(stacked.unstack(0))

Pivot tables


In [ ]:
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Time Series


In [ ]:
rng = pd.date_range('2014/1/1', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min', how='sum')
ts.to_period('M')

Categoricals

Plotting


In [ ]:
ts = pd.Series(np.random.randn(1000), index=pd.date_range('2014/1/1', periods=1000))
ts.cumsum().plot()

In [ ]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure(); df.plot(); plt.legend(loc='best')

Getting Data In/Out

CSV

df.to_csv
pd.read_csv

HDF5

Excel

df.to_excel('foo.xlsx', sheet_name='Sheet1')
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])