An introduction to Pandas Library

Python Coffee February 18th, 2016


In [ ]:
import pandas as pd
import matplotlib.pyplot as pl
import datetime as dt
%matplotlib notebook

In [ ]:
pd.options.display.max_rows = 20
pd.options.display.max_columns = 100

Reading Data


In [ ]:
pwv = pd.read_csv('pwv_APEX_3h.csv', na_values='NaN')

In [ ]:
measurements = pd.read_csv('sc_measurements.csv')
sources = pd.read_csv('sc_sources.csv')

In [ ]:
targets = pd.read_csv('sg_targets.csv')

In [ ]:
execblocks = pd.read_csv('aqua_exeblock.csv', index_col=0)
execblocks.rename(columns={'SB_UID.1': 'SB_UID'}, inplace=True)

Inspecting Data


In [ ]:
pwv.values

In [ ]:
pwv

In [ ]:
# Get a quick summary of the dataframe
pwv.info()

In [ ]:
# Get the first columns
pwv.head()

In [ ]:
# Get the last rows
pwv.tail()

In [ ]:
# Extract/Slice data (rows)
pwv[:3]

In [ ]:
pwv.ix[:3,1:3]

In [ ]:
pwv[['Month', 'Day']][:3]

In [ ]:
pwv['PWV']

In [ ]:
?pd.read_csv

In [ ]:
print pwv.PWV.min()
print pwv.PWV.max()
print pwv.PWV.mean()
print pwv.PWV.median()
print pwv.PWV.std()

In [ ]:
pwv.PWV.describe()

In [ ]:
pwv.describe()

In [ ]:
pl.figure(1)
pwv.PWV.hist(bins=40)

In [ ]:
pl.figure(2)
pwv.PWV.plot()

In [ ]:
pl.figure(3)
pwv.plot(y='PWV')

Selecting/Querying Data


In [ ]:
execblocks.info()

In [ ]:
execblocks.head()

In [ ]:
execblocks[['QA0STATUS', 'SE_STATUS']][:7]

In [ ]:
execblocks['QA0STATUS'].unique()

In [ ]:
execblocks['QA0STATUS'].value_counts()

In [ ]:
pl.figure(7)
execblocks['QA0STATUS'].value_counts().plot(kind='bar')

In [ ]:
execblocks['SB_UID'].value_counts()

In [ ]:
pl.figure()
execblocks['SB_UID'].value_counts()[:10].plot(kind='bar')

In [ ]:
execblocks['QA0STATUS'] == "Pass"

In [ ]:
execblocks[execblocks['QA0STATUS'] == "Pass"]

In [ ]:
execblocks.query('QA0STATUS == "Pass"')

In [ ]:
execblocks.query('QA0STATUS == "Pass" and SE_STATUS == "FAIL"')

In [ ]:
sblist = execblocks['SB_UID'].value_counts()[:10].index.values
sblist

In [ ]:
mostobserved = execblocks.query('SB_UID in @sblist')
mostobserved.head()

In [ ]:
mostobserved.groupby(['SB_UID', 'QA0STATUS']).aggregate({'EXECBLOCKUID': pd.np.count_nonzero, 'delta': pd.np.mean})

In [ ]:
table1 = mostobserved.groupby(['SB_UID', 'QA0STATUS']).aggregate({'EXECBLOCKUID': pd.np.count_nonzero, 'delta': pd.np.mean})
table1.unstack()

In [ ]:
table1.to_excel('table1.xls')

In [ ]:
execblocks.dropna().apply(lambda x: x['delta'] * 45., axis=1)

In [ ]:
execblocks['day'] = execblocks.apply(lambda x: x['STARTTIME'][:10], axis=1)

In [ ]:
execblocks

In [ ]:
effi = execblocks.groupby(['day', 'QA0STATUS']).aggregate({'EXECBLOCKUID': pd.np.count_nonzero})
effi

In [ ]:


In [ ]:
effipl = effi.unstack()['EXECBLOCKUID'].reset_index()

In [ ]:
effipl['Date'] = effipl.apply(lambda x: dt.datetime.strptime(x['day'], '%Y-%m-%d'), axis=1)

In [ ]:
effipl

In [ ]:
ax = effipl.plot(x='Date', y='Pass')
effipl.plot(x='Date', y='Fail', ax=ax)

In [ ]:
pwv['Date'] = pwv.apply(lambda x: 
                        dt.datetime(int(x['Year']), int(x['Month']), int(x['Day']), int(x['Hour'])), 
                        axis=1)

In [ ]:
pwv.set_index('Date', inplace=True, drop=False)