In [3]:
from __future__ import division, print_function
In [4]:
from IPython.display import HTML, Image, display as disp
In [5]:
# Load the notebook's custom CSS theme.
# Use a distinct name (`css`, not `style`) so the value is not clobbered by
# the later `from mpltools import style` import in the setup cell.
with open("../css/css.css", "r") as f:
    css = f.read()
HTML(css)
Out[5]:
In [6]:
!./shortlog.sh ../../pandas
pandas fills the need for a Python library that is awesome at data munging. With pandas you don't need to go outside of Python to do data analysis. You can write a web app using django and have data dashboards that are backed by pandas.
pandas
is the library for tabular data you've always wished you had. It supports operations such as
groupby
merge
mean
, sum
, any
, all
, etc.
pandas
was designed to work with "medium" data, i.e., data that fit into memory.
In [7]:
disp(Image("../img/m.jpg"))
In [8]:
disp(Image("../img/u.jpg"))
pip install sh
urllib
et al. are less than desirable
In [9]:
# imports that I'll use throughout the talk
import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Index
from numpy.random import randn, randint, rand, choice
import matplotlib.pyplot as plt
# truncate large frames to 10 rows when displayed
pd.options.display.max_rows = 10
# mpltools is optional: use its ggplot style if installed, else fall back
# to matplotlib defaults (NOTE(review): this import rebinds the name
# `style`, clobbering the CSS string read in the earlier cell)
try:
from mpltools import style
style.use('ggplot')
except ImportError:
pass
# because of our bg color
# (slides have a dark background, so render all figure text/ticks in white)
plt.rc('text', color='white')
plt.rc('axes', labelcolor='white')
plt.rc('xtick', color='white')
plt.rc('ytick', color='white')
In [10]:
%matplotlib inline
In [11]:
df = pd.DataFrame(randn(10, 2), columns=list('ab'))
df
Out[11]:
In [ ]:
# if you have boto installed and have set up credentials
# df = pd.read_csv('s3://nyqpug/tips.csv')
In [12]:
df = pd.read_csv('https://s3.amazonaws.com/nyqpug/tips.csv')
df
Out[12]:
In [13]:
df.dtypes
Out[13]:
In [14]:
# column access by name
df['day'] # ⟵ that's a Series object
Out[14]:
In [15]:
# by attribute
df.time
Out[15]:
In [16]:
df.time.value_counts()
Out[16]:
In [17]:
# multiple columns
df[['tip', 'sex']]
Out[17]:
In [18]:
t = df.set_index('day')
t.head()
Out[18]:
In [ ]:
t.loc['Sun']
In [19]:
df.loc[df.day == 'Sun']
Out[19]:
In [21]:
df.loc[:, 'smoker']
Out[21]:
In [22]:
df.iloc[:, 3] # same as df.loc[:, 'smoker']
Out[22]:
In [23]:
df['pct_tip'] = df.tip / df.total_bill
# Use df['size'], not df.size: attribute access resolves to the DataFrame
# property `size` (total number of elements, rows * cols), which shadows the
# 'size' column and silently produces wrong per-person prices.
df['avg_price'] = df.total_bill / df['size']
In [24]:
df.avg_price.hist(bins=20)
Out[24]:
In [25]:
del df['avg_price']
del df['pct_tip']
In [26]:
# multiple columns, multiple rows
df.loc[[0, 2], ['sex', 'tip']]
Out[26]:
In [27]:
df.loc[:10, ['total_bill', 'tip']] # note this is inclusive
Out[27]:
In [28]:
# and with iloc
df.iloc[:5] # exclusive endpoints
Out[28]:
Apply a function over the columns of a DataFrame
In [29]:
# np.ptp is peak-to-peak difference, i.e., range
df[['total_bill', 'tip', 'size']].apply(np.ptp)
Out[29]:
In [31]:
days = ['Sun', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat']
In [32]:
df.day.map(days.index)
Out[32]:
For example, the DataFrame.sum()
method results in a Series
object and by default sums across the rows. If you want to sum across the columns pass axis=1
, similar to the numpy
convention
In general, reduction operations result in a Series
object.
Some example reduction functions:
mean
, median
, and mode
count
std
, var
NOTE: These ignore NaN
s
In [33]:
df.sum() # whoa! + is defined for strings
Out[33]:
In [34]:
df.sum(numeric_only=True)
Out[34]:
In [37]:
df.count() / df.shape[0] # we don't have any nans
Out[37]:
In [35]:
# restrict to numeric columns explicitly (consistent with the
# df.sum(numeric_only=True) cell above; newer pandas raises on the
# string columns sex/smoker/day/time otherwise)
df.var(numeric_only=True)
Out[35]:
In [36]:
# restrict to numeric columns explicitly (consistent with the
# df.sum(numeric_only=True) cell above; newer pandas raises on the
# string columns sex/smoker/day/time otherwise)
df.mean(numeric_only=True)
Out[36]:
pandas
' GroupBy
functionality allows you to perform operations on subsets of a DataFrame
and then combines the results for you at the end
In [38]:
df.head()
Out[38]:
In [39]:
gb = df.groupby('sex')
gb.mean()
Out[39]:
In [40]:
gb = df.groupby(['sex', 'smoker'])
gb.std()
Out[40]:
In [41]:
# can pass multiple reducers to agg
gb.agg(['mean', 'std', 'median'])
Out[41]:
GroupBy.apply()
takes a callable
and calls it on each group
In [42]:
def stdize(x):
    """Standardize x: subtract the mean, divide by the (sample) std.

    Returns a z-scored object of the same shape as x.
    """
    mu = x.mean()
    sigma = x.std()
    return (x - mu) / sigma
# transform is the idiomatic GroupBy tool for same-shape, group-wise
# operations: it applies the function per group and returns a Series
# aligned with the original index (same result as apply here, clearer intent)
df['tb_std'] = gb.total_bill.transform(stdize)
df
Out[42]:
In [43]:
# Overlay a normalized histogram on the KDE of the standardized total bill.
ax = df.tb_std.plot(kind='kde', lw=3)
# `normed` was deprecated and removed from matplotlib's hist();
# `density=True` is the replacement for a probability-density histogram.
df.tb_std.hist(ax=ax, density=True)
ax.set_xlabel('$z$-score', fontsize=20)
ax.set_title(r'Total Bill Stdized Across Sex$\times$ Smoker')
ax.axis('tight')
plt.gcf().tight_layout()
Oftentimes you have data sets of different shapes with a common column that you want to join on
pandas
does this a couple of ways, but the main entry point is pandas.merge
In [45]:
# create some frames
# NOTE(review): no RNG seed is set, so these frames differ on every run —
# consider np.random.seed(...) for reproducibility.
n = 2000
# likes table is half the size of visits
n2 = n // 2
# visits: user_id drawn from 9 possible users
visits = DataFrame({'page_visits_per_day': np.random.poisson(10, size=n),
'user_id': randint(9, size=n)})
# likes: user_id drawn from only 6 users, so the merge below keeps the overlap
likes = DataFrame({'likes_per_day': np.random.poisson(30, size=n2),
'user_id': randint(6, size=n2)})
visits
Out[45]:
In [46]:
likes
Out[46]:
In [47]:
# inner join on the shared 'user_id' column (pd.merge infers the key)
merg = pd.merge(visits, likes)
# DataFrame.sort was deprecated and later removed; sort_values is the
# replacement for sorting by column values.
merg.sort_values('user_id')
Out[47]:
In [48]:
s = df.day
s
Out[48]:
In [49]:
is_weekend = s.str.startswith('S')
is_weekend
Out[49]:
In [50]:
correct = s[is_weekend].str.contains(r'^(?:Sat|Sun)$')
correct.all()
Out[50]:
In [51]:
s.str.len()
Out[51]:
In [52]:
s.str[:2]
Out[52]:
In [53]:
n = 10000
idx = pd.date_range(start='today', periods=n, freq='D')
idx
Out[53]:
In [55]:
s = Series(np.random.poisson(10, size=n), index=idx, name='login_count')
s
Out[55]:
In [56]:
# resample(..., how='sum') was deprecated and removed; the method-chain
# form .resample('W').sum() is the modern equivalent
s.resample('W').sum()
Out[56]:
In [57]:
# multiple functions
# the deprecated how=[...] list argument becomes an .agg([...]) call,
# producing one column per aggregation function
rs = s.resample('W').agg(['mean', 'std', 'count'])
rs
Out[57]:
In [58]:
# Step-plot the first 50 weekly mean login counts.
weekly_mean = rs['mean']
fig, ax = plt.subplots(figsize=(12, 6))
first_weeks = weekly_mean.iloc[:50]
ax.step(first_weeks.index, first_weeks, lw=3, where='post')
fig.tight_layout()
ax.set_xlabel('Time')
ax.set_ylabel('Average Count / Week')
ax.set_title('Count vs. Time')
Out[58]:
In [59]:
%%writefile tmp.csv
a,b,c
1,d,3.0
4,e,6.28
2,f,4.4
In [60]:
df = pd.read_csv('tmp.csv')
df
Out[60]:
In [61]:
df.dtypes
Out[61]:
In [62]:
df.to_csv('tmp.csv')
In [63]:
%cat tmp.csv # saves the index created by default
In [64]:
# NOTE(review): unseeded RNG — this frame changes on every run; seed for
# reproducibility if the HDF round-trip below needs stable contents.
df = DataFrame(dict(a=randn(10), b=randint(10, size=10),
c=choice(list('abc'), size=10)))
df
Out[64]:
In [65]:
df.to_hdf('tmp.h5', 'df', format='table')
In [66]:
pd.read_hdf('tmp.h5', 'df')
Out[66]:
In [68]:
!ptdump tmp.h5