In [1]:
import pandas as pd, seaborn as sns, numpy as np
sns.set(font='Bitstream Vera Sans')
sns.set_context('poster', rc={'figure.figsize': (12, 8)})
pd.options.display.max_rows = 8
%matplotlib inline
%cd ~/2014_fall_ASTR599/notebooks/
%run talktools.py
%cd ~/notebook/
This notebook was put together by [Abraham Flaxman](http://www.math.cmu.edu/~adf/) for UW's [Astro 599](http://www.astro.washington.edu/users/vanderplas/Astr599_2014/) course. Source and license info is on [GitHub](https://github.com/jakevdp/2014_fall_ASTR599/).
In [2]:
import pandas as pd
In [3]:
import pandas as pd # we did this already!
# did it work?
In [4]:
pd.__version__
Out[4]:
In [ ]:
pd.DataFrame(  # press shift-tab here to pop up the docstring
In [6]:
# loading a csv (fname is a placeholder for the path to your file)
df = pd.read_csv(fname)
In [ ]:
# loading an excel file
df = pd.read_  # press tab here to list all of the pd.read_* loaders
In [ ]:
# loading a stata file
df = pd.read_  # try tab-completion yourself
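For reference, here is roughly what those calls look like once completed. This is a minimal sketch; the file names are hypothetical placeholders:

In [ ]:
# the pd.read_* family all return DataFrames
df_csv = pd.read_csv('my_data.csv')      # comma-separated values
df_xls = pd.read_excel('my_data.xlsx')   # Excel workbook (first sheet by default)
df_dta = pd.read_stata('my_data.dta')    # Stata dataset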
In [7]:
df = pd.DataFrame({'a': [10, 20, 30],
                   'b': [40, 50, 60]})
In [8]:
df
Out[8]:
In [18]:
# think of DataFrames as numpy arrays plus row and column labels
df.columns
df.index
df.b.index is df.index
Out[18]:
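Those labels are what separate a DataFrame from a plain array, and you can set them explicitly when building one. A small sketch (the row labels here are made up for illustration):

In [ ]:
# same toy data, but with explicit row labels
df2 = pd.DataFrame({'a': [10, 20, 30],
                    'b': [40, 50, 60]},
                   index=['x', 'y', 'z'])
df2.loc['y', 'b']  # look things up by label instead of position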
From the Global Health Data Exchange, load the PHMRC verbal autopsy (VA) adult data in CSV format:
In [19]:
url = 'http://ghdx.healthdata.org/sites/default/files/'\
      'record-attached-files/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv'
In [20]:
df = pd.read_csv(url)
In [21]:
# the plain read warns about mixed dtypes;
# low_memory=False reads the whole file at once instead of in chunks
df = pd.read_csv(url, low_memory=False)
In [22]:
df
Out[22]:
In [23]:
# also load codebook (excel doc)
url = 'http://ghdx.healthdata.org/sites/default/files/'\
      'record-attached-files/IHME_PHMRC_VA_DATA_CODEBOOK_Y2013M09D11_0.xlsx'
cb = pd.read_excel(url)
In [27]:
cb.head()
Out[27]:
In [29]:
# each column of pd.DataFrame is a pd.Series
cb.module
Out[29]:
In [44]:
# can use square-brackets instead of "dot"
# (useful if column name has spaces!)
cb['module']
Out[44]:
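A quick illustration of why the square-bracket form matters (the column name here is invented for the example):

In [ ]:
# attribute access breaks on names with spaces; brackets always work
tmp = pd.DataFrame({'nice name': [1, 2], 'module': ['Adult', 'Child']})
tmp['nice name']
# tmp.nice name   <- this would be a SyntaxError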
In [45]:
# what's in this series?
cb.module.value_counts()
Out[45]:
In [46]:
# accessing individual cells
df.head()
Out[46]:
In [47]:
# to access by row and column
# can use names
df.loc[4, 'gs_text34']
Out[47]:
In [48]:
# or use numbers
df.iloc[4, 3]
Out[48]:
In [49]:
# same result, because column 3 is gs_text34
# how to check?
df.columns[3]
Out[49]:
In [50]:
df.head()
Out[50]:
In [51]:
# make the row names more interesting than numbers starting from zero
df.index = ['person %d'%(i+1) for i in df.index]
In [52]:
# have a look at the first ten rows and first five columns
df.iloc[:10, :5]
Out[52]:
In [53]:
# slicing with named indices is possible, too
df.loc['person 10', 'site':'gs_text34']
Out[53]:
notice anything weird about that, though?
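The weird part: unlike ordinary Python slicing, label-based slicing with .loc includes both endpoints, so the 'gs_text34' column is part of the result. Compare:

In [ ]:
# .iloc slices like numpy arrays: the endpoint is excluded
df.iloc[0, 0:3]
# .loc slices by label: both endpoints are included
df.loc['person 1', 'site':'gs_text34']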
In [54]:
# logical operations (element-wise)
df.module == "Adult"
Out[54]:
In [55]:
# useful for selecting a subset of rows
dfa = df[df.module == "Adult"]
In [56]:
# summarize
dfa.va34.describe(percentiles=[.025,.975])
Out[56]:
In [57]:
# percent of occurrences
100 * dfa.gs_text34.value_counts(normalize=True).round(3)
Out[57]:
In [58]:
# calculate average
dfa.word_fever.mean()
Out[58]:
In [59]:
# create visual summaries
dfa.site.value_counts(ascending=True).plot(kind='barh', figsize=(12,8))
Out[59]:
In [60]:
df.groupby('gs_text34')  # this returns a lazy GroupBy object; nothing is computed yet
Out[60]:
In [ ]:
df.word_activ  # tab-completion works for column names, too
In [61]:
df.groupby('gs_text34').word_fever.mean()
Out[61]:
In [64]:
(dfa.groupby('gs_text34').word_activ.mean() * 100).order().plot(kind='barh', figsize=(12,12))
Out[64]:
In [63]:
df.filter(like='word').head()
Out[63]:
In [ ]:
# get a smaller table to experiment with
t = df.filter(like='word').describe()
t
In [ ]:
# transpose table
t.T.head()
In [ ]:
# convert from "wide" to "long"
t_long = t.T.stack()
t_long
In [ ]:
# convert back
t_long.unstack()
In [ ]:
# challenge: find the most common word
In [ ]:
df.head()
In [ ]:
piv = pd.pivot_table(df, values='word_fever',
                     index=['site'], columns=['gs_text34'],
                     aggfunc=sum)
piv
In [ ]:
causes = list(piv.columns)
piv['site'] = piv.index
fever_cnts = pd.melt(piv, id_vars=['site'], value_vars=causes,
                     var_name='cause of death',
                     value_name='number w fever mentioned')
fever_cnts.head()
In [ ]:
# cleaner to drop the rows with NaNs
fever_cnts.dropna().head()
In [ ]:
# put together first three and last three rows of df
df.iloc[:3].append(df.iloc[-3:])
In [ ]:
df.iloc[:5,20:25]
In [ ]:
cb.iloc[10:15]
In [ ]:
pd.merge(  # shift-tab here to inspect the merge signature
In [ ]:
# transpose df so its columns become rows, then line them up with the codebook
merged_df = pd.merge(df.T, cb, left_index=True, right_on='variable')
merged_df.filter(['variable', 'question', 'person 1', 'person 2']).dropna().head(10)
In [ ]:
df.groupby('gs_text34').word_fever.mean()
In [ ]:
for g, dfg in df.groupby('gs_text34'):
    print g
    # process the DataFrame for this group
    break
In [ ]:
# apply
def my_func(row):
    return row['word_xray'] > 0

my_func(df.iloc[0])
df.apply(my_func, axis=1)
In [ ]:
rng = pd.date_range('1/1/2012', periods=1000, freq='S')  # 1000 timestamps, one per second
In [ ]:
rng
In [ ]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
In [ ]:
# aggregate the one-second counts into five-minute bins
ts.resample('5Min', how='sum')
In [ ]:
# the variable name you need is in here:
cb.head()
In [ ]:
# hint: look here
cb.question.str
In [ ]:
var =  # fill in the variable name for the fever question
The "endorsement rate" for a sign or symptom of disease is the fraction of verbal autopsy interviews where the respondent answered "Yes" to the question about that sign/symptom.
In [ ]:
# create a new column with 0/1 values instead of Yes/No/Don't Know/Refused to Answer
df['fever'] = df[var] == 'Yes'
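With that 0/1 column in hand, the overall endorsement rate is just a mean; a minimal sketch:

In [ ]:
# endorsement rate for fever across all interviews, as a percent
100 * df.fever.mean()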
In [ ]:
# use groupby like we did in class
In [ ]:
import matplotlib.pyplot as plt
# do some plotting like we did in class
plt.xlabel('Endorsement Rate (%)')
In [ ]:
# this might be familiar now...
In [ ]:
# could be good to start by making some numeric columns like above
In [ ]:
# groupby is a way to go, but not the only way
In [ ]:
# note: there are lots of ways to do this
# try to find one that is so simple, you will understand it next time you look
In [ ]:
# some interesting differences. if you want to know why, you might be a social scientist...