In [1]:
import pandas as pd
Downloaded tuberculosis (TB) dataset from the World Health Organisation (http://www.who.int/tb/country/data/download/en/).
In [2]:
data = pd.read_csv('data/TB_notifications_2016-12-18.csv')
In [3]:
data.shape
Out[3]:
In [4]:
data.head()
Out[4]:
In [5]:
data.filter(regex='^new').head()
Out[5]:
In [6]:
data.filter(regex='^new.*[0-9]$').head()
Out[6]:
In [7]:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.filter.html
data.filter(like='514').head()
Out[7]:
In [8]:
data.filter(like='514').max()
Out[8]:
In [9]:
(data.filter(like='514') + 1).max()
Out[9]:
In [10]:
# Pass an anonymous function to apply()
data.filter(like='514').apply(lambda x: x + 1).max()
Out[10]:
Subset the data to mimic the who dataset from R package tidyr.
In [11]:
new = data.filter(regex='^new.*(f|m).*[0-9]$')
In [12]:
crit = new.columns.map(lambda x: x.endswith('04') | x.endswith('514'))
In [13]:
new.columns[~crit]
Out[13]:
In [14]:
# http://pandas.pydata.org/pandas-docs/stable/merging.html
df = pd.concat([data[['country', 'g_whoregion', 'year']], data[new.columns[~crit]]], axis=1)
df.head()
Out[14]:
What are the variables in this dataset?
In [15]:
import re
In [16]:
df.rename(columns=lambda x: re.sub('newrel', 'new_rel', x), inplace=True)
In [17]:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html
melted = pd.melt(df, id_vars=['country', 'g_whoregion', 'year'], var_name='key', value_name='cases')
In [18]:
melted.head()
Out[18]:
In [19]:
melted[['new', 'type', 'sexage']] = melted['key'].apply(lambda x: pd.Series(x.split('_')))
In [20]:
melted.head()
Out[20]:
In [21]:
melted.drop('new', axis=1, inplace=True)
In [22]:
melted.drop('key', axis=1, inplace=True)
In [23]:
melted.head()
Out[23]:
In [24]:
melted[['sex']] = melted['sexage'].apply(lambda x: pd.Series(x[0]))
In [25]:
melted[['age_range']] = melted['sexage'].apply(lambda x: pd.Series(x[1:]))
In [26]:
melted.drop('sexage', axis=1, inplace=True)
In [27]:
melted.head()
Out[27]:
In [28]:
melted.tail()
Out[28]:
In [29]:
melted.to_csv('data/tidy_who.csv', index=False)
How about types? Function convert_objects() seems worth looking into.
In [30]:
# http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
melted['cases'].dtypes
Out[30]:
And?
In [31]:
melted.groupby('type').size()
Out[31]:
In [32]:
melted['type'].value_counts()
Out[32]:
In [33]:
import matplotlib
%matplotlib inline
In [34]:
melted.groupby(('year', 'sex'))['cases'].sum().unstack().plot()
Out[34]: