In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
In [2]:
# Read the "Deaths in 122 U.S. cities" CSV export into a DataFrame.
filename='TABLE_III._Deaths_in_122_U.S._cities.csv'
df = pd.read_csv(filename)
# Preview the first rows to check that the file parsed into the expected columns.
df.head()
Out[2]:
In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
Out[3]:
The dataset contains many NaN values, but it's not clear how to treat them all. I will take the NaNs into account afterwards.
In [4]:
# Restrict the report to 2016 for simplicity, and keep only the total
# (all-ages) death count for each reporting area.
df_2016 = df[df['MMWR YEAR'] == 2016]
death_per_area_raw = df_2016[
    ['Reporting Area', 'All causes, by age (years), All Ages**']
].rename(columns={'All causes, by age (years), All Ages**': 'Deaths'})

# Drop rows with a missing value. The row count is printed before and after
# so the number of discarded rows is visible.
print(len(death_per_area_raw))
death_per_area = death_per_area_raw.dropna()
print(len(death_per_area))
In [5]:
# Column-wise means of the numeric columns.
# numeric_only=True is required: since pandas 2.0, DataFrame.mean() no longer
# silently skips non-numeric columns (presumably e.g. 'Reporting Area') and
# raises a TypeError instead.
df.mean(numeric_only=True)
Out[5]:
In [18]:
# Mean value of each numeric column, reduced to the five age-bracket columns.
# numeric_only=True keeps this working on pandas >= 2.0, where non-numeric
# columns make DataFrame.mean() raise instead of being skipped.
# NOTE(review): the positional slice [3:8] assumes the age-bracket columns sit
# at positions 3-7 of the numeric means, in the same order as `categories`
# -- confirm against the output of df.mean(numeric_only=True).
means = df.mean(numeric_only=True).values[3:8]
categories = ['>=65', '45-64', '25-44', '1-24', 'LT-1']
categories_ids = [1, 2, 3, 4, 5]

# Each label needs exactly one mean, otherwise the bar plot would be mislabeled.
assert len(means) == len(categories), "expected one mean per age category"
means
Out[18]:
In [22]:
# Bar plot of the mean deaths per age category.
# Note: in this notebook `plt` is the top-level matplotlib package
# (`import matplotlib as plt`), so pyplot is reached as `plt.pyplot`.

# Use the "poster" context with a larger font scale so labels stay readable.
sns.set_context("poster", font_scale=1.5)

# Create the figure and axes explicitly and draw the bars onto that axes.
fig, ax = plt.pyplot.subplots(figsize=(15, 4))
sns.barplot(x=categories, y=means, palette="Blues_d", ax=ax)

# Axis labels and title so the figure can be read on its own.
ax.set(xlabel='Age category', ylabel='Deaths mean', title="Deaths per age category")

# Show the plot. seaborn >= 0.8 no longer exposes pyplot as `sns.plt`, so
# call matplotlib's pyplot directly.
plt.pyplot.show()
This plot shows the mean number of deaths per age category. As expected, the number of deaths increases with age.
In [ ]: