Lets make pictures


In [1]:
import pandas as pd
from pandas import DataFrame
import pickle
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
%matplotlib inline

In [2]:
df = pd.read_pickle('.././data/pickle/pypf_prep.pkl')
df.index = df.Year
#df1 = df #lets save this result
#df = df[df['Agegroup'] != 'ALL AGES'] #lets throw away all ages rows

In [3]:
#graph a la Navaratnam 2011

grp = df[(df['Cause'] == 'IPF') & (df['Year'] < pd.to_datetime('2009')) & (df['Agegroup'] == 'ALL AGES')].groupby('Year')

data = grp['Estimated deaths age standardised to 2008 population'].sum()

major = mdates.YearLocator(2)
minor = mdates.MonthLocator()

plt.figure(figsize=(7, 7))
x = data.index
y = data.values

plt.xticks(rotation=70)
plt.ylim((0,3500))
plt.ylabel('Deaths')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for IPF')

plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500), 
               arrowprops=dict(arrowstyle="->")) 
plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000), 
               arrowprops=dict(arrowstyle="->")) 
plt.grid(True)
p1, = plt.plot(x, y, 'k--', linewidth=2.0) 

ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1], ["Expected number of deaths"], loc='upper left', frameon=False)

#ax.xaxis.set_minor_locator(minor)

plt.savefig('.././fig/IPF mortality trends in England and Wales to 2008.png')



In [4]:
#graph a la Navaratnam 2011

grp = df[(df['Cause'] == 'IPF') &(df['Agegroup'] == 'ALL AGES')].groupby('Year')

data = grp['Estimated deaths age standardised to 2008 population'].sum()

major = mdates.YearLocator(2)
minor = mdates.MonthLocator()

plt.figure(figsize=(7, 7))
x = data.index
y = data.values

plt.xticks(rotation=70)
plt.ylim((0,4000))
plt.ylabel('Deaths')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for IPF')

plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500), 
               arrowprops=dict(arrowstyle="->")) 
plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000), 
               arrowprops=dict(arrowstyle="->")) 
plt.grid(True)
p1, = plt.plot(x, y, 'k--', linewidth=2.0) 

ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1], ["Expected number of deaths"], loc='upper left', frameon=False)

#ax.xaxis.set_minor_locator(minor)

plt.savefig('.././fig/IPF mortality trends in England and Wales to 2012.png')



In [5]:
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')].groupby('Year')
data = grp['Estimated deaths age standardised to 2008 population'].sum()

grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')].groupby('Year')
data1 = grp['Estimated deaths age standardised to 2008 population'].sum()

major = mdates.YearLocator(2)
minor = mdates.MonthLocator()

plt.figure(figsize=(7, 7))
x = data.index
y = data.values

x1 = data1.index
y1 = data1.values


plt.xticks(rotation=70)

plt.ylim((0,3000))
plt.ylabel('Estimated number of deaths age standardised to 2008 population')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for Males and Females for IPF')
#plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500), 
              # arrowprops=dict(arrowstyle="->")) 
#plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000), 
              # arrowprops=dict(arrowstyle="->")) 
plt.grid(True)
p1, = plt.plot(x, y, 'k-', linewidth=2.0) 
p2, = plt.plot(x1, y1,'k--', linewidth=2.0) 

ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1, p2], ["Expected number of deaths (male)", "Expected number of deaths (female)"], loc='upper left', frameon=False)

plt.savefig('.././fig/IPF mortality trends in England and Wales for Males and Females.png')



In [6]:
df_male = df[(df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')]

grp = df_male[df_male['Cause'] == 'IPF'].groupby('Year')
data = grp['Estimated deaths age standardised to 2008 population'].sum()


grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] < pd.to_datetime('2001'))].groupby('Year') 
data1 = grp['Estimated deaths age standardised to 2008 population'].sum()#all meso

grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] > pd.to_datetime('2000'))].groupby('Year') 
data2 = grp['Estimated deaths age standardised to 2008 population'].sum()#all meso

grp = df_male[df_male['Cause'] == 'Asbestosis'].groupby('Year')
data3 = grp['Estimated deaths age standardised to 2008 population'].sum()

#grp = df_male[df_male['Cause'] == 'Mesothelioma_other'].groupby('Year') #all non pulmonary
#data = grp['Estimated deaths age standardised to 2008 population'].sum() 

#grp = df_male[df_male['Cause'] == 'Mesothelioma'].groupby('Year') #pulmonary
#data1 = grp['Estimated deaths age standardised to 2008 population'].sum()


#make an (ugly) graph a la Navaratnam
#grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Male')].groupby('Year')
#data = grp['Estimated deaths age standardised to 2008 population'].sum()

#grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Female')].groupby('Year')
#data1 = grp['Estimated deaths age standardised to 2008 population'].sum()

plt.figure(figsize=(7, 7))
x = data.index
y = data.values

x1 = data1.index
y1 = data1.values

x2 = data2.index
y2 = data2.values

x3 = data3.index
y3 = data3.values

major = mdates.YearLocator(2)
minor = mdates.MonthLocator()
plt.xticks(rotation=70)

plt.ylim((0,3000))
plt.ylabel('Estimated number of deaths age standardised to 2008 population')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for Males \n for IPF, Mesothelioma, and Asbestosis')
plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 500), (pd.to_datetime('1976'), 1000), 
               arrowprops=dict(arrowstyle="->")) 
plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 1500), (pd.to_datetime('1997'), 2000), 
               arrowprops=dict(arrowstyle="->")) 
plt.grid(True)
p1, = plt.plot(x, y, 'k-', linewidth=2.0) 
p2, = plt.plot(x1, y1,'b--', linewidth=2.0) 
p3, = plt.plot(x2, y2, 'b-', linewidth=2.0) 
p4, = plt.plot(x3, y3, 'r-', linewidth=2.0) 

ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1, p2, p3, p4], ["Idiopathic Pulmonary Fibrosis", "Mesothelioma (pre ICD-10)", "Mesothelioma (ICD-10)", "Asbestosis"], loc='upper left', frameon=False)
plt.savefig('.././fig/IPF mortality trends in England and Wales for Males for IPF, Mesothelioma, and Asbestosis.png')



In [7]:
df_male = df[(df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')]

grp = df_male[df_male['Cause'] == 'IPF'].groupby('Year')
data = DataFrame(grp['Estimated deaths age standardised to 2008 population'].sum())
data = data.pct_change()


grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] < pd.to_datetime('2001'))].groupby('Year') 
data1 = DataFrame(grp['Estimated deaths age standardised to 2008 population'].sum())#all meso
data1 = data1.pct_change()


grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] > pd.to_datetime('2000'))].groupby('Year') 
data2 = DataFrame(grp['Estimated deaths age standardised to 2008 population'].sum())#all meso
data2 = data2.pct_change()



grp = df_male[df_male['Cause'] == 'Asbestosis'].groupby('Year')
data3 = DataFrame(grp['Estimated deaths age standardised to 2008 population'].sum())
data3 = data3.pct_change()



#grp = df_male[df_male['Cause'] == 'Mesothelioma_other'].groupby('Year') #all non pulmonary
#data = grp['Estimated deaths age standardised to 2008 population'].sum() 

#grp = df_male[df_male['Cause'] == 'Mesothelioma'].groupby('Year') #pulmonary
#data1 = grp['Estimated deaths age standardised to 2008 population'].sum()


#make an (ugly) graph a la Navaratnam
#grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Male')].groupby('Year')
#data = grp['Estimated deaths age standardised to 2008 population'].sum()

#grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Female')].groupby('Year')
#data1 = grp['Estimated deaths age standardised to 2008 population'].sum()

In [8]:
df1 = DataFrame(data1)

In [9]:
data.values


Out[9]:
array([[        nan],
       [ 0.0293261 ],
       [ 0.10409772],
       [-0.14198728],
       [ 0.04576732],
       [ 0.15554248],
       [ 0.07613366],
       [ 0.12430156],
       [ 0.10774118],
       [ 0.01907658],
       [ 0.26515021],
       [ 0.12604908],
       [-0.00271688],
       [ 0.0195477 ],
       [ 0.03745367],
       [-0.01545349],
       [ 0.12985665],
       [-0.0236159 ],
       [ 0.03681614],
       [ 0.14705037],
       [ 0.00837537],
       [-0.01525441],
       [ 0.10592986],
       [ 0.07317156],
       [ 0.03895109],
       [ 0.03361469],
       [ 0.03121786],
       [ 0.04545206],
       [ 0.02945093],
       [ 0.05483315],
       [ 0.02212873],
       [ 0.00171588],
       [ 0.00684932],
       [ 0.12112405],
       [ 0.01568766],
       [-0.00456942],
       [ 0.08870428],
       [ 0.12925826],
       [ 0.01619492]])

In [10]:
np.corrcoef(data.values, data1.values)[0, 1]


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/numpy/lib/function_base.py:2487: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/numpy/lib/function_base.py:2496: RuntimeWarning: divide by zero encountered in double_scalars
  c *= 1. / np.float64(fact)
/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/numpy/lib/function_base.py:2496: RuntimeWarning: invalid value encountered in multiply
  c *= 1. / np.float64(fact)
Out[10]:
nan

In [12]:
print (data.mean())
print (data1.mean())
print (data2.mean())
print (data3.mean())


Estimated deaths age standardised to 2008 population    0.053762
dtype: float64
Estimated deaths age standardised to 2008 population    0.092655
dtype: float64
Estimated deaths age standardised to 2008 population    0.022435
dtype: float64
Estimated deaths age standardised to 2008 population    0.094559
dtype: float64

In [13]:
plt.figure(figsize=(15, 10))
x = data.index
y = data.values * 100

x1 = data1.index
y1 = data1.values * 100

x2 = data2.index
y2 = data2.values * 100

x3 = data3.index
y3 = data3.values * 100

major = mdates.YearLocator(1)
minor = mdates.MonthLocator()
plt.xticks(rotation=70)

#plt.ylim((0,100))
plt.ylabel('Percent change')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for Males \n for IPF, Mesothelioma, and Asbestosis')
#plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500), 
              # arrowprops=dict(arrowstyle="->")) 
#plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000), 
              # arrowprops=dict(arrowstyle="->")) 
plt.grid(True)
p1, = plt.plot(x, y, 'k-', linewidth=2.0) 
p2, = plt.plot(x1, y1,'b--', linewidth=2.0) 
p3, = plt.plot(x2, y2, 'b-', linewidth=2.0) 
p4, = plt.plot(x3, y3, 'r-', linewidth=2.0) 

ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1, p2, p3, p4], ["Idiopathic Pulmonary Fibrosis", "Mesothelioma (pre ICD-10)", "Mesothelioma (ICD-10)", "Asbestosis"], loc='upper left', frameon=False)

plt.savefig('.././fig/IPF mortality trends in England and Wales for Males for IPF, Mesothelioma, and Asbestosis.png')


Lets make mortality rate with confidence intervals as per navaratum


In [14]:
import numpy as np
import scipy as sp
import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * sp.stats.t._ppf((1+confidence)/2., n-1)
    #eturn m, m-h, m+h
    return '%.2f (%.2f - %.2f)' % (m, m-h, m+h)

In [15]:
#for a given period and cause this function calculates the national period population (in person-years), number of deaths 
#in the period and the standardised (to a 2008 population) number of deaths using our data by rolling up the region numbers
#and years


def year_analysis(period, cause):
    perpop = []
    perdeath = []
    perstddeath = []
    
    for i, item in enumerate(period):
        regpop = df[(df['Agegroup'] == 'ALL AGES') & (df['Year'] == pd.to_datetime(period[i])) & (df['Cause'] == cause)].groupby('Region').Population.sum()
        natpop = sum(regpop)
        perpop.append(natpop)
        
        regdeath = df[(df['Agegroup'] == 'ALL AGES') & (df['Year'] == pd.to_datetime(period[i])) & (df['Cause'] == cause)].groupby('Region')['Deaths'].sum()
        natdeath = sum(regdeath)
        perdeath.append(natdeath)  
        
        regstddeath = df[(df['Agegroup'] == 'ALL AGES') & (df['Year'] == pd.to_datetime(period[i])) & (df['Cause'] == cause)].groupby('Region')['Estimated deaths age standardised to 2008 population'].sum()
        natstddeath = sum(regstddeath)
        perstddeath.append(natstddeath)  
        
    n_years = len(period)
    
    period = ('%s - %s' % (period[0], period[n_years - 1]))
    
    deaths = sum(perdeath)
    
    person_years = ('%i' % (sum(perpop) / 1000000))
     
    standardised_deaths = ('%i' % sum(perstddeath))
    
    stdperrate = [(d / 54454723) * 100000 for d in perstddeath] #54454723 is 2008 nat pop
    
    stdperrate = mean_confidence_interval(stdperrate)
    
    return period, deaths, person_years, standardised_deaths, stdperrate
    
    
#corrected meso is a special case...


def year_analysis2(period, cause):
    perpop = []
    perdeath = []
    perstddeath = []
    
    for i, item in enumerate(period):
        regpop = df[(df['Agegroup'] == 'ALL AGES') & (df['Year'] == pd.to_datetime(period[i])) & (df['Cause'] == cause)].groupby('Region').Population.sum()
        natpop = sum(regpop)
        perpop.append(natpop)
        
        regdeath = df[(df['Agegroup'] == 'ALL AGES') & (df['Year'] == pd.to_datetime(period[i])) & (df['Cause'] == cause)].groupby('Region')['Deaths'].sum()
        natdeath = sum(regdeath)
        perdeath.append(natdeath)  
        
        regstddeath = df[(df['Agegroup'] == 'ALL AGES') & (df['Year'] == pd.to_datetime(period[i])) & (df['Cause'] == cause)].groupby('Region')['Corrected Meso Deaths'].sum()
        natstddeath = sum(regstddeath)
        perstddeath.append(natstddeath)  
        
    n_years = len(period)
    
    period = ('%s - %s' % (period[0], period[n_years - 1]))
    
    deaths = sum(perdeath)
    
    person_years = ('%i' % (sum(perpop) / 1000000))
     
    standardised_deaths = ('%i' % sum(perstddeath))
    
    stdperrate = [(d / 54454723) * 100000 for d in perstddeath] #54454723 is 2008 nat pop
    
    stdperrate = mean_confidence_interval(stdperrate)
    
    return period, deaths, person_years, standardised_deaths, stdperrate

In [16]:
periods = {}
periods[0] = ['1979', '1980', '1981', '1982', '1983']
periods[1] = ['1984', '1985', '1986', '1987', '1988']
periods[2] = ['1989', '1990', '1991', '1992', '1993', '1994']
periods[3] = ['1995', '1996', '1997', '1998', '1999', '2000']
periods[4] = ['2001', '2002', '2003', '2004']
periods[5] = ['2005', '2006', '2007', '2008']
periods[6] = ['2009', '2010', '2011', '2012']

ipf_results = []
asbestos_results = []
all_meso_results = []
cor_meso_results = []

for i, item in enumerate(periods):
    ipf_results.append(year_analysis(periods[i], 'IPF'))
    asbestos_results.append(year_analysis(periods[i], 'Asbestosis'))
    all_meso_results.append(year_analysis(periods[i], 'All Mesothelioma'))
    
    
dfipf = pd.DataFrame(ipf_results, columns=['Period', 'Deaths', 'Person years (million)', 'Standardised Deaths (2008 population)', 'Standardised mortality rate per 100,000 (95% CI)'])
dfasb = pd.DataFrame(asbestos_results, columns=['Period', 'Deaths', 'Person years (million)', 'Standardised Deaths (2008 population)', 'Standardised mortality rate per 100,000 (95% CI)'])
dfmes1 = pd.DataFrame(all_meso_results, columns=['Period', 'Deaths', 'Person years (million)', 'Standardised Deaths (2008 population)', 'Standardised mortality rate per 100,000 (95% CI)'])

In [17]:
dfmes1 = dfmes1[dfmes1['Deaths'] != 0] #throw away rows we lack data for
dfmes = pd.concat([dfmes1]) #combine 
dfmes = dfmes.sort() #sort


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(....) is deprecated, use sort_index(.....)
  app.launch_new_instance()

In [18]:
dfipf


Out[18]:
Period Deaths Person years (million) Standardised Deaths (2008 population) Standardised mortality rate per 100,000 (95% CI)
0 1979 - 1983 3846.0 247 4029 1.48 (1.22 - 1.74)
1 1984 - 1988 5851.0 249 6378 2.34 (2.19 - 2.50)
2 1989 - 1994 8584.0 304 9192 2.81 (2.51 - 3.12)
3 1995 - 2000 11548.0 310 12162 3.72 (3.35 - 4.10)
4 2001 - 2004 9597.0 210 9917 4.55 (4.15 - 4.96)
5 2005 - 2008 11089.0 215 11193 5.14 (4.60 - 5.67)
6 2009 - 2012 13770.0 222 13470 6.18 (5.02 - 7.35)

In [19]:
dfasb


Out[19]:
Period Deaths Person years (million) Standardised Deaths (2008 population) Standardised mortality rate per 100,000 (95% CI)
0 1979 - 1983 172.0 247 170 0.06 (0.04 - 0.09)
1 1984 - 1988 208.0 249 227 0.08 (0.07 - 0.10)
2 1989 - 1994 325.0 304 349 0.11 (0.10 - 0.12)
3 1995 - 2000 348.0 310 367 0.11 (0.10 - 0.12)
4 2001 - 2004 364.0 210 376 0.17 (0.13 - 0.22)
5 2005 - 2008 495.0 215 500 0.23 (0.20 - 0.26)
6 2009 - 2012 723.0 222 708 0.33 (0.28 - 0.37)

In [20]:
dfmes


Out[20]:
Period Deaths Person years (million) Standardised Deaths (2008 population) Standardised mortality rate per 100,000 (95% CI)
0 1979 - 1983 2446.223216 247 2540 0.93 (0.72 - 1.15)
1 1984 - 1988 3192.247558 249 3491 1.28 (1.17 - 1.40)
2 1989 - 1994 5453.305493 304 5875 1.80 (1.39 - 2.21)
3 1995 - 2000 8816.424768 310 9298 2.85 (2.42 - 3.27)
4 2001 - 2004 6502.000000 210 6741 3.10 (3.01 - 3.18)
5 2005 - 2008 7368.000000 215 7448 3.42 (3.20 - 3.64)
6 2009 - 2012 8474.000000 222 8290 3.81 (3.68 - 3.93)

In [21]:
df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')].Deaths.sum()


Out[21]:
41249.0

In [22]:
df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')].Deaths.sum()


Out[22]:
25992.0

In [23]:
df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')].Population.sum() / 1000000


Out[23]:
980.56322699999998

In [24]:
df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')].Population.sum() / 1000000


Out[24]:
1028.579778

In [25]:
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')].groupby('Year')
grp = grp['Estimated deaths age standardised to 2008 population'].sum()
grp = DataFrame(grp).reset_index()
grp['Estimated deaths age standardised to 2008 population'].map(lambda x: (x/54454723) * 100000).mean()


Out[25]:
2.006538435815437

In [26]:
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')].groupby('Year')
grp = grp['Estimated deaths age standardised to 2008 population'].sum()
grp = DataFrame(grp).reset_index()
grp['Estimated deaths age standardised to 2008 population'].map(lambda x: (x/54454723) * 100000).mean()


Out[26]:
1.250705869329551

In [27]:
2 / 1.25


Out[27]:
1.6

In [28]:
grp = df[(df['Cause'] == 'IPF')].groupby('Agegroup')['Rate per 100,000 (standardised)'].mean()
grp.sort()
grp


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  from ipykernel import kernelapp as app
Out[28]:
Agegroup
UNDER 25     0.021286
25-34        0.048154
35-44        0.164044
45-54        0.731656
55-64        3.363908
ALL AGES     3.445875
65-74       11.624223
75-84       26.663022
85+         37.749784
Name: Rate per 100,000 (standardised), dtype: float64

In [29]:
grp = df[(df['Cause'] == 'IPF')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
grp.sort()
grp


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  from ipykernel import kernelapp as app
Out[29]:
Region
EAST                         7.976114
SOUTH EAST                   8.312335
LONDON                       8.707275
YORKSHIRE AND THE HUMBER     8.908680
SOUTH WEST                   9.190130
EAST MIDLANDS                9.333159
WEST MIDLANDS                9.459794
NORTH EAST                  10.289851
WALES                       10.495049
NORTH WEST                  10.707523
Name: Rate per 100,000 (standardised), dtype: float64

In [30]:
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
grp.sort()
grp


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  from ipykernel import kernelapp as app
Out[30]:
Region
EAST                        4.737318
SOUTH EAST                  4.900534
YORKSHIRE AND THE HUMBER    5.053850
LONDON                      5.411803
SOUTH WEST                  5.640149
WEST MIDLANDS               5.641900
NORTH EAST                  5.692070
EAST MIDLANDS               5.746785
WALES                       6.168030
NORTH WEST                  6.214628
Name: Rate per 100,000 (standardised), dtype: float64

In [31]:
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
grp.sort()
grp


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  from ipykernel import kernelapp as app
Out[31]:
Region
EAST                        11.214911
SOUTH EAST                  11.724135
LONDON                      12.002747
SOUTH WEST                  12.740111
YORKSHIRE AND THE HUMBER    12.763510
EAST MIDLANDS               12.919534
WEST MIDLANDS               13.277687
WALES                       14.822068
NORTH EAST                  14.887633
NORTH WEST                  15.200417
Name: Rate per 100,000 (standardised), dtype: float64

In [32]:
grp = df[(df['Cause'] == 'Asbestosis') & (df['Sex'] == 'Male')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
grp.sort()
grp


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  from ipykernel import kernelapp as app
Out[32]:
Region
EAST                        0.547326
WALES                       0.643478
WEST MIDLANDS               0.760362
YORKSHIRE AND THE HUMBER    0.802830
SOUTH EAST                  0.872112
EAST MIDLANDS               0.960083
SOUTH WEST                  1.098402
LONDON                      1.316267
NORTH WEST                  1.622810
NORTH EAST                  3.072687
Name: Rate per 100,000 (standardised), dtype: float64

In [33]:
grp = df[(df['Cause'] == 'All Mesothelioma') & (df['Sex'] == 'Male')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
grp.sort()
grp


/home/drcjar/.virtualenvs/litsearch2/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  from ipykernel import kernelapp as app
Out[33]:
Region
WALES                        6.184247
EAST MIDLANDS                6.482373
WEST MIDLANDS                6.525374
SOUTH WEST                   7.618154
EAST                         7.861819
LONDON                       8.220906
YORKSHIRE AND THE HUMBER     8.646171
NORTH WEST                   8.965068
SOUTH EAST                   9.205414
NORTH EAST                  17.543943
Name: Rate per 100,000 (standardised), dtype: float64

In [34]:
df.head()


Out[34]:
Region Agegroup Deaths Sex Year Cause Population 2008 population Rate per 100,000 population Estimated deaths age standardised to 2008 population Rate per 100,000 (standardised)
Year
1974-01-01 NORTH EAST ALL AGES 26.0 Male 1974-01-01 IPF 1536100.0 1256065.0 1.692598 21.260133 1.692598
1974-01-01 NORTH EAST ALL AGES 1.0 Male 1974-01-01 Asbestosis 1536100.0 1256065.0 0.065100 0.817697 0.065100
1974-01-01 NORTH EAST ALL AGES 12.5 Male 1974-01-01 All Mesothelioma 1536100.0 1256065.0 0.813749 10.221218 0.813749
1974-01-01 NORTH EAST UNDER 25 0.0 Male 1974-01-01 IPF 615500.0 405899.0 0.000000 0.000000 0.000000
1974-01-01 NORTH EAST 25-34 0.0 Male 1974-01-01 IPF 203300.0 149126.0 0.000000 0.000000 0.000000

In [35]:
def makepictures(age):
    
    print(age)

    df_male = df[(df['Sex'] == 'Male') & (df['Agegroup'] == age)]

    grp = df_male[df_male['Cause'] == 'IPF'].groupby('Year')
    data = grp['Estimated deaths age standardised to 2008 population'].sum()


    grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] < pd.to_datetime('2001'))].groupby('Year') 
    data1 = grp['Estimated deaths age standardised to 2008 population'].sum()#all meso

    grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] > pd.to_datetime('2000'))].groupby('Year') 
    data2 = grp['Estimated deaths age standardised to 2008 population'].sum()#all meso

    grp = df_male[df_male['Cause'] == 'Asbestosis'].groupby('Year')
    data3 = grp['Estimated deaths age standardised to 2008 population'].sum()
  
    #grp = df_male[df_male['Cause'] == 'Mesothelioma_other'].groupby('Year') #all non pulmonary
    #data = grp['Estimated deaths age standardised to 2008 population'].sum() 

    #grp = df_male[df_male['Cause'] == 'Mesothelioma'].groupby('Year') #pulmonary
    #data1 = grp['Estimated deaths age standardised to 2008 population'].sum()
    
    #make an (ugly) graph a la Navaratnam
    #grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Male')].groupby('Year')
    #data = grp['Estimated deaths age standardised to 2008 population'].sum()

    #grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Female')].groupby('Year')
    #data1 = grp['Estimated deaths age standardised to 2008 population'].sum()

    plt.figure(figsize=(7, 7))
    x = data.index
    y = data.values

    x1 = data1.index
    y1 = data1.values

    x2 = data2.index
    y2 = data2.values

    x3 = data3.index
    y3 = data3.values

    major = mdates.YearLocator(2)
    minor = mdates.MonthLocator()
    plt.xticks(rotation=70)

    plt.ylim((0,1000))
    plt.ylabel('Estimated number of deaths age standardised to 2008 population')
    plt.xlabel('Year')
    plt.title('Mortality trends in England and Wales for Males Age %s \n for IPF, Mesothelioma, and Asbestosis' % age)
    #plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500), 
              # arrowprops=dict(arrowstyle="->")) 
    #plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000), 
              # arrowprops=dict(arrowstyle="->")) 
    plt.grid(True)
    p1, = plt.plot(x, y, 'k-', linewidth=2.0) 
    p2, = plt.plot(x1, y1,'b--', linewidth=2.0) 
    p3, = plt.plot(x2, y2, 'b-', linewidth=2.0) 
    p4, = plt.plot(x3, y3, 'r-', linewidth=2.0) 

    ax = plt.gca()
    ax.xaxis.set_major_locator(major)
    ax.legend([p1, p2, p3, p4], ["Idiopathic Pulmonary Fibrosis", "Mesothelioma (pre ICD-10)", "Mesothelioma (ICD-10)", "Asbestosis"], loc='upper left', frameon=False)

    plt.savefig('.././fig/IPF mortality trends in England and Wales for Males Age %s for IPF, Mesothelioma, and Asbestosis.png' % age)

In [61]:
ipfdeaths = (df[(df.Year == '2012') & (df.Cause == 'IPF') & (df.Agegroup == 'ALL AGES')].Deaths.sum())
print ('{} IPF deaths in 2012 in England and Wales'. format(ipfdeaths))

totaldeaths =  499331 # in 2012 from ons
print ('{} total deaths in England and Wales'.format(499331))

percentipfdeaths = ipfdeaths / totaldeaths * 100
print ('{} IPF deaths as percent of total deaths in England and Wales'.format(percentipfdeaths))


3902.0 IPF deaths in 2012 in England and Wales
499331 total deaths in England and Wales
0.7814455741782504 IPF deaths as percent of total deaths in England and Wales