In [1]:
import pandas as pd
from pandas import DataFrame
import pickle
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
%matplotlib inline
In [2]:
# Load the prepared mortality table written by the preprocessing step.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle produced by this project.
df = pd.read_pickle('.././data/pickle/pypf_prep.pkl')
# Index the rows by year, but leave the index unnamed. If the index were also
# called 'Year' while the 'Year' column exists, groupby('Year') becomes
# ambiguous and current pandas raises ValueError for every cell below.
df.index = df['Year']
df.index.name = None
#df1 = df #lets save this result
#df = df[df['Agegroup'] != 'ALL AGES'] #lets throw away all ages rows
In [3]:
# Graph in the style of Navaratnam 2011: yearly IPF deaths up to 2008.
# Keep the 'ALL AGES' rows for cause 'IPF' dated before 2009, then total the
# age-standardised death estimate within each year.
grp = df[(df['Cause'] == 'IPF') & (df['Year'] < pd.to_datetime('2009')) & (df['Agegroup'] == 'ALL AGES')].groupby('Year')
data = grp['Estimated deaths age standardised to 2008 population'].sum()
# Major x ticks every second year. `minor` is built but never applied (the
# set_minor_locator call near the end of this cell is commented out).
major = mdates.YearLocator(2)
minor = mdates.MonthLocator()
plt.figure(figsize=(7, 7))
x = data.index
y = data.values
# Axis cosmetics are set through the pyplot state machine before the line is drawn.
plt.xticks(rotation=70)
plt.ylim((0,3500))
plt.ylabel('Deaths')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for IPF')
# Labelled arrows for the two ICD revisions: the first tuple is the arrow tip,
# the second is where the label text is placed.
plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500),
arrowprops=dict(arrowstyle="->"))
plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000),
arrowprops=dict(arrowstyle="->"))
plt.grid(True)
# plt.plot returns a list of lines; unpack the single handle for the legend.
p1, = plt.plot(x, y, 'k--', linewidth=2.0)
ax = plt.gca()
ax.xaxis.set_major_locator(major)
# NOTE(review): the legend says "Expected" while the plotted column is the
# "Estimated deaths age standardised ..." series - confirm the intended wording.
ax.legend([p1], ["Expected number of deaths"], loc='upper left', frameon=False)
#ax.xaxis.set_minor_locator(minor)
# Write the figure to the project's fig directory.
plt.savefig('.././fig/IPF mortality trends in England and Wales to 2008.png')
In [4]:
# Graph in the style of Navaratnam 2011, without a year cutoff: every year
# present in df for cause 'IPF' / 'ALL AGES' is included, and the
# age-standardised death estimate is totalled within each year.
grp = df[(df['Cause'] == 'IPF') &(df['Agegroup'] == 'ALL AGES')].groupby('Year')
data = grp['Estimated deaths age standardised to 2008 population'].sum()
# Major x ticks every second year. `minor` is built but never applied (the
# set_minor_locator call near the end of this cell is commented out).
major = mdates.YearLocator(2)
minor = mdates.MonthLocator()
plt.figure(figsize=(7, 7))
x = data.index
y = data.values
# Axis cosmetics are set through the pyplot state machine before the line is drawn.
plt.xticks(rotation=70)
plt.ylim((0,4000))
plt.ylabel('Deaths')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for IPF')
# Labelled arrows for the two ICD revisions: the first tuple is the arrow tip,
# the second is where the label text is placed.
plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500),
arrowprops=dict(arrowstyle="->"))
plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000),
arrowprops=dict(arrowstyle="->"))
plt.grid(True)
# plt.plot returns a list of lines; unpack the single handle for the legend.
p1, = plt.plot(x, y, 'k--', linewidth=2.0)
ax = plt.gca()
ax.xaxis.set_major_locator(major)
# NOTE(review): the legend says "Expected" while the plotted column is the
# "Estimated deaths age standardised ..." series - confirm the intended wording.
ax.legend([p1], ["Expected number of deaths"], loc='upper left', frameon=False)
#ax.xaxis.set_minor_locator(minor)
# Write the figure to the project's fig directory.
plt.savefig('.././fig/IPF mortality trends in England and Wales to 2012.png')
In [5]:
# Yearly age-standardised IPF death estimates, split by sex ('ALL AGES' rows only).
# `data` holds the male series and `data1` the female series.
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')].groupby('Year')
data = grp['Estimated deaths age standardised to 2008 population'].sum()
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')].groupby('Year')
data1 = grp['Estimated deaths age standardised to 2008 population'].sum()
# Major x ticks every second year; `minor` is built but not used in this cell.
major = mdates.YearLocator(2)
minor = mdates.MonthLocator()
plt.figure(figsize=(7, 7))
x = data.index
y = data.values
x1 = data1.index
y1 = data1.values
# Axis cosmetics are set through the pyplot state machine before the lines are drawn.
plt.xticks(rotation=70)
plt.ylim((0,3000))
plt.ylabel('Estimated number of deaths age standardised to 2008 population')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for Males and Females for IPF')
# The ICD revision arrows are disabled for this figure.
#plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 1000), (pd.to_datetime('1976'), 1500),
# arrowprops=dict(arrowstyle="->"))
#plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 2500), (pd.to_datetime('1997'), 3000),
# arrowprops=dict(arrowstyle="->"))
plt.grid(True)
# Solid line = male, dashed line = female; keep both handles for the legend.
p1, = plt.plot(x, y, 'k-', linewidth=2.0)
p2, = plt.plot(x1, y1,'k--', linewidth=2.0)
ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1, p2], ["Expected number of deaths (male)", "Expected number of deaths (female)"], loc='upper left', frameon=False)
# Write the figure to the project's fig directory.
plt.savefig('.././fig/IPF mortality trends in England and Wales for Males and Females.png')
In [6]:
# Male, all-ages rows only; each series below is the yearly total of the
# age-standardised death estimate for one cause.
df_male = df[(df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')]
# data: IPF.
grp = df_male[df_male['Cause'] == 'IPF'].groupby('Year')
data = grp['Estimated deaths age standardised to 2008 population'].sum()
# data1 / data2: 'All Mesothelioma', split into years before 2001 and years
# after 2000 so the two stretches are drawn as separate lines.
grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] < pd.to_datetime('2001'))].groupby('Year')
data1 = grp['Estimated deaths age standardised to 2008 population'].sum()#all meso
grp = df_male[(df_male['Cause'] == 'All Mesothelioma') & (df_male['Year'] > pd.to_datetime('2000'))].groupby('Year')
data2 = grp['Estimated deaths age standardised to 2008 population'].sum()#all meso
# data3: Asbestosis.
grp = df_male[df_male['Cause'] == 'Asbestosis'].groupby('Year')
data3 = grp['Estimated deaths age standardised to 2008 population'].sum()
# Disabled experiments kept from earlier versions of this cell (they refer to
# a `dffinal6` frame that is not defined in the visible cells).
#grp = df_male[df_male['Cause'] == 'Mesothelioma_other'].groupby('Year') #all non pulmonary
#data = grp['Estimated deaths age standardised to 2008 population'].sum()
#grp = df_male[df_male['Cause'] == 'Mesothelioma'].groupby('Year') #pulmonary
#data1 = grp['Estimated deaths age standardised to 2008 population'].sum()
#make an (ugly) graph a la Navaratnam
#grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Male')].groupby('Year')
#data = grp['Estimated deaths age standardised to 2008 population'].sum()
#grp = dffinal6[(dffinal6['Cause'] == 'IPF') & (dffinal6['Sex'] == 'Female')].groupby('Year')
#data1 = grp['Estimated deaths age standardised to 2008 population'].sum()
plt.figure(figsize=(7, 7))
x = data.index
y = data.values
x1 = data1.index
y1 = data1.values
x2 = data2.index
y2 = data2.values
x3 = data3.index
y3 = data3.values
# Major x ticks every second year; `minor` is built but not used in this cell.
major = mdates.YearLocator(2)
minor = mdates.MonthLocator()
# Axis cosmetics are set through the pyplot state machine before the lines are drawn.
plt.xticks(rotation=70)
plt.ylim((0,3000))
plt.ylabel('Estimated number of deaths age standardised to 2008 population')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for Males \n for IPF, Mesothelioma, and Asbestosis')
# Labelled arrows for the two ICD revisions: the first tuple is the arrow tip,
# the second is where the label text is placed.
plt.annotate("ICD-9(1979)", (pd.to_datetime('1979'), 500), (pd.to_datetime('1976'), 1000),
arrowprops=dict(arrowstyle="->"))
plt.annotate("ICD-10(2000)", (pd.to_datetime('2000'), 1500), (pd.to_datetime('1997'), 2000),
arrowprops=dict(arrowstyle="->"))
plt.grid(True)
# Black = IPF, blue dashed / blue solid = mesothelioma before / after the split, red = asbestosis.
p1, = plt.plot(x, y, 'k-', linewidth=2.0)
p2, = plt.plot(x1, y1,'b--', linewidth=2.0)
p3, = plt.plot(x2, y2, 'b-', linewidth=2.0)
p4, = plt.plot(x3, y3, 'r-', linewidth=2.0)
ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1, p2, p3, p4], ["Idiopathic Pulmonary Fibrosis", "Mesothelioma (pre ICD-10)", "Mesothelioma (ICD-10)", "Asbestosis"], loc='upper left', frameon=False)
# Write the figure to the project's fig directory.
plt.savefig('.././fig/IPF mortality trends in England and Wales for Males for IPF, Mesothelioma, and Asbestosis.png')
In [7]:
# Year-on-year fractional change in the age-standardised death estimate for
# males (all ages), one single-column DataFrame per cause:
#   data  - IPF
#   data1 - All Mesothelioma, years before 2001
#   data2 - All Mesothelioma, years after 2000
#   data3 - Asbestosis
df_male = df[(df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')]
_std_deaths = 'Estimated deaths age standardised to 2008 population'
_male_meso = df_male[df_male['Cause'] == 'All Mesothelioma']

data = (
    df_male[df_male['Cause'] == 'IPF']
    .groupby('Year')[_std_deaths]
    .sum()
    .to_frame()
    .pct_change()
)
data1 = (
    _male_meso[_male_meso['Year'] < pd.to_datetime('2001')]
    .groupby('Year')[_std_deaths]
    .sum()
    .to_frame()
    .pct_change()
)
data2 = (
    _male_meso[_male_meso['Year'] > pd.to_datetime('2000')]
    .groupby('Year')[_std_deaths]
    .sum()
    .to_frame()
    .pct_change()
)
# `grp` is left bound to the asbestosis grouping, as before.
grp = df_male[df_male['Cause'] == 'Asbestosis'].groupby('Year')
data3 = grp[_std_deaths].sum().to_frame().pct_change()
In [8]:
# Wrap the pre-2001 mesothelioma change series in a DataFrame. `data1` is
# already a DataFrame at this point, and `df1` is not read by any later
# visible cell.
df1 = DataFrame(data1)
In [9]:
# Inspect the raw array behind `data` (year-on-year change in male IPF deaths).
data.values
Out[9]:
In [10]:
# Pearson correlation between the year-on-year changes in male IPF deaths
# (`data`) and male mesothelioma deaths before 2001 (`data1`).
# Both inputs are single-column frames whose first row is NaN (pct_change has
# no previous year) and which cover different year ranges. Passing their (n, 1)
# arrays straight to np.corrcoef treats every row as a separate variable and
# yields NaN, so pair the two series on their shared years, drop incomplete
# rows, and correlate the two 1-D vectors.
_paired_changes = pd.concat([data, data1], axis=1, join='inner').dropna()
np.corrcoef(_paired_changes.iloc[:, 0].values, _paired_changes.iloc[:, 1].values)[0, 1]
Out[10]:
In [12]:
# Mean year-on-year change for each cause, printed in the order
# IPF, mesothelioma (before 2001), mesothelioma (after 2000), asbestosis.
for _changes in (data, data1, data2, data3):
    print(_changes.mean())
In [13]:
# Plot the year-on-year percentage change in male deaths for IPF,
# mesothelioma (split at the 2000/2001 boundary) and asbestosis.
# `data` .. `data3` are the fractional changes computed earlier; multiply by
# 100 to express them as percentages.
plt.figure(figsize=(15, 10))
x = data.index
y = data.values * 100
x1 = data1.index
y1 = data1.values * 100
x2 = data2.index
y2 = data2.values * 100
x3 = data3.index
y3 = data3.values * 100
# One major x tick per year.
major = mdates.YearLocator(1)
# Axis cosmetics are set through the pyplot state machine before the lines are drawn.
plt.xticks(rotation=70)
plt.ylabel('Percent change')
plt.xlabel('Year')
plt.title('Mortality trends in England and Wales for Males \n for IPF, Mesothelioma, and Asbestosis')
plt.grid(True)
# Black = IPF, blue dashed / blue solid = mesothelioma before / after the split, red = asbestosis.
p1, = plt.plot(x, y, 'k-', linewidth=2.0)
p2, = plt.plot(x1, y1,'b--', linewidth=2.0)
p3, = plt.plot(x2, y2, 'b-', linewidth=2.0)
p4, = plt.plot(x3, y3, 'r-', linewidth=2.0)
ax = plt.gca()
ax.xaxis.set_major_locator(major)
ax.legend([p1, p2, p3, p4], ["Idiopathic Pulmonary Fibrosis", "Mesothelioma (pre ICD-10)", "Mesothelioma (ICD-10)", "Asbestosis"], loc='upper left', frameon=False)
# Save under its own name: the previous path was identical to the one used by
# the absolute-deaths figure for males, so this cell silently overwrote it.
plt.savefig('.././fig/IPF mortality trends in England and Wales for Males for IPF, Mesothelioma, and Asbestosis (percent change).png')
In [14]:
import numpy as np
import scipy as sp
import scipy.stats
def mean_confidence_interval(data, confidence=0.95):
    """Format the mean of `data` with a two-sided Student-t confidence interval.

    Parameters
    ----------
    data : sequence of numbers
        The sample values.
    confidence : float, default 0.95
        Confidence level of the interval.

    Returns
    -------
    str
        'mean (lower - upper)' with each number rounded to two decimals.
    """
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    # Half-width = standard error * t critical value with n - 1 degrees of
    # freedom. Use the public ppf(), which validates its arguments, instead of
    # the private _ppf() helper.
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return '%.2f (%.2f - %.2f)' % (m, m - h, m + h)
In [15]:
# National population used as the denominator for standardised rates
# (54454723 is the 2008 national population).
_NAT_POP_2008 = 54454723


def _period_summary(period, cause, std_col):
    """Roll the regional 'ALL AGES' rows for `cause` up to national totals over `period`.

    `std_col` names the column holding the standardised death count to total.
    Returns the tuple described in `year_analysis`.
    """
    yearly_pop = []
    yearly_deaths = []
    yearly_std_deaths = []
    # This part of the filter does not depend on the year, so build it once.
    all_ages_for_cause = (df['Agegroup'] == 'ALL AGES') & (df['Cause'] == cause)
    for year in period:
        rows = df[all_ages_for_cause & (df['Year'] == pd.to_datetime(year))]
        # Total within each region first, then across regions.
        by_region = rows.groupby('Region')
        yearly_pop.append(by_region['Population'].sum().sum())
        yearly_deaths.append(by_region['Deaths'].sum().sum())
        yearly_std_deaths.append(by_region[std_col].sum().sum())
    label = '%s - %s' % (period[0], period[len(period) - 1])
    deaths = sum(yearly_deaths)
    person_years = '%i' % (sum(yearly_pop) / 1000000)
    standardised_deaths = '%i' % sum(yearly_std_deaths)
    # One standardised rate per 100,000 for each year, summarised as mean (95% CI).
    yearly_rates = [(d / _NAT_POP_2008) * 100000 for d in yearly_std_deaths]
    return label, deaths, person_years, standardised_deaths, mean_confidence_interval(yearly_rates)


def year_analysis(period, cause):
    """Summarise one cause of death over a period of years.

    For the given period and cause this rolls up the region numbers and years
    in `df` to give the national period population (in person-years), the
    number of deaths in the period, and the number of deaths standardised to a
    2008 population.

    Parameters
    ----------
    period : list of str
        Years in the period, e.g. ['1979', '1980'].
    cause : str
        Value of the 'Cause' column to select.

    Returns
    -------
    tuple
        (period label 'first - last', deaths, person-years in millions as a
        string, standardised deaths as a string, 'mean (low - high)'
        standardised rate per 100,000).
    """
    return _period_summary(period, cause, 'Estimated deaths age standardised to 2008 population')


def year_analysis2(period, cause):
    """Same summary as `year_analysis`, for the special case of corrected mesothelioma.

    The standardised figures are taken from the 'Corrected Meso Deaths' column
    instead of the age-standardised estimate column.
    """
    return _period_summary(period, cause, 'Corrected Meso Deaths')
In [16]:
# Reporting periods, keyed 0..6, each a list of year strings.
periods = dict(enumerate([
    ['1979', '1980', '1981', '1982', '1983'],
    ['1984', '1985', '1986', '1987', '1988'],
    ['1989', '1990', '1991', '1992', '1993', '1994'],
    ['1995', '1996', '1997', '1998', '1999', '2000'],
    ['2001', '2002', '2003', '2004'],
    ['2005', '2006', '2007', '2008'],
    ['2009', '2010', '2011', '2012'],
]))

ipf_results = []
asbestos_results = []
all_meso_results = []
cor_meso_results = []

# One summary row per period and cause.
for _period_no in range(len(periods)):
    _years = periods[_period_no]
    ipf_results.append(year_analysis(_years, 'IPF'))
    asbestos_results.append(year_analysis(_years, 'Asbestosis'))
    all_meso_results.append(year_analysis(_years, 'All Mesothelioma'))

# The three tables share the same column layout.
_summary_columns = [
    'Period',
    'Deaths',
    'Person years (million)',
    'Standardised Deaths (2008 population)',
    'Standardised mortality rate per 100,000 (95% CI)',
]
dfipf = pd.DataFrame(ipf_results, columns=_summary_columns)
dfasb = pd.DataFrame(asbestos_results, columns=_summary_columns)
dfmes1 = pd.DataFrame(all_meso_results, columns=_summary_columns)
In [17]:
# Drop the periods for which no mesothelioma deaths are recorded (no data).
dfmes1 = dfmes1[dfmes1['Deaths'] != 0] #throw away rows we lack data for
# Combine and order by row label. DataFrame.sort() has been removed from
# pandas; with no arguments it sorted by index, so sort_index() is the
# equivalent.
dfmes = pd.concat([dfmes1]).sort_index()
In [18]:
# Display the per-period summary table for IPF.
dfipf
Out[18]:
In [19]:
# Display the per-period summary table for asbestosis.
dfasb
Out[19]:
In [20]:
# Display the per-period summary table for mesothelioma (periods with zero deaths removed).
dfmes
Out[20]:
In [21]:
# Total recorded IPF deaths among males, using the 'ALL AGES' rows only.
_male_ipf_all_ages = (df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')
df.loc[_male_ipf_all_ages, 'Deaths'].sum()
Out[21]:
In [22]:
# Total recorded IPF deaths among females, using the 'ALL AGES' rows only.
_female_ipf_all_ages = (df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')
df.loc[_female_ipf_all_ages, 'Deaths'].sum()
Out[22]:
In [23]:
# Summed male population over the IPF 'ALL AGES' rows, in millions.
_male_ipf_all_ages = (df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')
df.loc[_male_ipf_all_ages, 'Population'].sum() / 1000000
Out[23]:
In [24]:
# Summed female population over the IPF 'ALL AGES' rows, in millions.
_female_ipf_all_ages = (df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')
df.loc[_female_ipf_all_ages, 'Population'].sum() / 1000000
Out[24]:
In [25]:
# Mean yearly standardised IPF death rate per 100,000 for males:
# yearly standardised deaths divided by 54454723, times 100000, averaged over the years.
_male_ipf_rows = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male') & (df['Agegroup'] == 'ALL AGES')]
grp = (
    _male_ipf_rows
    .groupby('Year')['Estimated deaths age standardised to 2008 population']
    .sum()
    .to_frame()
    .reset_index()
)
((grp['Estimated deaths age standardised to 2008 population'] / 54454723) * 100000).mean()
Out[25]:
In [26]:
# Mean yearly standardised IPF death rate per 100,000 for females:
# yearly standardised deaths divided by 54454723, times 100000, averaged over the years.
_female_ipf_rows = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female') & (df['Agegroup'] == 'ALL AGES')]
grp = (
    _female_ipf_rows
    .groupby('Year')['Estimated deaths age standardised to 2008 population']
    .sum()
    .to_frame()
    .reset_index()
)
((grp['Estimated deaths age standardised to 2008 population'] / 54454723) * 100000).mean()
Out[26]:
In [27]:
# Scratch calculation with hand-typed numbers.
# NOTE(review): presumably the ratio of the male to the female mean rate from
# the two cells above - confirm, and derive it from the computed values.
2 / 1.25
Out[27]:
In [28]:
# Mean standardised IPF rate per 100,000 for each age group, lowest first.
grp = df[(df['Cause'] == 'IPF')].groupby('Agegroup')['Rate per 100,000 (standardised)'].mean()
# Series.sort() (an in-place sort by value) has been removed from pandas;
# sort_values() returns the sorted series, so rebind the name.
grp = grp.sort_values()
grp
Out[28]:
In [29]:
# Mean standardised IPF rate per 100,000 for each region, lowest first.
grp = df[(df['Cause'] == 'IPF')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
# Series.sort() (an in-place sort by value) has been removed from pandas;
# sort_values() returns the sorted series, so rebind the name.
grp = grp.sort_values()
grp
Out[29]:
In [30]:
# Mean standardised IPF rate per 100,000 for females in each region, lowest first.
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Female')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
# Series.sort() (an in-place sort by value) has been removed from pandas;
# sort_values() returns the sorted series, so rebind the name.
grp = grp.sort_values()
grp
Out[30]:
In [31]:
# Mean standardised IPF rate per 100,000 for males in each region, lowest first.
grp = df[(df['Cause'] == 'IPF') & (df['Sex'] == 'Male')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
# Series.sort() (an in-place sort by value) has been removed from pandas;
# sort_values() returns the sorted series, so rebind the name.
grp = grp.sort_values()
grp
Out[31]:
In [32]:
# Mean standardised asbestosis rate per 100,000 for males in each region, lowest first.
grp = df[(df['Cause'] == 'Asbestosis') & (df['Sex'] == 'Male')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
# Series.sort() (an in-place sort by value) has been removed from pandas;
# sort_values() returns the sorted series, so rebind the name.
grp = grp.sort_values()
grp
Out[32]:
In [33]:
# Mean standardised mesothelioma rate per 100,000 for males in each region, lowest first.
grp = df[(df['Cause'] == 'All Mesothelioma') & (df['Sex'] == 'Male')].groupby('Region')['Rate per 100,000 (standardised)'].mean()
# Series.sort() (an in-place sort by value) has been removed from pandas;
# sort_values() returns the sorted series, so rebind the name.
grp = grp.sort_values()
grp
Out[33]:
In [34]:
# Preview the first rows of the full mortality table.
df.head()
Out[34]:
In [35]:
def makepictures(age):
    """Plot and save male mortality trends for one age group.

    Draws the yearly age-standardised death estimates for IPF, mesothelioma
    (years before 2001 and years after 2000 as separate lines) and asbestosis
    for males whose 'Agegroup' equals `age`, then writes the figure to the fig
    directory with the age group in the file name.
    """
    print(age)
    std_col = 'Estimated deaths age standardised to 2008 population'
    males = df[(df['Sex'] == 'Male') & (df['Agegroup'] == age)]
    meso = males[males['Cause'] == 'All Mesothelioma']

    def yearly_total(rows):
        # Total the standardised death estimate within each year.
        return rows.groupby('Year')[std_col].sum()

    # (series, line format) in drawing order; the order also fixes the legend.
    curves = [
        (yearly_total(males[males['Cause'] == 'IPF']), 'k-'),
        (yearly_total(meso[meso['Year'] < pd.to_datetime('2001')]), 'b--'),
        (yearly_total(meso[meso['Year'] > pd.to_datetime('2000')]), 'b-'),
        (yearly_total(males[males['Cause'] == 'Asbestosis']), 'r-'),
    ]

    plt.figure(figsize=(7, 7))
    plt.xticks(rotation=70)
    plt.ylim((0,1000))
    plt.ylabel('Estimated number of deaths age standardised to 2008 population')
    plt.xlabel('Year')
    plt.title('Mortality trends in England and Wales for Males Age %s \n for IPF, Mesothelioma, and Asbestosis' % age)
    plt.grid(True)

    handles = []
    for series, fmt in curves:
        line, = plt.plot(series.index, series.values, fmt, linewidth=2.0)
        handles.append(line)

    axes = plt.gca()
    # Major x ticks every second year.
    axes.xaxis.set_major_locator(mdates.YearLocator(2))
    axes.legend(handles, ["Idiopathic Pulmonary Fibrosis", "Mesothelioma (pre ICD-10)", "Mesothelioma (ICD-10)", "Asbestosis"], loc='upper left', frameon=False)
    plt.savefig('.././fig/IPF mortality trends in England and Wales for Males Age %s for IPF, Mesothelioma, and Asbestosis.png' % age)
In [61]:
# IPF deaths in 2012 ('ALL AGES' rows) as a share of all deaths that year.
_ipf_2012_all_ages = (df.Year == '2012') & (df.Cause == 'IPF') & (df.Agegroup == 'ALL AGES')
ipfdeaths = df[_ipf_2012_all_ages].Deaths.sum()
print('{} IPF deaths in 2012 in England and Wales'.format(ipfdeaths))

totaldeaths = 499331 # in 2012 from ons
print('{} total deaths in England and Wales'.format(totaldeaths))

percentipfdeaths = ipfdeaths / totaldeaths * 100
print('{} IPF deaths as percent of total deaths in England and Wales'.format(percentipfdeaths))