Herein we make usable mortality data for IPF, mesothelioma, and asbestosis which is stratified by age, sex, and region.


In [15]:
import pandas as pd
import xlrd 
import numpy as np

In [16]:
workbook = xlrd.open_workbook('.././data/eng/Lung disease tables FINAL.xlsx') #what ONS sent part one
print workbook.sheet_names()
df = pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', 'Contents') #lets load the contents worksheet
df.head() #lets look at the contents


[u'Contents', u'Table 1', u'Table 2', u'Table 3', u'Table 4', u'Table 5', u'Table 6']
Out[16]:
Contents Unnamed: 1
0 NaN NaN
1 Table 1 Number of deaths in each standard region of En...
2 Table2 Number of deaths in each standard region of En...
3 Table 3 Number of deaths in each standard region of En...
4 Table 4 Number of deaths in each region of England and...

In [17]:
df.columns = ['Name', 'Desc'] #lets have Name and Description columns for the worksheets(
df.fillna('n/a', inplace=True) #get rid on NaNs
df = df[df['Name'].str.contains('Table')] #get rid of rows that aren't about our worksheets of interest
df['Name'] = df['Name'].str.replace('Table2', 'Table 2') #correct ONS typo
sheets = df.to_dict(orient='records') #lets save our work as dict

In [18]:
#lets make a shorthand table-disease lookup
dis_lookup = {'Table 1':'IPF', 'Table 2':'Asbestosis', 'Table 3':'Pulmonary Mesothelioma', 
              'Table 4':'IPF', 'Table 5':'Asbestosis', 'Table 6':'Pulmonary Mesothelioma', 
              'Table 7':'All Mesothelioma'} 

#just a few transforms needed to make the ons data usable
transforms = {'YORKSHIRE AND THE HUMBER':'YORKSHIRE AND THE HUMBER',
              'YORKSHIRE & HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
              'YORKSHIRE & HUMBER':'YORKSHIRE AND THE HUMBER',
              'NORTH WEST':'NORTH WEST',
              'EAST MIDLANDS':'EAST MIDLANDS', 'WEST MIDLANDS':'WEST MIDLANDS',
              'WEST MIDLANDS (REGION)':'WEST MIDLANDS', 'EAST':'EAST', 
              'SOUTH EAST':'SOUTH EAST', 'SOUTH WEST':'SOUTH WEST', 'WALES':'WALES', 
              'NORTH EAST':'NORTH EAST', 'LONDON':'LONDON', 
              'EASTERN':'EAST', 'EAST OF ENGLAND':'EAST', 'EAST ANGLIA':'EAST', 'NORTH':'NORTH EAST', 
              'YORKSHIRE AND HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
              'ALL AGES':'ALL AGES', 'UNDER 25':'UNDER 25', '25-34':'25-34', '35-44':'35-44', 
              '45-54':'45-54', '55-64':'55-64', '65-74':'65-74', '75-84':'75-84', '85+':'85+',
               '<25':'UNDER 25', '25 TO 34':'25-34', '35 TO 44':'35-44','45 TO 54':'45-54', '55 TO 64':'55-64', 
               '65 TO 74':'65-74', '75 TO 84':'75-84'}

#what we're expecting
REGIONS = ['YORKSHIRE AND THE HUMBER', 'NORTH WEST','EAST MIDLANDS', 'WEST MIDLANDS', 
               'EAST', 'SOUTH EAST', 'SOUTH WEST', 'WALES', 'NORTH EAST', 'LONDON']

AGEGROUPS = ['ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']

SEX = ['Male', 'Female']

CAUSE = ['IPF', 'Asbestosis', 'Mesothelioma', 'Mesothelioma_other', 'Mesothelioma all']

YEARS = ['1974'] 
for year in range(1975, 2013):
    YEARS.append(str(year))
    
#this works because years are dupilcated in the dataset for women and pandas applies a .1 to indicate it's a duplicate
#and men always appear first in our data

def sex(year):
    year = year % 1
    if year == 0.0:
        return 'Male'
    else:
        return 'Female'

In [19]:
#lets load the worksheets of interest into lists of dataframes using our dict
df_list = [pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', sheet['Name'], skiprows=4) for sheet in sheets]

df_list.append(pd.read_excel('.././data/eng/Mesothelioma table 2 FINAL.xls', 'Table 1', skiprows=3)) #add meso data

sheets.append({'Desc': u'Number of deaths in each region of England and Wales where the underlying cause was Mesothelioma (all types), deaths registered between 2001 and 2012.',
  'Name': u'Table 7'}) #add meso data

In [20]:
for i, item in enumerate(df_list):
    df_list[i].rename(columns={'Area of usual residence':'Region'}, inplace=True)
    df_list[i]['Region'] = df_list[i]['Region'].fillna(method='pad') #fix the first column
    df_list[i]['Region'] = df_list[i]['Region'].str.strip()
    df_list[i]['Region'] = df_list[i]['Region'].str.upper()
    df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)   
    df_list[i] = df_list[i][df_list[i]['Region'].notnull()] #throw away rows that don't contain a region

    df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.strip()
    df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.upper()
    df_list[i]['Agegroup'] = df_list[i]['Agegroup'].map(transforms.get)

    df_list[i] = df_list[i].dropna(axis=1, how='all') #throw away columns that don't contain data
    
    df_list[i] = pd.melt(df_list[i], id_vars=['Region', 'Agegroup'])
    
    df_list[i]['variable'] = df_list[i]['variable'].convert_objects(convert_numeric=True)
    
    df_list[i]['Sex'] = df_list[i]['variable'].map(sex)
    
    df_list[i]['variable'] = df_list[i]['variable'].astype(int) #convert year to int
    
    df_list[i].rename(columns={'variable':'Year'}, inplace=True)
    
    df_list[i].rename(columns={'value':'Deaths'}, inplace=True)
    
    df_list[i] = df_list[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
    
    df_list[i]['Cause'] = dis_lookup[sheets[i]['Name']]


/home/drcjar/.virtualenvs/pypf/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/drcjar/.virtualenvs/pypf/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/drcjar/.virtualenvs/pypf/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/drcjar/.virtualenvs/pypf/lib/python2.7/site-packages/ipykernel/__main__.py:17: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.

In [21]:
mesocorrection = pd.read_csv('.././data/eng/mesocorrection.csv')
mesocorrection.Year = mesocorrection.Year.astype(int)
mesocorrection.index = mesocorrection.Year
mesocorrection_lookup = mesocorrection['Meso correction factor'].to_dict()
#df['Deaths'] = (df['Deaths'] / df['Year'].map(lambda x: mesocorrection_lookup[x])) * 100

In [22]:
#get corrective factor from year using mesocorrection lookup
#apply corrective factor to convert Pulmonary Mesothelioma Deaths to All Mesothelioma
#rename to All Mesothelioma to reflect conversion
#throw away data for 2001 onwards because we'll use table 7 for that

df_list[2]['Deaths'] = (df_list[2]['Deaths'] / df_list[2]['Year'].map(mesocorrection_lookup.get)) * 100
df_list[5]['Deaths'] = (df_list[5]['Deaths'] / df_list[5]['Year'].map(mesocorrection_lookup.get)) * 100

df_list[2]['Cause'] = df_list[2]['Cause'].str.replace('Pulmonary Mesothelioma', 'All Mesothelioma')
df_list[5]['Cause'] = df_list[5]['Cause'].str.replace('Pulmonary Mesothelioma', 'All Mesothelioma')

df_list[2] = df_list[2][df_list[2]['Year'] < 2001]
df_list[5] = df_list[5][df_list[5]['Year'] < 2001]

In [23]:
df = pd.concat(df_list) #combine sheets 
df['Year'] = df['Year'].astype(int)
df['Year'] = pd.to_datetime(df['Year'], format='%Y')
ons1 = df #save result
#df['Country'] = 'England & Wales' #add a label for country

In [24]:
df.to_pickle('./../data/pickle/pypf.pkl')

In [ ]:


In [ ]: