Herein we make usable mortality data for IPF, mesothelioma, and asbestosis which is stratified by age, sex, and region.


In [1]:
import pandas as pd
import xlrd 
import numpy as np

In [2]:
workbook = xlrd.open_workbook('.././data/eng/Lung disease tables FINAL.xlsx') #what ONS sent part one
print workbook.sheet_names()
df = pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', 'Contents') #lets load the contents worksheet
df.head() #lets look at the contents


[u'Contents', u'Table 1', u'Table 2', u'Table 3', u'Table 4', u'Table 5', u'Table 6']
Out[2]:
Contents Unnamed: 1
0 NaN NaN
1 Table 1 Number of deaths in each standard region of En...
2 Table2 Number of deaths in each standard region of En...
3 Table 3 Number of deaths in each standard region of En...
4 Table 4 Number of deaths in each region of England and...

5 rows × 2 columns


In [3]:
df.columns = ['Name', 'Desc'] #lets have Name and Description columns for the worksheets(
df.fillna('n/a', inplace=True) #get rid on NaNs
df = df[df['Name'].str.contains('Table')] #get rid of rows that aren't about our worksheets of interest
df['Name'] = df['Name'].str.replace('Table2', 'Table 2') #correct ONS typo
sheets = df.to_dict(outtype='records') #lets save our work as dict

In [4]:
#lets make a shorthand table-disease lookup
dis_lookup = {'Table 1':'IPF', 'Table 2':'Asbestosis', 'Table 3':'Pulmonary Mesothelioma', 
              'Table 4':'IPF', 'Table 5':'Asbestosis', 'Table 6':'Pulmonary Mesothelioma', 
              'Table 7':'All Mesothelioma'} 

#just a few transforms needed to make the ons data usable
transforms = {'YORKSHIRE AND THE HUMBER':'YORKSHIRE AND THE HUMBER',
              'YORKSHIRE & HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
              'YORKSHIRE & HUMBER':'YORKSHIRE AND THE HUMBER',
              'NORTH WEST':'NORTH WEST',
              'EAST MIDLANDS':'EAST MIDLANDS', 'WEST MIDLANDS':'WEST MIDLANDS',
              'WEST MIDLANDS (REGION)':'WEST MIDLANDS', 'EAST':'EAST', 
              'SOUTH EAST':'SOUTH EAST', 'SOUTH WEST':'SOUTH WEST', 'WALES':'WALES', 
              'NORTH EAST':'NORTH EAST', 'LONDON':'LONDON', 
              'EASTERN':'EAST', 'EAST OF ENGLAND':'EAST', 'EAST ANGLIA':'EAST', 'NORTH':'NORTH EAST', 
              'YORKSHIRE AND HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
              'ALL AGES':'ALL AGES', 'UNDER 25':'UNDER 25', '25-34':'25-34', '35-44':'35-44', 
              '45-54':'45-54', '55-64':'55-64', '65-74':'65-74', '75-84':'75-84', '85+':'85+',
               '<25':'UNDER 25', '25 TO 34':'25-34', '35 TO 44':'35-44','45 TO 54':'45-54', '55 TO 64':'55-64', 
               '65 TO 74':'65-74', '75 TO 84':'75-84'}

#what we're expecting
REGIONS = ['YORKSHIRE AND THE HUMBER', 'NORTH WEST','EAST MIDLANDS', 'WEST MIDLANDS', 
               'EAST', 'SOUTH EAST', 'SOUTH WEST', 'WALES', 'NORTH EAST', 'LONDON']

AGEGROUPS = ['ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']

SEX = ['Male', 'Female']

CAUSE = ['IPF', 'Asbestosis', 'Mesothelioma', 'Mesothelioma_other', 'Mesothelioma all']

YEARS = ['1974'] 
for year in range(1975, 2013):
    YEARS.append(str(year))
    
#this works because years are dupilcated in the dataset for women and pandas applies a .1 to indicate it's a duplicate
def sex(year):
    year = year % 1
    if year == 0.0:
        return 'Male'
    else:
        return 'Female'

In [5]:
#lets load the worksheets of interest into lists of dataframes using our dict
df_list = [pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', sheet['Name'], skiprows=4) for sheet in sheets]

df_list.append(pd.read_excel('.././data/eng/Mesothelioma table 2 FINAL.xls', 'Table 1', skiprows=3)) #add meso data

sheets.append({'Desc': u'Number of deaths in each region of England and Wales where the underlying cause was Mesothelioma (all types), deaths registered between 2001 and 2012.',
  'Name': u'Table 7'}) #add meso data

In [6]:
for i, item in enumerate(df_list):
    df_list[i].rename(columns={'Area of usual residence':'Region'}, inplace=True)
    df_list[i]['Region'] = df_list[i]['Region'].fillna(method='pad') #fix the first column
    df_list[i]['Region'] = df_list[i]['Region'].str.strip()
    df_list[i]['Region'] = df_list[i]['Region'].str.upper()
    df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)   
    df_list[i] = df_list[i][df_list[i]['Region'].notnull()] #throw away rows that don't contain a region

    df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.strip()
    df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.upper()
    df_list[i]['Agegroup'] = df_list[i]['Agegroup'].map(transforms.get)

    df_list[i] = df_list[i].dropna(axis=1, how='all') #throw away columns that don't contain data
    
    df_list[i] = pd.melt(df_list[i], id_vars=['Region', 'Agegroup'])
    
    df_list[i]['variable'] = df_list[i]['variable'].convert_objects(convert_numeric=True)
    
    df_list[i]['Sex'] = df_list[i]['variable'].map(sex)
    
    df_list[i].rename(columns={'variable':'Year'}, inplace=True)
    
    df_list[i].rename(columns={'value':'Deaths'}, inplace=True)
    
    df_list[i] = df_list[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
    
    df_list[i]['Cause'] = dis_lookup[sheets[i]['Name']]


-c:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
-c:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
-c:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead

In [7]:
df = pd.concat(df_list) #combine sheets 
df['Year'] = df['Year'].astype(int)
df['Year'] = pd.to_datetime(df['Year'], format='%Y')
ons1 = df #save result
#df['Country'] = 'England & Wales' #add a label for country

In [8]:
df.to_pickle('./../data/pickle/pypf.pkl')