In [15]:
import pandas as pd
import xlrd
import numpy as np
In [16]:
workbook = xlrd.open_workbook('.././data/eng/Lung disease tables FINAL.xlsx') #what ONS sent part one
print workbook.sheet_names()
df = pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', 'Contents') #lets load the contents worksheet
df.head() #lets look at the contents
Out[16]:
In [17]:
df.columns = ['Name', 'Desc'] #lets have Name and Description columns for the worksheets(
df.fillna('n/a', inplace=True) #get rid on NaNs
df = df[df['Name'].str.contains('Table')] #get rid of rows that aren't about our worksheets of interest
df['Name'] = df['Name'].str.replace('Table2', 'Table 2') #correct ONS typo
sheets = df.to_dict(orient='records') #lets save our work as dict
In [18]:
#lets make a shorthand table-disease lookup
dis_lookup = {'Table 1':'IPF', 'Table 2':'Asbestosis', 'Table 3':'Pulmonary Mesothelioma',
'Table 4':'IPF', 'Table 5':'Asbestosis', 'Table 6':'Pulmonary Mesothelioma',
'Table 7':'All Mesothelioma'}
#just a few transforms needed to make the ons data usable
transforms = {'YORKSHIRE AND THE HUMBER':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBER':'YORKSHIRE AND THE HUMBER',
'NORTH WEST':'NORTH WEST',
'EAST MIDLANDS':'EAST MIDLANDS', 'WEST MIDLANDS':'WEST MIDLANDS',
'WEST MIDLANDS (REGION)':'WEST MIDLANDS', 'EAST':'EAST',
'SOUTH EAST':'SOUTH EAST', 'SOUTH WEST':'SOUTH WEST', 'WALES':'WALES',
'NORTH EAST':'NORTH EAST', 'LONDON':'LONDON',
'EASTERN':'EAST', 'EAST OF ENGLAND':'EAST', 'EAST ANGLIA':'EAST', 'NORTH':'NORTH EAST',
'YORKSHIRE AND HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
'ALL AGES':'ALL AGES', 'UNDER 25':'UNDER 25', '25-34':'25-34', '35-44':'35-44',
'45-54':'45-54', '55-64':'55-64', '65-74':'65-74', '75-84':'75-84', '85+':'85+',
'<25':'UNDER 25', '25 TO 34':'25-34', '35 TO 44':'35-44','45 TO 54':'45-54', '55 TO 64':'55-64',
'65 TO 74':'65-74', '75 TO 84':'75-84'}
#what we're expecting
REGIONS = ['YORKSHIRE AND THE HUMBER', 'NORTH WEST','EAST MIDLANDS', 'WEST MIDLANDS',
'EAST', 'SOUTH EAST', 'SOUTH WEST', 'WALES', 'NORTH EAST', 'LONDON']
AGEGROUPS = ['ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']
SEX = ['Male', 'Female']
CAUSE = ['IPF', 'Asbestosis', 'Mesothelioma', 'Mesothelioma_other', 'Mesothelioma all']
YEARS = ['1974']
for year in range(1975, 2013):
YEARS.append(str(year))
#this works because years are dupilcated in the dataset for women and pandas applies a .1 to indicate it's a duplicate
#and men always appear first in our data
def sex(year):
year = year % 1
if year == 0.0:
return 'Male'
else:
return 'Female'
In [19]:
#lets load the worksheets of interest into lists of dataframes using our dict
df_list = [pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', sheet['Name'], skiprows=4) for sheet in sheets]
df_list.append(pd.read_excel('.././data/eng/Mesothelioma table 2 FINAL.xls', 'Table 1', skiprows=3)) #add meso data
sheets.append({'Desc': u'Number of deaths in each region of England and Wales where the underlying cause was Mesothelioma (all types), deaths registered between 2001 and 2012.',
'Name': u'Table 7'}) #add meso data
In [20]:
for i, item in enumerate(df_list):
df_list[i].rename(columns={'Area of usual residence':'Region'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].fillna(method='pad') #fix the first column
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i] = df_list[i][df_list[i]['Region'].notnull()] #throw away rows that don't contain a region
df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.strip()
df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.upper()
df_list[i]['Agegroup'] = df_list[i]['Agegroup'].map(transforms.get)
df_list[i] = df_list[i].dropna(axis=1, how='all') #throw away columns that don't contain data
df_list[i] = pd.melt(df_list[i], id_vars=['Region', 'Agegroup'])
df_list[i]['variable'] = df_list[i]['variable'].convert_objects(convert_numeric=True)
df_list[i]['Sex'] = df_list[i]['variable'].map(sex)
df_list[i]['variable'] = df_list[i]['variable'].astype(int) #convert year to int
df_list[i].rename(columns={'variable':'Year'}, inplace=True)
df_list[i].rename(columns={'value':'Deaths'}, inplace=True)
df_list[i] = df_list[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
df_list[i]['Cause'] = dis_lookup[sheets[i]['Name']]
In [21]:
mesocorrection = pd.read_csv('.././data/eng/mesocorrection.csv')
mesocorrection.Year = mesocorrection.Year.astype(int)
mesocorrection.index = mesocorrection.Year
mesocorrection_lookup = mesocorrection['Meso correction factor'].to_dict()
#df['Deaths'] = (df['Deaths'] / df['Year'].map(lambda x: mesocorrection_lookup[x])) * 100
In [22]:
#get corrective factor from year using mesocorrection lookup
#apply corrective factor to convert Pulmonary Mesothelioma Deaths to All Mesothelioma
#rename to All Mesothelioma to reflect conversion
#throw away data for 2001 onwards because we'll use table 7 for that
df_list[2]['Deaths'] = (df_list[2]['Deaths'] / df_list[2]['Year'].map(mesocorrection_lookup.get)) * 100
df_list[5]['Deaths'] = (df_list[5]['Deaths'] / df_list[5]['Year'].map(mesocorrection_lookup.get)) * 100
df_list[2]['Cause'] = df_list[2]['Cause'].str.replace('Pulmonary Mesothelioma', 'All Mesothelioma')
df_list[5]['Cause'] = df_list[5]['Cause'].str.replace('Pulmonary Mesothelioma', 'All Mesothelioma')
df_list[2] = df_list[2][df_list[2]['Year'] < 2001]
df_list[5] = df_list[5][df_list[5]['Year'] < 2001]
In [23]:
df = pd.concat(df_list) #combine sheets
df['Year'] = df['Year'].astype(int)
df['Year'] = pd.to_datetime(df['Year'], format='%Y')
ons1 = df #save result
#df['Country'] = 'England & Wales' #add a label for country
In [24]:
df.to_pickle('./../data/pickle/pypf.pkl')
In [ ]:
In [ ]: