In [1]:
import pandas as pd
import xlrd
import numpy as np
In [2]:
workbook = xlrd.open_workbook('./data/eng/Lung disease tables FINAL.xlsx') #what ONS sent part one
print workbook.sheet_names()
df = pd.read_excel('./data/eng/Lung disease tables FINAL.xlsx', 'Contents') #lets load the contents worksheet
df.head() #lets look at the contents
Out[2]:
In [3]:
df.columns = ['Name', 'Desc'] #lets have Name and Description columns for the worksheets
df.fillna('n/a', inplace=True) #get rid on NaNs
df = df[df['Name'].str.contains('Table')] #get rid of rows that aren't about our worksheets of interest
df['Name'] = df['Name'].str.replace('Table2', 'Table 2') #correct ONS typo
sheets = df.to_dict(outtype='records') #lets save our work as dict
In [4]:
#lets make a shorthand table-disease lookup
dis_lookup = {'Table 1':'IPF', 'Table 2':'Asbestosis', 'Table 3':'Pulmonary Mesothelioma',
'Table 4':'IPF', 'Table 5':'Asbestosis', 'Table 6':'Pulmonary Mesothelioma'}
#a few transforms needed to make the ons data usable
transforms = {'YORKSHIRE AND THE HUMBER':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBER':'YORKSHIRE AND THE HUMBER',
'NORTH WEST':'NORTH WEST',
'EAST MIDLANDS':'EAST MIDLANDS', 'WEST MIDLANDS':'WEST MIDLANDS',
'WEST MIDLANDS (REGION)':'WEST MIDLANDS', 'EAST':'EAST',
'SOUTH EAST':'SOUTH EAST', 'SOUTH WEST':'SOUTH WEST', 'WALES':'WALES',
'NORTH EAST':'NORTH EAST', 'LONDON':'LONDON',
'EASTERN':'EAST', 'EAST OF ENGLAND':'EAST', 'EAST ANGLIA':'EAST', 'NORTH':'NORTH EAST',
'YORKSHIRE AND HUMBERSIDE':'YORKSHIRE AND THE HUMBER'}
sex_transforms_74_80 = {'Unnamed: 2':'Male', 'Unnamed: 3':'Male', 'Unnamed: 4':'Male', 'Unnamed: 5':'Male',
'Unnamed: 6':'Male', 'Unnamed: 7':'Male', 'Unnamed: 8':'Male', 'Unnamed: 10':'Female',
'Unnamed: 11':'Female', 'Unnamed: 12':'Female', 'Unnamed: 13':'Female', 'Unnamed: 14':'Female',
'Unnamed: 15':'Female', 'Unnamed: 16':'Female'}
sex_transforms_81_2012 = {'Unnamed: 2':'Male', 'Unnamed: 3':'Male', 'Unnamed: 4':'Male', 'Unnamed: 5':'Male',
'Unnamed: 6':'Male', 'Unnamed: 7':'Male', 'Unnamed: 8':'Male', 'Unnamed: 9':'Male',
'Unnamed: 10':'Male', 'Unnamed: 11':'Male', 'Unnamed: 12':'Male', 'Unnamed: 13':'Male',
'Unnamed: 14':'Male', 'Unnamed: 15':'Male', 'Unnamed: 16':'Male', 'Unnamed: 17':'Male',
'Unnamed: 18':'Male', 'Unnamed: 19':'Male', 'Unnamed: 20':'Male', 'Unnamed: 21':'Male',
'Unnamed: 22':'Male', 'Unnamed: 23':'Male', 'Unnamed: 24':'Male', 'Unnamed: 25':'Male',
'Unnamed: 26':'Male', 'Unnamed: 27':'Male', 'Unnamed: 28':'Male', 'Unnamed: 29':'Male',
'Unnamed: 30':'Male', 'Unnamed: 31':'Male', 'Unnamed: 32':'Male', 'Unnamed: 33':'Male',
'Unnamed: 35':'Female', 'Unnamed: 36':'Female', 'Unnamed: 37':'Female', 'Unnamed: 38':'Female',
'Unnamed: 39':'Female', 'Unnamed: 40':'Female', 'Unnamed: 41':'Female', 'Unnamed: 42':'Female',
'Unnamed: 43':'Female', 'Unnamed: 44':'Female', 'Unnamed: 45':'Female', 'Unnamed: 46':'Female',
'Unnamed: 47':'Female', 'Unnamed: 48':'Female', 'Unnamed: 49':'Female', 'Unnamed: 50':'Female',
'Unnamed: 51':'Female', 'Unnamed: 52':'Female', 'Unnamed: 53':'Female', 'Unnamed: 54':'Female',
'Unnamed: 55':'Female', 'Unnamed: 56':'Female', 'Unnamed: 57':'Female', 'Unnamed: 58':'Female',
'Unnamed: 59':'Female', 'Unnamed: 60':'Female', 'Unnamed: 61':'Female', 'Unnamed: 62':'Female',
'Unnamed: 63':'Female', 'Unnamed: 64':'Female', 'Unnamed: 65':'Female', 'Unnamed: 66':'Female'}
year_transforms_74_80 = {'Unnamed: 2':1974, 'Unnamed: 3':1975, 'Unnamed: 4':1976, 'Unnamed: 5':1977,
'Unnamed: 6':1978, 'Unnamed: 7':1979, 'Unnamed: 8':1980, 'Unnamed: 10':1974,
'Unnamed: 11':1975, 'Unnamed: 12':1976, 'Unnamed: 13':1977, 'Unnamed: 14':1978,
'Unnamed: 15':1979, 'Unnamed: 16':1980}
year_transforms_81_2012 = {'Unnamed: 2':1981, 'Unnamed: 3':1982, 'Unnamed: 4':1983, 'Unnamed: 5':1984,
'Unnamed: 6':1985, 'Unnamed: 7':1986, 'Unnamed: 8':1987, 'Unnamed: 9':1988,
'Unnamed: 10':1989, 'Unnamed: 11':1990, 'Unnamed: 12':1991, 'Unnamed: 13':1992,
'Unnamed: 14':1993, 'Unnamed: 15':1994, 'Unnamed: 16':1995, 'Unnamed: 17':1996,
'Unnamed: 18':1997, 'Unnamed: 19':1998, 'Unnamed: 20':1999, 'Unnamed: 21':2000,
'Unnamed: 22':2001, 'Unnamed: 23':2002, 'Unnamed: 24':2003, 'Unnamed: 25':2004,
'Unnamed: 26':2005, 'Unnamed: 27':2006, 'Unnamed: 28':2007, 'Unnamed: 29':2008,
'Unnamed: 30':2009, 'Unnamed: 31':2010, 'Unnamed: 32':2011, 'Unnamed: 33':2012,
'Unnamed: 35':1981, 'Unnamed: 36':1982, 'Unnamed: 37':1983, 'Unnamed: 38':1984,
'Unnamed: 39':1985, 'Unnamed: 40':1986, 'Unnamed: 41':1987, 'Unnamed: 42':1988,
'Unnamed: 43':1989, 'Unnamed: 44':1990, 'Unnamed: 45':1991, 'Unnamed: 46':1992,
'Unnamed: 47':1993, 'Unnamed: 48':1994, 'Unnamed: 49':1995, 'Unnamed: 50':1996,
'Unnamed: 51':1997, 'Unnamed: 52':1998, 'Unnamed: 53':1999, 'Unnamed: 54':2000,
'Unnamed: 55':2001, 'Unnamed: 56':2002, 'Unnamed: 57':2003, 'Unnamed: 58':2004,
'Unnamed: 59':2005, 'Unnamed: 60':2006, 'Unnamed: 61':2007, 'Unnamed: 62':2008,
'Unnamed: 63':2009, 'Unnamed: 64':2010, 'Unnamed: 65':2011, 'Unnamed: 66':2012}
#what we're expecting
REGIONS = ['YORKSHIRE AND THE HUMBER', 'NORTH WEST','EAST MIDLANDS', 'WEST MIDLANDS',
'EAST', 'SOUTH EAST', 'SOUTH WEST', 'WALES', 'NORTH EAST', 'LONDON']
AGEGROUPS = ['ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']
SEX = ['Male', 'Female']
UNDERLYING_CAUSE = ['IPF', 'Asbestosis', 'Mesothelioma', 'Mesothelioma_other', 'Mesothelioma all']
YEARS = ['1974']
for year in range(1975, 2013):
YEARS.append(str(year))
In [5]:
#lets load the worksheets of interest into lists of dataframes using our dict
df_list1 = [pd.read_excel('./data/eng/Lung disease tables FINAL.xlsx', sheet['Name']) for sheet in sheets] #74-80
df_list2 = [pd.read_excel('./data/eng/Lung disease tables FINAL.xlsx', sheet['Name']) for sheet in sheets] #81-2012
In [6]:
for i, item in enumerate(df_list1):
df_list1[i].rename(columns={'Contents Page':'Region'}, inplace=True)
df_list1[i]['Region'] = df_list1[i].icol(0).fillna(method='pad') #fix the first column
df_list1[i]['Region'] = df_list1[i]['Region'].str.strip()
df_list1[i]['Region'] = df_list1[i]['Region'].str.upper()
df_list1[i]['Region'] = df_list1[i]['Region'].map(transforms.get)
df_list1[i] = df_list1[i][df_list1[i]['Region'].notnull()] #throw away rows that don't contain a region
df_list1[i].rename(columns={'Unnamed: 1':'Agegroup'}, inplace=True)
df_list1[i]['Agegroup'] = df_list1[i]['Agegroup'].str.strip()
df_list1[i]['Agegroup'] = df_list1[i]['Agegroup'].str.upper()
df_list1[i] = df_list1[i].dropna(axis=1, how='all') #throw away columns that don't contain data
df_list1[i] = pd.melt(df_list1[i], id_vars=['Region', 'Agegroup'])
df_list1[i]['Sex'] = df_list1[i].variable.map(sex_transforms_74_80.get)
df_list1[i]['Year'] = df_list1[i].variable.map(year_transforms_74_80.get)
df_list1[i].rename(columns={'value':'Deaths'}, inplace=True)
df_list1[i] = df_list1[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
df_list1[i]['Underlying Cause'] = dis_lookup[sheets[i]['Name']]
In [7]:
for i, item in enumerate(df_list2):
df_list2[i].rename(columns={'Contents Page':'Region'}, inplace=True)
df_list2[i]['Region'] = df_list2[i].icol(0).fillna(method='pad') #fix the first column
df_list2[i]['Region'] = df_list2[i]['Region'].str.strip()
df_list2[i]['Region'] = df_list2[i]['Region'].str.upper()
df_list2[i]['Region'] = df_list2[i]['Region'].map(transforms.get)
df_list2[i] = df_list2[i][df_list2[i]['Region'].notnull()] #throw away rows that don't contain a region
df_list2[i].rename(columns={'Unnamed: 1':'Agegroup'}, inplace=True)
df_list2[i]['Agegroup'] = df_list2[i]['Agegroup'].str.strip()
df_list2[i]['Agegroup'] = df_list2[i]['Agegroup'].str.upper()
df_list2[i] = df_list2[i].dropna(axis=1, how='all') #throw away columns that don't contain data
df_list2[i] = pd.melt(df_list2[i], id_vars=['Region', 'Agegroup'])
df_list2[i]['Sex'] = df_list2[i].variable.map(sex_transforms_81_2012.get)
df_list2[i]['Year'] = df_list2[i].variable.map(year_transforms_81_2012.get)
df_list2[i].rename(columns={'value':'Deaths'}, inplace=True)
df_list2[i] = df_list2[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
df_list2[i]['Underlying Cause'] = dis_lookup[sheets[i]['Name']]
In [8]:
df1 = pd.concat(df_list1) #combine sheets 1-3 for 1974-1980
df2 = pd.concat(df_list2) #combine sheets 4-6 for 1981-2012
df_list = [df1, df2]
df = pd.concat(df_list) #combine sheets 1-3 and 4-6
ons1 = df #save result
#df['Country'] = 'England & Wales' #add a label for country
In [9]:
df.tail()
Out[9]:
In [10]:
#df.Year = pd.to_datetime(df.Year.astype(str))
In [11]:
#df.index = df.Year
In [12]:
#df.groupby('Year').Deaths.sum().plot()
In [13]:
#load meso 2 table
In [14]:
workbook = xlrd.open_workbook('./data/eng/Mesothelioma table 2 FINAL.xls') #what ONS sent part two
print workbook.sheet_names()
In [15]:
df = pd.read_excel('./data/eng/Mesothelioma table 2 FINAL.xls', 'Table 1', skiprows=3)
df.head()
Out[15]:
In [19]:
df_list3 = [df]
#this works because years are dupilcated in the dataset for women and pandas applies a .1 to indicate it's a duplicate
def sex(year):
year = year % 1
if year == 0.0:
return 'male'
else:
return 'female'
for i, item in enumerate(df_list3):
df_list3[i].rename(columns={'Area of usual residence':'Region'}, inplace=True)
df_list3[i]['Region'] = df_list3[i].icol(0).fillna(method='pad') #fix the first column
df_list3[i]['Region'] = df_list3[i]['Region'].str.strip()
df_list3[i]['Region'] = df_list3[i]['Region'].str.upper()
df_list3[i]['Region'] = df_list3[i]['Region'].map(transforms.get)
df_list3[i] = df_list3[i][df_list3[i]['Region'].notnull()] #throw away rows that don't contain a region
df_list3[i].rename(columns={'Unnamed: 1':'Agegroup'}, inplace=True)
df_list3[i]['Agegroup'] = df_list3[i]['Agegroup'].str.strip()
df_list3[i]['Agegroup'] = df_list3[i]['Agegroup'].str.upper()
df_list3[i] = df_list3[i].dropna(axis=1, how='all') #throw away columns that don't contain data
df_list3[i] = pd.melt(df_list3[i], id_vars=['Region', 'Agegroup'])
df_list3[i]['variable'] = df_list3[i]['variable'].convert_objects(convert_numeric=True)
df_list3[i]['Sex'] = df_list3[i]['variable'].map(sex)
df_list3[i].rename(columns={'variable':'Year'}, inplace=True)
df_list3[i].rename(columns={'value':'Deaths'}, inplace=True)
df_list3[i] = df_list3[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
df_list3[i]['Underlying Cause'] = dis_lookup[sheets[i]['Name']]
In [ ]: