In [1]:
import pandas as pd
import xlrd
import numpy as np
In [2]:
workbook = xlrd.open_workbook('.././data/eng/Lung disease tables FINAL.xlsx') #what ONS sent part one
print workbook.sheet_names()
df = pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', 'Contents') #lets load the contents worksheet
df.head() #lets look at the contents
Out[2]:
In [3]:
df.columns = ['Name', 'Desc'] #lets have Name and Description columns for the worksheets(
df.fillna('n/a', inplace=True) #get rid on NaNs
df = df[df['Name'].str.contains('Table')] #get rid of rows that aren't about our worksheets of interest
df['Name'] = df['Name'].str.replace('Table2', 'Table 2') #correct ONS typo
sheets = df.to_dict(outtype='records') #lets save our work as dict
In [4]:
#lets make a shorthand table-disease lookup
dis_lookup = {'Table 1':'IPF', 'Table 2':'Asbestosis', 'Table 3':'Pulmonary Mesothelioma',
'Table 4':'IPF', 'Table 5':'Asbestosis', 'Table 6':'Pulmonary Mesothelioma',
'Table 7':'All Mesothelioma'}
#just a few transforms needed to make the ons data usable
transforms = {'YORKSHIRE AND THE HUMBER':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBER':'YORKSHIRE AND THE HUMBER',
'NORTH WEST':'NORTH WEST',
'EAST MIDLANDS':'EAST MIDLANDS', 'WEST MIDLANDS':'WEST MIDLANDS',
'WEST MIDLANDS (REGION)':'WEST MIDLANDS', 'EAST':'EAST',
'SOUTH EAST':'SOUTH EAST', 'SOUTH WEST':'SOUTH WEST', 'WALES':'WALES',
'NORTH EAST':'NORTH EAST', 'LONDON':'LONDON',
'EASTERN':'EAST', 'EAST OF ENGLAND':'EAST', 'EAST ANGLIA':'EAST', 'NORTH':'NORTH EAST',
'YORKSHIRE AND HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
'ALL AGES':'ALL AGES', 'UNDER 25':'UNDER 25', '25-34':'25-34', '35-44':'35-44',
'45-54':'45-54', '55-64':'55-64', '65-74':'65-74', '75-84':'75-84', '85+':'85+',
'<25':'UNDER 25', '25 TO 34':'25-34', '35 TO 44':'35-44','45 TO 54':'45-54', '55 TO 64':'55-64',
'65 TO 74':'65-74', '75 TO 84':'75-84'}
#what we're expecting
REGIONS = ['YORKSHIRE AND THE HUMBER', 'NORTH WEST','EAST MIDLANDS', 'WEST MIDLANDS',
'EAST', 'SOUTH EAST', 'SOUTH WEST', 'WALES', 'NORTH EAST', 'LONDON']
AGEGROUPS = ['ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']
SEX = ['Male', 'Female']
CAUSE = ['IPF', 'Asbestosis', 'Mesothelioma', 'Mesothelioma_other', 'Mesothelioma all']
YEARS = ['1974']
for year in range(1975, 2013):
YEARS.append(str(year))
#this works because years are dupilcated in the dataset for women and pandas applies a .1 to indicate it's a duplicate
def sex(year):
year = year % 1
if year == 0.0:
return 'Male'
else:
return 'Female'
In [5]:
#lets load the worksheets of interest into lists of dataframes using our dict
df_list = [pd.read_excel('.././data/eng/Lung disease tables FINAL.xlsx', sheet['Name'], skiprows=4) for sheet in sheets]
df_list.append(pd.read_excel('.././data/eng/Mesothelioma table 2 FINAL.xls', 'Table 1', skiprows=3)) #add meso data
sheets.append({'Desc': u'Number of deaths in each region of England and Wales where the underlying cause was Mesothelioma (all types), deaths registered between 2001 and 2012.',
'Name': u'Table 7'}) #add meso data
In [6]:
for i, item in enumerate(df_list):
df_list[i].rename(columns={'Area of usual residence':'Region'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].fillna(method='pad') #fix the first column
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i] = df_list[i][df_list[i]['Region'].notnull()] #throw away rows that don't contain a region
df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.strip()
df_list[i]['Agegroup'] = df_list[i]['Agegroup'].str.upper()
df_list[i]['Agegroup'] = df_list[i]['Agegroup'].map(transforms.get)
df_list[i] = df_list[i].dropna(axis=1, how='all') #throw away columns that don't contain data
df_list[i] = pd.melt(df_list[i], id_vars=['Region', 'Agegroup'])
df_list[i]['variable'] = df_list[i]['variable'].convert_objects(convert_numeric=True)
df_list[i]['Sex'] = df_list[i]['variable'].map(sex)
df_list[i].rename(columns={'variable':'Year'}, inplace=True)
df_list[i].rename(columns={'value':'Deaths'}, inplace=True)
df_list[i] = df_list[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
df_list[i]['Cause'] = dis_lookup[sheets[i]['Name']]
In [7]:
df = pd.concat(df_list) #combine sheets
df['Year'] = df['Year'].astype(int)
df['Year'] = pd.to_datetime(df['Year'], format='%Y')
ons1 = df #save result
#df['Country'] = 'England & Wales' #add a label for country
In [8]:
df.to_pickle('./../data/pickle/pypf.pkl')