In [1]:
import pandas as pd
import xlrd 
import numpy as np

In [2]:
workbook = xlrd.open_workbook('./data/eng/Lung disease tables FINAL.xlsx') #what ONS sent part one
print workbook.sheet_names()
df = pd.read_excel('./data/eng/Lung disease tables FINAL.xlsx', 'Contents') #lets load the contents worksheet
df.head() #lets look at the contents


[u'Contents', u'Table 1', u'Table 2', u'Table 3', u'Table 4', u'Table 5', u'Table 6']
Out[2]:
Contents Unnamed: 1
0 NaN NaN
1 Table 1 Number of deaths in each standard region of En...
2 Table2 Number of deaths in each standard region of En...
3 Table 3 Number of deaths in each standard region of En...
4 Table 4 Number of deaths in each region of England and...

5 rows × 2 columns


In [3]:
df.columns = ['Name', 'Desc'] #lets have Name and Description columns for the worksheets
df.fillna('n/a', inplace=True) #get rid on NaNs
df = df[df['Name'].str.contains('Table')] #get rid of rows that aren't about our worksheets of interest
df['Name'] = df['Name'].str.replace('Table2', 'Table 2') #correct ONS typo
sheets = df.to_dict(outtype='records') #lets save our work as dict

In [4]:
#lets make a shorthand table-disease lookup
dis_lookup = {'Table 1':'IPF', 'Table 2':'Asbestosis', 'Table 3':'Pulmonary Mesothelioma', 
              'Table 4':'IPF', 'Table 5':'Asbestosis', 'Table 6':'Pulmonary Mesothelioma'} 

#a few transforms needed to make the ons data usable
transforms = {'YORKSHIRE AND THE HUMBER':'YORKSHIRE AND THE HUMBER',
              'YORKSHIRE & HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
              'YORKSHIRE & HUMBER':'YORKSHIRE AND THE HUMBER',
              'NORTH WEST':'NORTH WEST',
              'EAST MIDLANDS':'EAST MIDLANDS', 'WEST MIDLANDS':'WEST MIDLANDS',
              'WEST MIDLANDS (REGION)':'WEST MIDLANDS', 'EAST':'EAST', 
              'SOUTH EAST':'SOUTH EAST', 'SOUTH WEST':'SOUTH WEST', 'WALES':'WALES', 
              'NORTH EAST':'NORTH EAST', 'LONDON':'LONDON', 
              'EASTERN':'EAST', 'EAST OF ENGLAND':'EAST', 'EAST ANGLIA':'EAST', 'NORTH':'NORTH EAST', 
              'YORKSHIRE AND HUMBERSIDE':'YORKSHIRE AND THE HUMBER'}

sex_transforms_74_80 = {'Unnamed: 2':'Male', 'Unnamed: 3':'Male', 'Unnamed: 4':'Male', 'Unnamed: 5':'Male',
       'Unnamed: 6':'Male', 'Unnamed: 7':'Male', 'Unnamed: 8':'Male', 'Unnamed: 10':'Female',
       'Unnamed: 11':'Female', 'Unnamed: 12':'Female', 'Unnamed: 13':'Female', 'Unnamed: 14':'Female',
       'Unnamed: 15':'Female', 'Unnamed: 16':'Female'}

sex_transforms_81_2012 = {'Unnamed: 2':'Male', 'Unnamed: 3':'Male', 'Unnamed: 4':'Male', 'Unnamed: 5':'Male',
       'Unnamed: 6':'Male', 'Unnamed: 7':'Male', 'Unnamed: 8':'Male', 'Unnamed: 9':'Male',
       'Unnamed: 10':'Male', 'Unnamed: 11':'Male', 'Unnamed: 12':'Male', 'Unnamed: 13':'Male',
       'Unnamed: 14':'Male', 'Unnamed: 15':'Male', 'Unnamed: 16':'Male', 'Unnamed: 17':'Male',
       'Unnamed: 18':'Male', 'Unnamed: 19':'Male', 'Unnamed: 20':'Male', 'Unnamed: 21':'Male',
       'Unnamed: 22':'Male', 'Unnamed: 23':'Male', 'Unnamed: 24':'Male', 'Unnamed: 25':'Male',
       'Unnamed: 26':'Male', 'Unnamed: 27':'Male', 'Unnamed: 28':'Male', 'Unnamed: 29':'Male',
       'Unnamed: 30':'Male', 'Unnamed: 31':'Male', 'Unnamed: 32':'Male', 'Unnamed: 33':'Male',
       'Unnamed: 35':'Female', 'Unnamed: 36':'Female', 'Unnamed: 37':'Female', 'Unnamed: 38':'Female',
       'Unnamed: 39':'Female', 'Unnamed: 40':'Female', 'Unnamed: 41':'Female', 'Unnamed: 42':'Female',
       'Unnamed: 43':'Female', 'Unnamed: 44':'Female', 'Unnamed: 45':'Female', 'Unnamed: 46':'Female',
       'Unnamed: 47':'Female', 'Unnamed: 48':'Female', 'Unnamed: 49':'Female', 'Unnamed: 50':'Female',
       'Unnamed: 51':'Female', 'Unnamed: 52':'Female', 'Unnamed: 53':'Female', 'Unnamed: 54':'Female',
       'Unnamed: 55':'Female', 'Unnamed: 56':'Female', 'Unnamed: 57':'Female', 'Unnamed: 58':'Female',
       'Unnamed: 59':'Female', 'Unnamed: 60':'Female', 'Unnamed: 61':'Female', 'Unnamed: 62':'Female',
       'Unnamed: 63':'Female', 'Unnamed: 64':'Female', 'Unnamed: 65':'Female', 'Unnamed: 66':'Female'}

year_transforms_74_80 = {'Unnamed: 2':1974, 'Unnamed: 3':1975, 'Unnamed: 4':1976, 'Unnamed: 5':1977,
       'Unnamed: 6':1978, 'Unnamed: 7':1979, 'Unnamed: 8':1980, 'Unnamed: 10':1974,
       'Unnamed: 11':1975, 'Unnamed: 12':1976, 'Unnamed: 13':1977, 'Unnamed: 14':1978,
       'Unnamed: 15':1979, 'Unnamed: 16':1980}

year_transforms_81_2012 = {'Unnamed: 2':1981, 'Unnamed: 3':1982, 'Unnamed: 4':1983, 'Unnamed: 5':1984,
       'Unnamed: 6':1985, 'Unnamed: 7':1986, 'Unnamed: 8':1987, 'Unnamed: 9':1988,
       'Unnamed: 10':1989, 'Unnamed: 11':1990, 'Unnamed: 12':1991, 'Unnamed: 13':1992,
       'Unnamed: 14':1993, 'Unnamed: 15':1994, 'Unnamed: 16':1995, 'Unnamed: 17':1996,
       'Unnamed: 18':1997, 'Unnamed: 19':1998, 'Unnamed: 20':1999, 'Unnamed: 21':2000,
       'Unnamed: 22':2001, 'Unnamed: 23':2002, 'Unnamed: 24':2003, 'Unnamed: 25':2004,
       'Unnamed: 26':2005, 'Unnamed: 27':2006, 'Unnamed: 28':2007, 'Unnamed: 29':2008,
       'Unnamed: 30':2009, 'Unnamed: 31':2010, 'Unnamed: 32':2011, 'Unnamed: 33':2012,
       'Unnamed: 35':1981, 'Unnamed: 36':1982, 'Unnamed: 37':1983, 'Unnamed: 38':1984,
       'Unnamed: 39':1985, 'Unnamed: 40':1986, 'Unnamed: 41':1987, 'Unnamed: 42':1988,
       'Unnamed: 43':1989, 'Unnamed: 44':1990, 'Unnamed: 45':1991, 'Unnamed: 46':1992,
       'Unnamed: 47':1993, 'Unnamed: 48':1994, 'Unnamed: 49':1995, 'Unnamed: 50':1996,
       'Unnamed: 51':1997, 'Unnamed: 52':1998, 'Unnamed: 53':1999, 'Unnamed: 54':2000,
       'Unnamed: 55':2001, 'Unnamed: 56':2002, 'Unnamed: 57':2003, 'Unnamed: 58':2004,
       'Unnamed: 59':2005, 'Unnamed: 60':2006, 'Unnamed: 61':2007, 'Unnamed: 62':2008,
       'Unnamed: 63':2009, 'Unnamed: 64':2010, 'Unnamed: 65':2011, 'Unnamed: 66':2012}

#what we're expecting
REGIONS = ['YORKSHIRE AND THE HUMBER', 'NORTH WEST','EAST MIDLANDS', 'WEST MIDLANDS', 
               'EAST', 'SOUTH EAST', 'SOUTH WEST', 'WALES', 'NORTH EAST', 'LONDON']

AGEGROUPS = ['ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']

SEX = ['Male', 'Female']

UNDERLYING_CAUSE = ['IPF', 'Asbestosis', 'Mesothelioma', 'Mesothelioma_other', 'Mesothelioma all']

YEARS = ['1974'] 
for year in range(1975, 2013):
    YEARS.append(str(year))

In [5]:
#lets load the worksheets of interest into lists of dataframes using our dict
df_list1 = [pd.read_excel('./data/eng/Lung disease tables FINAL.xlsx', sheet['Name']) for sheet in sheets] #74-80
df_list2 = [pd.read_excel('./data/eng/Lung disease tables FINAL.xlsx', sheet['Name']) for sheet in sheets] #81-2012

In [6]:
for i, item in enumerate(df_list1):
    df_list1[i].rename(columns={'Contents Page':'Region'}, inplace=True)
    df_list1[i]['Region'] = df_list1[i].icol(0).fillna(method='pad') #fix the first column
    df_list1[i]['Region'] = df_list1[i]['Region'].str.strip()
    df_list1[i]['Region'] = df_list1[i]['Region'].str.upper()
    df_list1[i]['Region'] = df_list1[i]['Region'].map(transforms.get)   
    df_list1[i] = df_list1[i][df_list1[i]['Region'].notnull()] #throw away rows that don't contain a region

    df_list1[i].rename(columns={'Unnamed: 1':'Agegroup'}, inplace=True)
    df_list1[i]['Agegroup'] = df_list1[i]['Agegroup'].str.strip()
    df_list1[i]['Agegroup'] = df_list1[i]['Agegroup'].str.upper()

    df_list1[i] = df_list1[i].dropna(axis=1, how='all') #throw away columns that don't contain data
    
    df_list1[i] = pd.melt(df_list1[i], id_vars=['Region', 'Agegroup'])

    df_list1[i]['Sex'] = df_list1[i].variable.map(sex_transforms_74_80.get)
    df_list1[i]['Year'] = df_list1[i].variable.map(year_transforms_74_80.get)
    df_list1[i].rename(columns={'value':'Deaths'}, inplace=True)
    df_list1[i] = df_list1[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
    df_list1[i]['Underlying Cause'] = dis_lookup[sheets[i]['Name']]


/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py:2175: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame
  **kwargs)
-c:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
-c:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead

In [7]:
for i, item in enumerate(df_list2):
    df_list2[i].rename(columns={'Contents Page':'Region'}, inplace=True)
    df_list2[i]['Region'] = df_list2[i].icol(0).fillna(method='pad') #fix the first column
    df_list2[i]['Region'] = df_list2[i]['Region'].str.strip()
    df_list2[i]['Region'] = df_list2[i]['Region'].str.upper()
    df_list2[i]['Region'] = df_list2[i]['Region'].map(transforms.get)   
    df_list2[i] = df_list2[i][df_list2[i]['Region'].notnull()] #throw away rows that don't contain a region

    df_list2[i].rename(columns={'Unnamed: 1':'Agegroup'}, inplace=True)
    df_list2[i]['Agegroup'] = df_list2[i]['Agegroup'].str.strip()
    df_list2[i]['Agegroup'] = df_list2[i]['Agegroup'].str.upper()

    df_list2[i] = df_list2[i].dropna(axis=1, how='all') #throw away columns that don't contain data
    
    df_list2[i] = pd.melt(df_list2[i], id_vars=['Region', 'Agegroup'])

    df_list2[i]['Sex'] = df_list2[i].variable.map(sex_transforms_81_2012.get)
    df_list2[i]['Year'] = df_list2[i].variable.map(year_transforms_81_2012.get)
    df_list2[i].rename(columns={'value':'Deaths'}, inplace=True)
    df_list2[i] = df_list2[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
    df_list2[i]['Underlying Cause'] = dis_lookup[sheets[i]['Name']]

In [8]:
df1 = pd.concat(df_list1) #combine sheets 1-3 for 1974-1980
df2 = pd.concat(df_list2) #combine sheets 4-6 for 1981-2012
df_list = [df1, df2] 
df = pd.concat(df_list) #combine sheets 1-3 and 4-6
ons1 = df #save result
#df['Country'] = 'England & Wales' #add a label for country

In [9]:
df.tail()


Out[9]:
Region Agegroup Deaths Sex Year Underlying Cause
5051 WALES 45-54 1 Female 2012 Pulmonary Mesothelioma
5052 WALES 55-64 0 Female 2012 Pulmonary Mesothelioma
5053 WALES 65-74 0 Female 2012 Pulmonary Mesothelioma
5054 WALES 75-84 0 Female 2012 Pulmonary Mesothelioma
5055 WALES 85+ 1 Female 2012 Pulmonary Mesothelioma

5 rows × 6 columns


In [10]:
#df.Year = pd.to_datetime(df.Year.astype(str))

In [11]:
#df.index = df.Year

In [12]:
#df.groupby('Year').Deaths.sum().plot()

In [13]:
#load meso 2 table

In [14]:
workbook = xlrd.open_workbook('./data/eng/Mesothelioma table 2 FINAL.xls') #what ONS sent part two
print workbook.sheet_names()


[u'Table 1', u'Table 2']

In [15]:
df = pd.read_excel('./data/eng/Mesothelioma table 2 FINAL.xls', 'Table 1', skiprows=3)
df.head()


Out[15]:
Area of usual residence Agegroup 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 Unnamed: 14 2001.1 2002.1 2003.1 2004.1 2005.1
0 North East All Ages 124 118 109 114 99 123 77 146 128 145 134 132 NaN 21 10 11 21 17 ...
1 NaN <25 0 0 0 0 0 0 0 0 0 0 0 0 NaN 0 0 0 0 0 ...
2 NaN 25 to 34 0 0 0 0 0 0 0 0 0 0 0 0 NaN 0 0 0 0 0 ...
3 NaN 35 to 44 1 0 0 0 0 0 0 1 0 0 0 1 NaN 0 0 0 0 0 ...
4 NaN 45 to 54 4 2 5 3 1 5 2 1 3 2 0 0 NaN 1 0 0 2 2 ...

5 rows × 27 columns


In [19]:
df_list3 = [df]

#this works because years are dupilcated in the dataset for women and pandas applies a .1 to indicate it's a duplicate
def sex(year):
    year = year % 1
    if year == 0.0:
        return 'male'
    else:
        return 'female'

for i, item in enumerate(df_list3):
    df_list3[i].rename(columns={'Area of usual residence':'Region'}, inplace=True)
    df_list3[i]['Region'] = df_list3[i].icol(0).fillna(method='pad') #fix the first column
    df_list3[i]['Region'] = df_list3[i]['Region'].str.strip()
    df_list3[i]['Region'] = df_list3[i]['Region'].str.upper()
    df_list3[i]['Region'] = df_list3[i]['Region'].map(transforms.get)   
    df_list3[i] = df_list3[i][df_list3[i]['Region'].notnull()] #throw away rows that don't contain a region

    df_list3[i].rename(columns={'Unnamed: 1':'Agegroup'}, inplace=True)
    df_list3[i]['Agegroup'] = df_list3[i]['Agegroup'].str.strip()
    df_list3[i]['Agegroup'] = df_list3[i]['Agegroup'].str.upper()

    df_list3[i] = df_list3[i].dropna(axis=1, how='all') #throw away columns that don't contain data
    
    df_list3[i] = pd.melt(df_list3[i], id_vars=['Region', 'Agegroup'])
    df_list3[i]['variable'] = df_list3[i]['variable'].convert_objects(convert_numeric=True)
    
    df_list3[i]['Sex'] = df_list3[i]['variable'].map(sex)
    
    df_list3[i].rename(columns={'variable':'Year'}, inplace=True)
    df_list3[i].rename(columns={'value':'Deaths'}, inplace=True)
    df_list3[i] = df_list3[i][['Region', 'Agegroup', 'Deaths', 'Sex', 'Year']]
    df_list3[i]['Underlying Cause'] = dis_lookup[sheets[i]['Name']]

In [ ]: