Lets prepare the data for analysis some more


In [1]:
import pandas as pd
from pandas import Series, DataFrame, Panel
import pickle

In [10]:
pypf = pd.read_pickle('.././data/pickle/pypf.pkl')
pypop = pd.read_pickle('.././data/pickle/pypop.pkl')
df = pd.merge(pypf, pypop)

In [12]:
df.head(9)


Out[12]:
Region Agegroup Deaths Sex Year Cause Population
0 NORTH EAST ALL AGES 26.000000 Male 1974-01-01 IPF 1536100.0
1 NORTH EAST ALL AGES 1.000000 Male 1974-01-01 Asbestosis 1536100.0
2 NORTH EAST ALL AGES 12.500000 Male 1974-01-01 All Mesothelioma 1536100.0
3 NORTH EAST UNDER 25 0.000000 Male 1974-01-01 IPF 615500.0
4 NORTH EAST 25-34 0.000000 Male 1974-01-01 IPF 203300.0
5 NORTH EAST 25-34 0.000000 Male 1974-01-01 All Mesothelioma 203300.0
6 NORTH EAST 35-44 1.000000 Male 1974-01-01 IPF 183600.0
7 NORTH EAST 35-44 0.000000 Male 1974-01-01 Asbestosis 183600.0
8 NORTH EAST 35-44 1.785714 Male 1974-01-01 All Mesothelioma 183600.0

In [14]:
pypf[pypf['Cause'] == 'All Mesothelioma']


Out[14]:
Region Agegroup Deaths Sex Year Cause
0 NORTH EAST ALL AGES 12.500000 Male 1974-01-01 All Mesothelioma
1 NORTH EAST 25-34 0.000000 Male 1974-01-01 All Mesothelioma
2 NORTH EAST 35-44 1.785714 Male 1974-01-01 All Mesothelioma
3 NORTH EAST 45-54 7.142857 Male 1974-01-01 All Mesothelioma
4 NORTH EAST 55-64 1.785714 Male 1974-01-01 All Mesothelioma
5 NORTH EAST 65-74 1.785714 Male 1974-01-01 All Mesothelioma
6 NORTH EAST 75-84 0.000000 Male 1974-01-01 All Mesothelioma
7 YORKSHIRE AND THE HUMBER ALL AGES 14.285714 Male 1974-01-01 All Mesothelioma
8 YORKSHIRE AND THE HUMBER UNDER 25 0.000000 Male 1974-01-01 All Mesothelioma
9 YORKSHIRE AND THE HUMBER 25-34 0.000000 Male 1974-01-01 All Mesothelioma
10 YORKSHIRE AND THE HUMBER 35-44 1.785714 Male 1974-01-01 All Mesothelioma
11 YORKSHIRE AND THE HUMBER 45-54 1.785714 Male 1974-01-01 All Mesothelioma
12 YORKSHIRE AND THE HUMBER 55-64 5.357143 Male 1974-01-01 All Mesothelioma
13 YORKSHIRE AND THE HUMBER 65-74 5.357143 Male 1974-01-01 All Mesothelioma
14 YORKSHIRE AND THE HUMBER 75-84 0.000000 Male 1974-01-01 All Mesothelioma
15 NORTH WEST ALL AGES 41.071429 Male 1974-01-01 All Mesothelioma
16 NORTH WEST 25-34 0.000000 Male 1974-01-01 All Mesothelioma
17 NORTH WEST 35-44 0.000000 Male 1974-01-01 All Mesothelioma
18 NORTH WEST 45-54 8.928571 Male 1974-01-01 All Mesothelioma
19 NORTH WEST 55-64 14.285714 Male 1974-01-01 All Mesothelioma
20 NORTH WEST 65-74 16.071429 Male 1974-01-01 All Mesothelioma
21 NORTH WEST 75-84 1.785714 Male 1974-01-01 All Mesothelioma
22 NORTH WEST 85+ 0.000000 Male 1974-01-01 All Mesothelioma
23 EAST MIDLANDS ALL AGES 8.928571 Male 1974-01-01 All Mesothelioma
24 EAST MIDLANDS UNDER 25 0.000000 Male 1974-01-01 All Mesothelioma
25 EAST MIDLANDS 25-34 0.000000 Male 1974-01-01 All Mesothelioma
26 EAST MIDLANDS 35-44 0.000000 Male 1974-01-01 All Mesothelioma
27 EAST MIDLANDS 45-54 5.357143 Male 1974-01-01 All Mesothelioma
28 EAST MIDLANDS 55-64 1.785714 Male 1974-01-01 All Mesothelioma
29 EAST MIDLANDS 65-74 1.785714 Male 1974-01-01 All Mesothelioma
... ... ... ... ... ... ...
2130 LONDON 65-74 11.000000 Female 2012-01-01 All Mesothelioma
2131 LONDON 75-84 13.000000 Female 2012-01-01 All Mesothelioma
2132 LONDON 85+ 8.000000 Female 2012-01-01 All Mesothelioma
2133 SOUTH EAST ALL AGES 63.000000 Female 2012-01-01 All Mesothelioma
2134 SOUTH EAST UNDER 25 0.000000 Female 2012-01-01 All Mesothelioma
2135 SOUTH EAST 25-34 0.000000 Female 2012-01-01 All Mesothelioma
2136 SOUTH EAST 35-44 0.000000 Female 2012-01-01 All Mesothelioma
2137 SOUTH EAST 45-54 3.000000 Female 2012-01-01 All Mesothelioma
2138 SOUTH EAST 55-64 7.000000 Female 2012-01-01 All Mesothelioma
2139 SOUTH EAST 65-74 20.000000 Female 2012-01-01 All Mesothelioma
2140 SOUTH EAST 75-84 19.000000 Female 2012-01-01 All Mesothelioma
2141 SOUTH EAST 85+ 14.000000 Female 2012-01-01 All Mesothelioma
2142 SOUTH WEST ALL AGES 45.000000 Female 2012-01-01 All Mesothelioma
2143 SOUTH WEST UNDER 25 0.000000 Female 2012-01-01 All Mesothelioma
2144 SOUTH WEST 25-34 0.000000 Female 2012-01-01 All Mesothelioma
2145 SOUTH WEST 35-44 0.000000 Female 2012-01-01 All Mesothelioma
2146 SOUTH WEST 45-54 0.000000 Female 2012-01-01 All Mesothelioma
2147 SOUTH WEST 55-64 7.000000 Female 2012-01-01 All Mesothelioma
2148 SOUTH WEST 65-74 10.000000 Female 2012-01-01 All Mesothelioma
2149 SOUTH WEST 75-84 20.000000 Female 2012-01-01 All Mesothelioma
2150 SOUTH WEST 85+ 8.000000 Female 2012-01-01 All Mesothelioma
2151 WALES ALL AGES 17.000000 Female 2012-01-01 All Mesothelioma
2152 WALES UNDER 25 0.000000 Female 2012-01-01 All Mesothelioma
2153 WALES 25-34 0.000000 Female 2012-01-01 All Mesothelioma
2154 WALES 35-44 0.000000 Female 2012-01-01 All Mesothelioma
2155 WALES 45-54 1.000000 Female 2012-01-01 All Mesothelioma
2156 WALES 55-64 2.000000 Female 2012-01-01 All Mesothelioma
2157 WALES 65-74 8.000000 Female 2012-01-01 All Mesothelioma
2158 WALES 75-84 4.000000 Female 2012-01-01 All Mesothelioma
2159 WALES 85+ 2.000000 Female 2012-01-01 All Mesothelioma

6342 rows × 6 columns


In [11]:
df_male = df[df['Sex'] == 'Male']

standpop08 = df_male[df_male['Year'] == pd.to_datetime('2008')]

wholeref = Series(standpop08.Population.values, index=standpop08.Agegroup).to_dict()

neref = Series(standpop08[standpop08['Region'] == 'NORTH EAST'].Population.values, index=standpop08[standpop08['Region'] == 'NORTH EAST'].Agegroup).to_dict()
yhref = Series(standpop08[standpop08['Region'] == 'YORKSHIRE AND THE HUMBER'].Population.values, index=standpop08[standpop08['Region'] == 'YORKSHIRE AND THE HUMBER'].Agegroup).to_dict()
nwref = Series(standpop08[standpop08['Region'] == 'NORTH WEST'].Population.values, index=standpop08[standpop08['Region'] == 'NORTH WEST'].Agegroup).to_dict()
emref = Series(standpop08[standpop08['Region'] == 'EAST MIDLANDS'].Population.values, index=standpop08[standpop08['Region'] == 'EAST MIDLANDS'].Agegroup).to_dict()
wmref = Series(standpop08[standpop08['Region'] == 'WEST MIDLANDS'].Population.values, index=standpop08[standpop08['Region'] == 'WEST MIDLANDS'].Agegroup).to_dict()
eeref = Series(standpop08[standpop08['Region'] == 'EAST'].Population.values, index=standpop08[standpop08['Region'] == 'EAST'].Agegroup).to_dict()
seref = Series(standpop08[standpop08['Region'] == 'SOUTH EAST'].Population.values, index=standpop08[standpop08['Region'] == 'SOUTH EAST'].Agegroup).to_dict()
swref = Series(standpop08[standpop08['Region'] == 'SOUTH WEST'].Population.values, index=standpop08[standpop08['Region'] == 'SOUTH WEST'].Agegroup).to_dict()
wwref = Series(standpop08[standpop08['Region'] == 'WALES'].Population.values, index=standpop08[standpop08['Region'] == 'WALES'].Agegroup).to_dict()
llref = Series(standpop08[standpop08['Region'] == 'LONDON'].Population.values, index=standpop08[standpop08['Region'] == 'LONDON'].Agegroup).to_dict()

ref_list = (neref, yhref, nwref, emref, wmref, eeref, seref, swref, wwref, llref)


ne = df_male[df_male['Region'] == 'NORTH EAST']
yh = df_male[df_male['Region'] == 'YORKSHIRE AND THE HUMBER']
nw = df_male[df_male['Region'] == 'NORTH WEST']
em = df_male[df_male['Region'] == 'EAST MIDLANDS']
wm = df_male[df_male['Region'] == 'WEST MIDLANDS']
ee = df_male[df_male['Region'] == 'EAST']
se = df_male[df_male['Region'] == 'SOUTH EAST']
sw = df_male[df_male['Region'] == 'SOUTH WEST']
ww = df_male[df_male['Region'] == 'WALES']
ll = df_male[df_male['Region'] == 'LONDON']

reg_list = (ne, yh, nw, em, wm, ee, se, sw, ww, ll)


for i, item in enumerate(reg_list):
    reg_list[i]['2008 population'] = reg_list[i]['Agegroup'].map(lambda x: ref_list[i][x] )
    
df1 = pd.concat(reg_list)

df_female = df[df['Sex'] == 'Female']

standpop08 = df_female[df_female['Year'] == pd.to_datetime('2008')]

neref = Series(standpop08[standpop08['Region'] == 'NORTH EAST'].Population.values, index=standpop08[standpop08['Region'] == 'NORTH EAST'].Agegroup).to_dict()
yhref = Series(standpop08[standpop08['Region'] == 'YORKSHIRE AND THE HUMBER'].Population.values, index=standpop08[standpop08['Region'] == 'YORKSHIRE AND THE HUMBER'].Agegroup).to_dict()
nwref = Series(standpop08[standpop08['Region'] == 'NORTH WEST'].Population.values, index=standpop08[standpop08['Region'] == 'NORTH WEST'].Agegroup).to_dict()
emref = Series(standpop08[standpop08['Region'] == 'EAST MIDLANDS'].Population.values, index=standpop08[standpop08['Region'] == 'EAST MIDLANDS'].Agegroup).to_dict()
wmref = Series(standpop08[standpop08['Region'] == 'WEST MIDLANDS'].Population.values, index=standpop08[standpop08['Region'] == 'WEST MIDLANDS'].Agegroup).to_dict()
eeref = Series(standpop08[standpop08['Region'] == 'EAST'].Population.values, index=standpop08[standpop08['Region'] == 'EAST'].Agegroup).to_dict()
seref = Series(standpop08[standpop08['Region'] == 'SOUTH EAST'].Population.values, index=standpop08[standpop08['Region'] == 'SOUTH EAST'].Agegroup).to_dict()
swref = Series(standpop08[standpop08['Region'] == 'SOUTH WEST'].Population.values, index=standpop08[standpop08['Region'] == 'SOUTH WEST'].Agegroup).to_dict()
wwref = Series(standpop08[standpop08['Region'] == 'WALES'].Population.values, index=standpop08[standpop08['Region'] == 'WALES'].Agegroup).to_dict()
llref = Series(standpop08[standpop08['Region'] == 'LONDON'].Population.values, index=standpop08[standpop08['Region'] == 'LONDON'].Agegroup).to_dict()

ref_list = (neref, yhref, nwref, emref, wmref, eeref, seref, swref, wwref, llref)


ne = df_female[df_female['Region'] == 'NORTH EAST']
yh = df_female[df_female['Region'] == 'YORKSHIRE AND THE HUMBER']
nw = df_female[df_female['Region'] == 'NORTH WEST']
em = df_female[df_female['Region'] == 'EAST MIDLANDS']
wm = df_female[df_female['Region'] == 'WEST MIDLANDS']
ee = df_female[df_female['Region'] == 'EAST']
se = df_female[df_female['Region'] == 'SOUTH EAST']
sw = df_female[df_female['Region'] == 'SOUTH WEST']
ww = df_female[df_female['Region'] == 'WALES']
ll = df_female[df_female['Region'] == 'LONDON']


reg_list = (ne, yh, nw, em, wm, ee, se, sw, ww, ll)

for i, item in enumerate(reg_list):
    reg_list[i]['2008 population'] = reg_list[i]['Agegroup'].map(lambda x: ref_list[i][x] )
    
df2 = pd.concat(reg_list)

df_list = [df1, df2]
df = pd.concat(df_list)


/home/drcjar/.virtualenvs/pypf/lib/python2.7/site-packages/ipykernel/__main__.py:36: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/drcjar/.virtualenvs/pypf/lib/python2.7/site-packages/ipykernel/__main__.py:73: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [12]:
df['Rate per 100,000 population'] = (df['Deaths'] / df['Population']) * 100000
#df1 = df[df['Agegroup'] != 'ALL AGES'] #lets throw away all ages rows
df['Estimated deaths age standardised to 2008 population'] = (df['2008 population'] / 100000) * df['Rate per 100,000 population']

In [13]:
df['Rate per 100,000 (standardised)'] = (df['Estimated deaths age standardised to 2008 population'] / df['2008 population']) * 100000

In [14]:
df.to_pickle('.././data/pickle/pypf_prep.pkl')

In [15]:
regpop = df[(df['Agegroup'] == 'ALL AGES') & (df['Year'] == pd.to_datetime('1979'))].groupby('Region').Population.sum()
regpop.sort()
regpop.values.sum() / 3


/home/drcjar/.virtualenvs/pypf/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
  from ipykernel import kernelapp as app
Out[15]:
49508200.0

In [16]:
df.head()


Out[16]:
Region Agegroup Deaths Sex Year Cause Population 2008 population Rate per 100,000 population Estimated deaths age standardised to 2008 population Rate per 100,000 (standardised)
0 NORTH EAST ALL AGES 26.0 Male 1974-01-01 IPF 1536100.0 1256065.0 1.692598 21.260133 1.692598
1 NORTH EAST ALL AGES 1.0 Male 1974-01-01 Asbestosis 1536100.0 1256065.0 0.065100 0.817697 0.065100
2 NORTH EAST ALL AGES 12.5 Male 1974-01-01 All Mesothelioma 1536100.0 1256065.0 0.813749 10.221218 0.813749
3 NORTH EAST UNDER 25 0.0 Male 1974-01-01 IPF 615500.0 405899.0 0.000000 0.000000 0.000000
4 NORTH EAST 25-34 0.0 Male 1974-01-01 IPF 203300.0 149126.0 0.000000 0.000000 0.000000

In [ ]:


In [ ]: