In [1]:
import pandas as pd
import xlrd
import numpy as np
import os
import csv
In [2]:
def csv_from_excel(excel_file):
workbook = xlrd.open_workbook(excel_file)
all_worksheets = workbook.sheet_names()
for worksheet_name in all_worksheets:
worksheet = workbook.sheet_by_name(worksheet_name)
your_csv_file = open(''.join([worksheet_name,'.csv']), 'wb')
wr = csv.writer(your_csv_file, quoting=csv.QUOTE_ALL)
for rownum in xrange(worksheet.nrows):
wr.writerow([unicode(entry).encode("utf-8") for entry in worksheet.row_values(rownum)])
your_csv_file.close()
In [3]:
#dump xls sheets to csv for population data
#os.chdir('./data/eng/pop')
#files = !ls
#for f in files:
# csv_from_excel(f)
#os.mkdir('csv')
#!mv *csv $pathtocsv
In [4]:
#just a few transforms needed to make the ons data usable
transforms = {'YORKSHIRE AND THE HUMBER':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
'YORKSHIRE & HUMBER':'YORKSHIRE AND THE HUMBER',
'NORTH WEST':'NORTH WEST',
'EAST MIDLANDS':'EAST MIDLANDS', 'WEST MIDLANDS':'WEST MIDLANDS',
'WEST MIDLANDS (REGION)':'WEST MIDLANDS', 'EAST':'EAST',
'SOUTH EAST':'SOUTH EAST', 'SOUTH WEST':'SOUTH WEST', 'WALES':'WALES',
'NORTH EAST':'NORTH EAST', 'LONDON':'LONDON',
'EASTERN':'EAST', 'EAST OF ENGLAND':'EAST', 'EAST ANGLIA':'EAST', 'NORTH':'NORTH EAST',
'YORKSHIRE AND HUMBERSIDE':'YORKSHIRE AND THE HUMBER',
'ALL AGES':'ALL AGES', 'UNDER 25':'UNDER 25', '25-34':'25-34', '35-44':'35-44',
'45-54':'45-54', '55-64':'55-64', '65-74':'65-74', '75-84':'75-84', '85+':'85+',
'<25':'UNDER 25', '25 TO 34':'25-34', '35 TO 44':'35-44','45 TO 54':'45-54', '55 TO 64':'55-64',
'65 TO 74':'65-74', '75 TO 84':'75-84'}
In [5]:
os.chdir('.././data/eng/pop/csv')
In [6]:
#eng pop data 74-80
#df_pop_74_80
files = !ls 19*
df_list = [pd.read_csv(f, skiprows=28) for f in files] #lots of junk at start of files
colnames = ['Region', 'ALL AGES', '0.0', '1-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']
mpopcols = [2] + range(5,25)
prows = range(1,10)
#prepare male dataframe
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[prows, mpopcols].fillna(method='pad')
df_list[i].columns = colnames
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i]['0.0'].astype('float') + df_list[i]['1-4'].astype('float') + df_list[i]['5-9'].astype('float') + df_list[i]['10-14'].astype('float') + df_list[i]['15-19'].astype('float') + df_list[i]['20-24'].astype('float')
df_list[i]['25-34'] = df_list[i]['25-29'].astype('float') + df_list[i]['30-34'].astype('float')
df_list[i]['35-44'] = df_list[i]['35-39'].astype('float') + df_list[i]['40-44'].astype('float')
df_list[i]['45-54'] = df_list[i]['45-49'].astype('float') + df_list[i]['50-54'].astype('float')
df_list[i]['55-64'] = df_list[i]['55-59'].astype('float') + df_list[i]['60-64'].astype('float')
df_list[i]['65-74'] = df_list[i]['65-69'].astype('float') + df_list[i]['70-74'].astype('float')
df_list[i]['75-84'] = df_list[i]['75-79'].astype('float') + df_list[i]['80-84'].astype('float')
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Male'
df_list[i]['Year'] = files[i][:4]
df_list[i]['Population'] = df_list[i]['Population'].astype(float)
#concatenate male dataframes
df_pop_74_80_male = pd.concat(df_list)
#74-80
#general setup
files = !ls 19*
df_list = [pd.read_csv(f, skiprows=28) for f in files] #lots of junk at start of files
colnames = ['Region', 'ALL AGES', '0.0', '1-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']
prows = range(1,10)
fpopcols = [2] + range(25,45)
#prepare female dataframe
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[prows, fpopcols].fillna(method='pad')
df_list[i].columns = colnames
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i]['0.0'].astype('float') + df_list[i]['1-4'].astype('float') + df_list[i]['5-9'].astype('float') + df_list[i]['10-14'].astype('float') + df_list[i]['15-19'].astype('float') + df_list[i]['20-24'].astype('float')
df_list[i]['25-34'] = df_list[i]['25-29'].astype('float') + df_list[i]['30-34'].astype('float')
df_list[i]['35-44'] = df_list[i]['35-39'].astype('float') + df_list[i]['40-44'].astype('float')
df_list[i]['45-54'] = df_list[i]['45-49'].astype('float') + df_list[i]['50-54'].astype('float')
df_list[i]['55-64'] = df_list[i]['55-59'].astype('float') + df_list[i]['60-64'].astype('float')
df_list[i]['65-74'] = df_list[i]['65-69'].astype('float') + df_list[i]['70-74'].astype('float')
df_list[i]['75-84'] = df_list[i]['75-79'].astype('float') + df_list[i]['80-84'].astype('float')
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Female'
df_list[i]['Year'] = files[i][:4]
df_list[i]['Population'] = df_list[i]['Population'].astype(float)
#concatenate female dataframes
df_pop_74_80_female = pd.concat(df_list)
#combine male and female dataframes
df_pop_74_80_list = [df_pop_74_80_female , df_pop_74_80_male]
df_pop_74_80 = pd.concat(df_pop_74_80_list)
In [7]:
#eng pop data 81-90
#df_pop_81_90
#general setup
files = !ls Mid*Males.csv
files = files[:10] #cos they messed with format after
#setup male dataframe
df_list = [pd.read_csv(f) for f in files]
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[7:17, 1:]
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i]['0.0'].astype('float') + df_list[i]['1-4'].astype('float') + df_list[i]['5-9'].astype('float') + df_list[i]['10-14'].astype('float') + df_list[i]['15-19'].astype('float') + df_list[i]['20-24'].astype('float')
df_list[i]['25-34'] = df_list[i]['25-29'].astype('float') + df_list[i]['30-34'].astype('float')
df_list[i]['35-44'] = df_list[i]['35-39'].astype('float') + df_list[i]['40-44'].astype('float')
df_list[i]['45-54'] = df_list[i]['45-49'].astype('float') + df_list[i]['50-54'].astype('float')
df_list[i]['55-64'] = df_list[i]['55-59'].astype('float') + df_list[i]['60-64'].astype('float')
df_list[i]['65-74'] = df_list[i]['65-69'].astype('float') + df_list[i]['70-74'].astype('float')
df_list[i]['75-84'] = df_list[i]['75-79'].astype('float') + df_list[i]['80-84'].astype('float')
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Male'
df_list[i]['Year'] = files[i][4:8]
df_list[i]['Population'] = df_list[i]['Population'].astype(float)
df_pop_81_90_male = pd.concat(df_list)
#setup female dataframe
files = !ls Mid*Females.csv
files = files[:10] #cos they messed with format after
df_list = [pd.read_csv(f) for f in files]
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[7:17, 1:]
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i]['0.0'].astype('float') + df_list[i]['1-4'].astype('float') + df_list[i]['5-9'].astype('float') + df_list[i]['10-14'].astype('float') + df_list[i]['15-19'].astype('float') + df_list[i]['20-24'].astype('float')
df_list[i]['25-34'] = df_list[i]['25-29'].astype('float') + df_list[i]['30-34'].astype('float')
df_list[i]['35-44'] = df_list[i]['35-39'].astype('float') + df_list[i]['40-44'].astype('float')
df_list[i]['45-54'] = df_list[i]['45-49'].astype('float') + df_list[i]['50-54'].astype('float')
df_list[i]['55-64'] = df_list[i]['55-59'].astype('float') + df_list[i]['60-64'].astype('float')
df_list[i]['65-74'] = df_list[i]['65-69'].astype('float') + df_list[i]['70-74'].astype('float')
df_list[i]['75-84'] = df_list[i]['75-79'].astype('float') + df_list[i]['80-84'].astype('float')
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Female'
df_list[i]['Year'] = files[i][4:8]
df_list[i]['Population'] = df_list[i]['Population'].astype(float)
df_pop_81_90_female = pd.concat(df_list)
#combine male and female dataframes
df_pop_81_90_list = [df_pop_81_90_male, df_pop_81_90_female]
df_pop_81_90 = pd.concat(df_pop_81_90_list)
In [8]:
#eng pop data 91-2000
#df_pop_91_2000
#gen setup
files = !ls Mid*Males.csv
files = files[10:20] #cos they messed with format in 1991 and 2001
#setup male dataframe
df_list = [pd.read_csv(f) for f in files]
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[7:17,:]
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i].iloc[:, 3:28].astype('float').sum(axis=1)
df_list[i]['25-34'] = df_list[i].iloc[:, 28:38].astype('float').sum(axis=1)
df_list[i]['35-44'] = df_list[i].iloc[:, 38:48].astype('float').sum(axis=1)
df_list[i]['45-54'] = df_list[i].iloc[:, 48:58].astype('float').sum(axis=1)
df_list[i]['55-64'] = df_list[i].iloc[:, 58:68].astype('float').sum(axis=1)
df_list[i]['65-74'] = df_list[i].iloc[:, 68:78].astype('float').sum(axis=1)
df_list[i]['75-84'] = df_list[i].iloc[:, 78:88].astype('float').sum(axis=1)
df_list[i]['85+'] = df_list[i].iloc[:,88]
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Male'
df_list[i]['Year'] = files[i][4:8]
df_pop_91_2000_male = pd.concat(df_list)
#gen setup
files = !ls Mid*Females.csv
files = files[10:20] #cos they messed with format in 1991 and 2001
df_list = [pd.read_csv(f) for f in files]
#setup female dataframe
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[7:17,:]
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i].iloc[:, 3:28].astype('float').sum(axis=1)
df_list[i]['25-34'] = df_list[i].iloc[:, 28:38].astype('float').sum(axis=1)
df_list[i]['35-44'] = df_list[i].iloc[:, 38:48].astype('float').sum(axis=1)
df_list[i]['45-54'] = df_list[i].iloc[:, 48:58].astype('float').sum(axis=1)
df_list[i]['55-64'] = df_list[i].iloc[:, 58:68].astype('float').sum(axis=1)
df_list[i]['65-74'] = df_list[i].iloc[:, 68:78].astype('float').sum(axis=1)
df_list[i]['75-84'] = df_list[i].iloc[:, 78:88].astype('float').sum(axis=1)
df_list[i]['85+'] = df_list[i].iloc[:,88]
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Female'
df_list[i]['Year'] = files[i][4:8]
df_pop_91_2000_female = pd.concat(df_list)
#combine dataframe
df_pop_91_2000_list = [df_pop_91_2000_male, df_pop_91_2000_female]
df_pop_91_2000 = pd.concat(df_pop_91_2000_list)
In [9]:
#eng pop data 2001
#df_pop_2001
#2001 is a special case (format changes again)
#I very almost lost the will to live and hacked together the required csvs by hand for this one
#gen setup
files = !ls Mad*Males.csv
df_list = [pd.read_csv(f) for f in files]
for column in df_list[0].columns:
df_list[0][column] = df_list[0][column].str.replace(',','')
df_list[0]['Name'] = df_list[0]['Name'].str.upper()
df_list[0] = df_list[0][(df_list[0]['Code'] == 'E12000001')| #'NORTH EAST'
(df_list[0]['Code'] == 'E12000002')| #'NORTH WEST'
(df_list[0]['Code'] == 'E12000003')| #'YORKSHIRE AND THE HUMBER'
(df_list[0]['Code'] == 'E12000004')| #'EAST MIDLANDS'
(df_list[0]['Code'] == 'E12000005')| #'WEST MIDLANDS'
(df_list[0]['Code'] == 'E12000006')| #'EAST'
(df_list[0]['Name'] == 'LONDON')| # E12000006, used name because two londons in dataset
(df_list[0]['Code'] == 'E12000008')| #'SOUTH EAST'
(df_list[0]['Code'] == 'E12000009')| #'SOUTH WEST'
(df_list[0]['Code'] == 'W92000004')] #'WALES
#male setup
for i, item in enumerate(df_list):
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i].iloc[:, 3:28].astype('float').sum(axis=1)
df_list[i]['25-34'] = df_list[i].iloc[:, 28:38].astype('float').sum(axis=1)
df_list[i]['35-44'] = df_list[i].iloc[:, 38:48].astype('float').sum(axis=1)
df_list[i]['45-54'] = df_list[i].iloc[:, 48:58].astype('float').sum(axis=1)
df_list[i]['55-64'] = df_list[i].iloc[:, 58:68].astype('float').sum(axis=1)
df_list[i]['65-74'] = df_list[i].iloc[:, 68:78].astype('float').sum(axis=1)
df_list[i]['75-84'] = df_list[i].iloc[:, 78:88].astype('float').sum(axis=1)
df_list[i]['85+'] = df_list[i].iloc[:,88:94].astype('float').sum(axis=1)
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Male'
df_list[i]['Year'] = files[i][4:8]
df_list[i]['Population'] = df_list[i]['Population'].astype('float')
df_pop_2001_male = pd.concat(df_list)
#2001 special case
#gen setup
files = !ls Mad*Females.csv
df_list = [pd.read_csv(f) for f in files]
for column in df_list[0].columns:
df_list[0][column] = df_list[0][column].str.replace(',','')
df_list[0]['Name'] = df_list[0]['Name'].str.upper()
df_list[0] = df_list[0][(df_list[0]['Code'] == 'E12000001')| #'NORTH EAST'
(df_list[0]['Code'] == 'E12000002')| #'NORTH WEST'
(df_list[0]['Code'] == 'E12000003')| #'YORKSHIRE AND THE HUMBER'
(df_list[0]['Code'] == 'E12000004')| #'EAST MIDLANDS'
(df_list[0]['Code'] == 'E12000005')| #'WEST MIDLANDS'
(df_list[0]['Code'] == 'E12000006')| #'EAST'
(df_list[0]['Name'] == 'LONDON')| # E12000006, used name because two londons in dataset
(df_list[0]['Code'] == 'E12000008')| #'SOUTH EAST'
(df_list[0]['Code'] == 'E12000009')| #'SOUTH WEST'
(df_list[0]['Code'] == 'W92000004')] #'WALES
#female setup
for i, item in enumerate(df_list):
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i].iloc[:, 3:28].astype('float').sum(axis=1)
df_list[i]['25-34'] = df_list[i].iloc[:, 28:38].astype('float').sum(axis=1)
df_list[i]['35-44'] = df_list[i].iloc[:, 38:48].astype('float').sum(axis=1)
df_list[i]['45-54'] = df_list[i].iloc[:, 48:58].astype('float').sum(axis=1)
df_list[i]['55-64'] = df_list[i].iloc[:, 58:68].astype('float').sum(axis=1)
df_list[i]['65-74'] = df_list[i].iloc[:, 68:78].astype('float').sum(axis=1)
df_list[i]['75-84'] = df_list[i].iloc[:, 78:88].astype('float').sum(axis=1)
df_list[i]['85+'] = df_list[i].iloc[:,88:94].astype('float').sum(axis=1)
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Female'
df_list[i]['Year'] = files[i][4:8]
df_list[i]['Population'] = df_list[i]['Population'].astype('float')
df_pop_2001_female = pd.concat(df_list)
# combine male and females
df_pop_2001_list = [df_pop_2001_female, df_pop_2001_male]
df_pop_2001 = pd.concat(df_pop_2001_list)
In [10]:
#eng pop data 2002-2012
#df_pop_2002_2012
#gen setup
files = !ls Mid*Males.csv
files = files[20:] #cos they messed with format in 1991 and 2001
df_list = [pd.read_csv(f) for f in files]
#male setup
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[7:17,:]
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i].iloc[:, 3:28].sum(axis=1)
df_list[i]['25-34'] = df_list[i].iloc[:, 28:38].astype('float').sum(axis=1)
df_list[i]['35-44'] = df_list[i].iloc[:, 38:48].astype('float').sum(axis=1)
df_list[i]['45-54'] = df_list[i].iloc[:, 48:58].astype('float').sum(axis=1)
df_list[i]['55-64'] = df_list[i].iloc[:, 58:68].astype('float').sum(axis=1)
df_list[i]['65-74'] = df_list[i].iloc[:, 68:78].astype('float').sum(axis=1)
df_list[i]['75-84'] = df_list[i].iloc[:, 78:88].astype('float').sum(axis=1)
df_list[i]['85+'] = df_list[i].iloc[:,88:94].astype('float').sum(axis=1)
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Male'
df_list[i]['Year'] = files[i][4:8]
df_pop_02_2012_male = pd.concat(df_list)
files = !ls Mid*Females.csv
files = files[20:] #cos they messed with format in 1991 and 2001
df_list = [pd.read_csv(f) for f in files]
#female setup
for i, item in enumerate(df_list):
df_list[i] = df_list[i].iloc[7:17,:]
df_list[i].rename(columns={'Name':'Region'}, inplace=True)
df_list[i].rename(columns={' ALL AGES':'ALL AGES'}, inplace=True)
df_list[i]['Region'] = df_list[i]['Region'].str.strip()
df_list[i]['Region'] = df_list[i]['Region'].str.upper()
df_list[i]['Region'] = df_list[i]['Region'].map(transforms.get)
df_list[i]['UNDER 25'] = df_list[i].iloc[:, 3:28].astype('float').sum(axis=1)
df_list[i]['25-34'] = df_list[i].iloc[:, 28:38].astype('float').sum(axis=1)
df_list[i]['35-44'] = df_list[i].iloc[:, 38:48].astype('float').sum(axis=1)
df_list[i]['45-54'] = df_list[i].iloc[:, 48:58].astype('float').sum(axis=1)
df_list[i]['55-64'] = df_list[i].iloc[:, 58:68].astype('float').sum(axis=1)
df_list[i]['65-74'] = df_list[i].iloc[:, 68:78].astype('float').sum(axis=1)
df_list[i]['75-84'] = df_list[i].iloc[:, 78:88].astype('float').sum(axis=1)
df_list[i]['85+'] = df_list[i].iloc[:,88:94].astype('float').sum(axis=1)
df_list[i] = df_list[i][['Region', 'ALL AGES', 'UNDER 25', '25-34', '35-44', '45-54', '55-64', '65-74', '75-84', '85+']]
df_list[i] = pd.melt(df_list[i], id_vars=['Region'])
df_list[i].rename(columns={'variable':'Agegroup'}, inplace=True)
df_list[i].rename(columns={'value':'Population'}, inplace=True)
df_list[i]['Sex'] = 'Female'
df_list[i]['Year'] = files[i][4:8]
df_pop_02_2012_female = pd.concat(df_list)
#combine male and female
df_pop_02_2012_list = [df_pop_02_2012_male, df_pop_02_2012_female]
df_pop_02_2012 = pd.concat(df_pop_02_2012_list)
In [11]:
df_list_pop = [df_pop_74_80, df_pop_81_90, df_pop_91_2000, df_pop_2001, df_pop_02_2012]
df = pd.concat(df_list_pop)
In [12]:
df.Year = pd.to_datetime(df.Year)
In [13]:
df.Year
Out[13]:
In [14]:
df.to_pickle('../../../pickle/pypop.pkl')
In [14]: