This notebook cleans several UK schools datasets and joins them with other data sources, such as deprivation data.
Main datasets:
- school spine (2016-2017, England)
- school census
- school workforce (SWF)
- school spending (CFR)
- deprivation scores by postcode (2015)
- Key Stage 2 final results (KS2)
In [1]:
import pandas as pd
import numpy as np
In [2]:
def is_int(value):
    try:
        int(value)
        return True
    except ValueError:
        return False

def is_float(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

# Converters for pd.read_csv: parse the value if possible, otherwise fall back to NaN.
to_float = lambda x: float(x) if is_float(x) else np.nan
to_int = lambda x: int(x) if is_int(x) else np.nan
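These converters fall back to NaN whenever a value does not parse, so pd.read_csv never fails on non-numeric placeholders in the raw files. A minimal illustration (the example values are made up):
to_int('42')     # 42
to_int('n/a')    # nan (non-numeric placeholder)
to_float('3.5')  # 3.5
to_float('')     # nan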
In [3]:
def clean_spine(file_path):
    # Lookup table mapping local authority (LEA) codes to LA names.
    def la_codes(file_path):
        la_codes = pd.read_csv(file_path, usecols=['LEA', 'LA Name'])
        la_codes.columns = ['la', 'la name']
        return la_codes

    la_codes = la_codes('/project/uk-schools-clustering/data/meta/la_and_region_codes_meta.csv')
    spine = pd.read_csv(
        file_path,
        usecols=['URN', 'LA', 'SCHNAME', 'LOCALITY', 'TOWN', 'POSTCODE'],
        dtype={
            'URN': 'object'
        }
    )
    # Strip spaces so postcodes can later be matched against the deprivation data.
    spine['POSTCODE'] = spine['POSTCODE'].str.replace(' ', '')
    spine.columns = ['urn', 'la', 'name', 'locality', 'town', 'postcode']
    return pd.merge(spine, la_codes, on=['la']).drop_duplicates('urn')
In [4]:
clean_spine('/project/uk-schools-clustering/data/raw/2016-2017_england_spine.csv').sample(5)
Out[4]:
In [5]:
def clean_census(file_path):
    census = pd.read_csv(
        file_path, usecols=['URN', 'NOR', 'NORG', 'NORB', 'NUMEAL', 'NUMENGFL', 'PNUMFSM'],
        converters={
            'NOR': to_int,
            'NORG': to_int,
            'NORB': to_int,
            'NUMEAL': to_int,
            'NUMENGFL': to_int,
            'PNUMFSM': to_float
        }
    )
    # Turn the free school meals percentage into an (approximate) pupil count.
    census['on free meal'] = (census['NORG'] * census['PNUMFSM']) / 100
    census['on free meal'] = census['on free meal'].round()
    census.drop(inplace=True, columns=['PNUMFSM'])
    census.columns = ['urn', 'total pupils on roll',
                      'girls on roll', 'boys on roll',
                      'english not first language', 'english first language', 'on free meal']
    return census.drop_duplicates('urn')
In [6]:
clean_census('/project/uk-schools-clustering/data/raw/2016-2017_england_census.csv')
Out[6]:
In [7]:
def clean_workforce(file_path):
    # Remove the £ sign and swap commas for dots so the salary parses as a float.
    clean_salary = lambda x: x.replace('£', '').replace(',', '.')
    workforce = pd.read_csv(
        file_path,
        usecols=['URN', 'Total Number of Teachers (Headcount)', 'Mean Gross FTE Salary of All Teachers'],
        dtype={'URN': object},
        converters={
            'Total Number of Teachers (Headcount)': to_int,
            'Mean Gross FTE Salary of All Teachers': lambda x: to_float(clean_salary(x))
        }
    )
    workforce.columns = ['urn', 'teacher headcount', 'mean salary fte']
    return workforce
In [8]:
clean_workforce('/project/uk-schools-clustering/data/raw/2016-2017_england_swf.csv')
Out[8]:
In [9]:
def clean_spending(file_path):
    # Local to_float that also swaps commas for dots before parsing
    # (shadows the module-level to_float inside this function).
    clean_value = lambda x: x.replace(',', '.')
    to_float = lambda x: float(clean_value(x)) if is_float(clean_value(x)) else np.nan
    spending = pd.read_csv(
        file_path,
        usecols=['URN', 'TOTALINCOME', 'TOTALEXPENDITURE'],
        dtype={
            'URN': 'object'
        },
        converters={
            'TOTALINCOME': to_float,
            'TOTALEXPENDITURE': to_float
        }
    )
    spending.columns = ['urn', 'total income pp', 'total expenditure pp']
    return spending
In [10]:
clean_spending('/project/uk-schools-clustering/data/raw/2016-2017_england_cfr.csv')
Out[10]:
In [11]:
def clean_deprivation(file_path):
    deprivation = pd.read_csv(
        file_path,
        usecols=['Postcode', 'Income Score', 'Employment Score', 'IDACI Score'],
        converters={
            # Strip spaces so postcodes line up with the spine's join key.
            'Postcode': lambda s: s.replace(' ', ''),
            'Income Score': to_float,
            'Employment Score': to_float,
            'IDACI Score': to_float
        }
    )
    deprivation.columns = ['postcode', 'income score', 'empl score', 'idaci score']
    return deprivation
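The postcode is normalised here in the same way as in clean_spine, so the later join on postcode compares identical keys. A small sketch with a made-up postcode:
raw_postcode = 'SW1A 1AA'      # hypothetical value
raw_postcode.replace(' ', '')  # 'SW1A1AA', the form produced on both sides of the join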
In [12]:
clean_deprivation('/project/uk-schools-clustering/data/raw/deprivation-by-postcode-2015.csv')
Out[12]:
In [13]:
def clean_k2final(file_path):
    # Convert a percentage string such as '65%' into a fraction (0.65).
    def clean_percent(percent_str):
        percent_candidate = percent_str.replace('%', '')
        return to_float(percent_candidate) / 100

    k2final = pd.read_csv(
        file_path,
        usecols=['URN', 'PTREAD_EXP',
                 'PTMAT_EXP', 'PTGPS_EXP', 'PTWRITTA_EXP',
                 'READ_AVERAGE', 'GPS_AVERAGE', 'MAT_AVERAGE'
                 ],
        converters={
            'PTREAD_EXP': clean_percent,
            'PTMAT_EXP': clean_percent,
            'PTGPS_EXP': clean_percent,
            'PTWRITTA_EXP': clean_percent,
            'READ_AVERAGE': to_int,
            'GPS_AVERAGE': to_int,
            'MAT_AVERAGE': to_int
        }
    )
    k2final.rename(columns={
        'URN': 'urn',
        'PTREAD_EXP': 'perc pupils meeting reading standard',
        'PTMAT_EXP': 'perc pupils meeting math standard',
        'PTGPS_EXP': 'perc pupils meeting grammar standard',
        'PTWRITTA_EXP': 'perc pupils meeting writing standard',
        'READ_AVERAGE': 'avg reading scaled score',
        'GPS_AVERAGE': 'avg grammar scaled score',
        'MAT_AVERAGE': 'avg math scaled score'
    }, inplace=True)
    return k2final
In [14]:
clean_k2final('/project/uk-schools-clustering/data/raw/2016-2017_england_ks2final.csv')
Out[14]:
In [86]:
def get_data(save_to=None, columns=None):
    spine = clean_spine('/project/uk-schools-clustering/data/raw/2016-2017_england_spine.csv')
    census = clean_census('/project/uk-schools-clustering/data/raw/2016-2017_england_census.csv')
    workforce = clean_workforce('/project/uk-schools-clustering/data/raw/2016-2017_england_swf.csv')
    spending = clean_spending('/project/uk-schools-clustering/data/raw/2016-2017_england_cfr.csv')
    deprivation = clean_deprivation('/project/uk-schools-clustering/data/raw/deprivation-by-postcode-2015.csv')
    k2final = clean_k2final('/project/uk-schools-clustering/data/raw/2016-2017_england_ks2final.csv')

    # Join everything on URN, except deprivation, which joins on postcode.
    result = pd.merge(spine, census, on=['urn'])
    result = pd.merge(result, deprivation, on=['postcode'])
    result = pd.merge(result, workforce, on=['urn'])
    result = pd.merge(result, spending, on=['urn'])
    result = pd.merge(result, k2final, on=['urn'])

    # Drop schools missing any of the key numeric features.
    result.dropna(axis=0, subset=[
        'total income pp',
        'idaci score',
        'mean salary fte',
        'perc pupils meeting reading standard',
        'perc pupils meeting grammar standard',
        'perc pupils meeting math standard',
        'avg reading scaled score',
        'avg grammar scaled score',
        'avg math scaled score'
    ], how='any', inplace=True)

    columns_to_select = result.columns if columns is None else columns
    if save_to is not None:
        result[columns_to_select].to_csv(save_to, index=False)
    return result[columns_to_select]
In [87]:
get_data(
    '/project/uk-schools-clustering/data/derived/2016-2017_england.csv',
    columns=['urn', 'name', 'english first language', 'girls on roll',
             'english not first language', 'total income pp', 'total pupils on roll', 'on free meal',
             'idaci score', 'teacher headcount', 'boys on roll', 'mean salary fte', 'total expenditure pp',
             'income score', 'empl score', 'perc pupils meeting reading standard',
             'perc pupils meeting math standard', 'perc pupils meeting grammar standard', 'perc pupils meeting writing standard',
             'avg reading scaled score', 'avg grammar scaled score', 'avg math scaled score']
)
Out[87]:
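As a quick sanity check (a sketch, assuming the derived CSV was written to the path passed above), the saved file can be read back and inspected:
derived = pd.read_csv('/project/uk-schools-clustering/data/derived/2016-2017_england.csv')
derived.shape   # (number of schools kept, number of selected columns)
derived.head()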