In [120]:
import pandas as pd
import csv
In [147]:
audit_df = pd.read_csv(
'master_list_cleaned_wards.csv',
sep=",", low_memory=False,
converters={'geo_ward_num_clean': str}
)
ilifa_df = pd.read_csv(
'kzn_nw_cleaned_wards.csv',
sep=",", low_memory=False,
converters={'geo_ward_num_clean': str}
)
In [236]:
# audit_df['geo_ward_num_clean'] = audit_df['geo_ward_num_clean'].astype(str)
# ilifa_df['geo_ward_num_clean'] = ilifa_df['geo_ward_num_clean'].astype(str)
In [148]:
# Calculate total children(age 3-5) enrolled, and write values to a new column
# Note column names are incorrect in that they refer to children present,
# though in reality they refer to children enrolled.
enrolled_cols = audit_df.loc[:, audit_df.columns[157:187]].columns
audit_df.loc[:,'children_enrolled_age_3_to_5'] = audit_df.loc[:,(enrolled_cols)].sum(axis=1)
enrolled_cols = ilifa_df.loc[:, ilifa_df.columns[159:189]].columns
ilifa_df.loc[:,'children_enrolled_age_3_to_5'] = ilifa_df.loc[:,(enrolled_cols)].sum(axis=1)
In [149]:
# Calculate total practitioners per child(age 3-5), and write values to a new column
practitioner_cols = audit_df.iloc[:, [246, 248, 250]].columns
audit_df.loc[:,'practitioners_for_ages_3_to_5'] = audit_df.loc[:,(practitioner_cols)].sum(axis=1)
practitioner_cols = ilifa_df.iloc[:, [248, 250, 252]].columns
ilifa_df.loc[:,'practitioners_for_ages_3_to_5'] = ilifa_df.loc[:,(practitioner_cols)].sum(axis=1)
In [150]:
filtered_audit_df = audit_df[
(audit_df.complete == 'Yes') &
(audit_df.geo_ward_num_clean != "") &
(len(ilifa_df.geo_ward_num_clean) != 8,) &
~(audit_df.geo_ward_num_clean.str.contains('na')) &
~(audit_df.province.isin(['KZN', 'NW']))]
filtered_ilifa_df = ilifa_df[
(ilifa_df.complete == 1) &
(ilifa_df.geo_ward_num_clean.str != "",) &
~(audit_df.geo_ward_num_clean.str.contains('na'))
]
In [151]:
filtered_ilifa_df = ilifa_df[
(ilifa_df.complete == 1) &
(len(ilifa_df.geo_ward_num_clean) != 8,)
# ~(audit_df.geo_ward_num_clean.str.contains('na'))
]
In [92]:
filtered_audit_df[filtered_audit_df.geo_ward_num_clean.isnull()]
Out[92]:
In [79]:
# These need to fixed.
# audit_df[(audit_df.complete == 'Yes') & (audit_df.geo_ward_num_clean.notnull()) & (audit_df.geo_local_muni_code.isnull())]
In [152]:
# Remap the two dataframes to have matching columns
audit_df_cols = [
'ID',
'geo_local_muni_province_code',
'geo_local_muni_district_code',
'geo_local_muni_code',
'geo_ward_num_clean',
'children_enrolled_age_3_to_5',
'registration_funding_status',
'practitioners_for_ages_3_to_5',
'type_of_site',
'has_grade_r',
'no_children_grant_csg'
]
ilifa_df_cols = [
'ID',
'PROVINCE',
'DISTRICT CODE',
'LOCAL CODE',
'geo_ward_num_clean',
'children_enrolled_age_3_to_5',
'registration_funding_status',
'practitioners_for_ages_3_to_5',
'type_of_site',
'has_grade_r',
'0_children_grant_csg'
]
sub_audit_df = filtered_audit_df.loc[:,(audit_df_cols)]
sub_ilifa_df = filtered_ilifa_df.loc[:,(audit_df_cols)]
sub_audit_df.columns = [
'id', 'province', 'district', 'municipality', 'ward',
'children_enrolled_age_3_to_5',
'registration_status_of_centre',
'practitioners_for_ages_3_to_5',
'type_of_centre',
'centres_with_grade_r_learners',
'child_support_grant']
sub_ilifa_df.columns = [
'id', 'province', 'district', 'municipality', 'ward',
'children_enrolled_age_3_to_5',
'registration_status_of_centre',
'practitioners_for_ages_3_to_5',
'type_of_centre',
'centres_with_grade_r_learners',
'child_support_grant']
In [252]:
# Spot check results
# sub_audit_df.apply(lambda x: sum(x.isnull()),axis=0)
# sub_ilifa_df.apply(lambda x: sum(x.isnull()),axis=0)
In [153]:
# Combine the datasets
combined_df = sub_audit_df.append(sub_ilifa_df)
combined_df = combined_df[(combined_df.ward != '')]
combined_df = combined_df.reset_index(drop=True)
In [15]:
# Children enrolled
geo_levels = [
'province',
'district',
'municipality',
'ward'
]
df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])
for geo_level in geo_levels:
geo_level_df = combined_df.loc[:, [geo_level, 'children_enrolled_age_3_to_5']].groupby(geo_level).sum()
geo_level_df['geo_level'] = geo_level
geo_level_df['geo_code'] = geo_level_df.index
geo_level_df.reset_index(inplace=True, drop=True)
df_dict[geo_level] = geo_level_df
# geo_level_df.rename(columns={col:'geo_code'})
# enrolled_by_geolevel[geo_level]['geo_code'] = enrolled_by_geolevel[geo_level].index
# enrolled_by_geolevel[geo_level].reset_index(inplace=True, drop=True)
csv_df = pd.concat([df for df in df_dict.itervalues()])
# Append row for country results
country = {
'geo_level': 'country',
'geo_code': 'ZA',
'children_enrolled_age_3_to_5': combined_df['children_enrolled_age_3_to_5'].sum()
}
country_df = pd.DataFrame([country])
csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)
csv_df.loc[csv_df['geo_level'] == 'country']
csv_df['children_enrolled_age_3_to_5'].fillna(0, inplace=True)
In [19]:
csv_df['children_enrolled_age_3_to_5'] = csv_df['children_enrolled_age_3_to_5'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'children_enrolled_age_3_to_5']
csv_df.to_csv('total_ecd_children_enrolled.csv', columns=headers, index=False)
In [85]:
# ECD centres by type
geo_levels = [
'province',
'district',
'municipality',
'ward',
]
df_dict = {}
for geo_level in geo_levels:
geo_level_df = combined_df.reset_index().pivot_table(index=[geo_level], columns=['type_of_centre'], values='index', aggfunc='count')
geo_level_df['geo_level'] = geo_level
geo_level_df['geo_code'] = geo_level_df.index
geo_level_df.reset_index(inplace=True, drop=True)
df_dict[geo_level] = geo_level_df
csv_df = pd.concat([df for df in df_dict.itervalues()])
centre_type_cols = ['community_based', 'home_based', 'school_based', 'other', 'not_specified']
csv_df.rename(columns={
'(Please make a selection)': 'not_specified',
'Community-based': 'community_based',
'Home-based': 'home_based',
'School-based': 'school_based',
'Other': 'other'}, inplace=True)
csv_df.fillna(0, inplace=True)
csv_df.loc[:,'total_ecd_centres'] = csv_df.loc[:,(centre_type_cols)].sum(axis=1)
# Append row for country results
centre_type_cols.append('total_ecd_centres')
country = {
'geo_level': 'country',
'geo_code': 'ZA'}
for col in centre_type_cols:
country[col] = csv_df[col].sum()
country_df = pd.DataFrame([country])
csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)
csv_df.loc[csv_df['geo_level'] == 'country']
Out[85]:
In [86]:
for col in centre_type_cols:
csv_df[col] = csv_df[col].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code'] + centre_type_cols
csv_df.to_csv('centres_by_type.csv', columns=headers, index=False)
In [98]:
# ECD centres by registration status
geo_levels = [
'province',
'district',
'municipality',
'ward',
]
df_dict = {}
for geo_level in geo_levels:
geo_level_df = combined_df.reset_index().pivot_table(index=[geo_level], columns=['registration_status_of_centre'], values='index', aggfunc='count')
geo_level_df['geo_level'] = geo_level
geo_level_df['geo_code'] = geo_level_df.index
geo_level_df.reset_index(inplace=True, drop=True)
df_dict[geo_level] = geo_level_df
csv_df = pd.concat([df for df in df_dict.itervalues()])
centre_type_cols = [
'registered', 'registration_conditional', 'registration_in_process',
'registration_incomplete-access_denied', 'registration_incomplete-closed', 'registration_incomplete-not_found',
'not_registered', 'unspecified']
csv_df.rename(columns={
'(Please make a selection)': 'unspecified',
'Conditional': 'registration_conditional',
'Full': 'registered',
'INCOMPLETE - ACCESS DENIED': 'registration_incomplete-access_denied',
'INCOMPLETE - CLOSED': 'registration_incomplete-closed',
'INCOMPLETE - NOT FOUND': 'registration_incomplete-not_found',
'In process': 'registration_in_process',
'Not Registered': 'not_registered',
}, inplace=True)
csv_df.fillna(0, inplace=True)
csv_df.loc[:,'total_ecd_centres'] = csv_df.loc[:,(centre_type_cols)].sum(axis=1)
# Append row for country results
centre_type_cols.append('total_ecd_centres')
country = {
'geo_level': 'country',
'geo_code': 'ZA'}
for col in centre_type_cols:
country[col] = csv_df[col].sum()
country_df = pd.DataFrame([country])
csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)
csv_df.loc[csv_df['geo_level'] == 'country']
Out[98]:
In [99]:
for col in centre_type_cols:
csv_df[col] = csv_df[col].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code'] + centre_type_cols
csv_df.to_csv('centres_by_registration.csv', columns=headers, index=False)
In [116]:
# Practitioners per child aged 3-5
geo_levels = [
'province',
'district',
'municipality',
'ward'
]
df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])
for geo_level in geo_levels:
geo_level_df = combined_df.loc[:, [geo_level, 'practitioners_for_ages_3_to_5']].groupby(geo_level).sum()
geo_level_df['geo_level'] = geo_level
geo_level_df['geo_code'] = geo_level_df.index
geo_level_df.reset_index(inplace=True, drop=True)
df_dict[geo_level] = geo_level_df
# Combine the geo_levels into a single dataframe
csv_df = pd.concat([df for df in df_dict.itervalues()])
# Sum column and append results as row for country results
country = {
'geo_level': 'country',
'geo_code': 'ZA',
'practitioners_for_ages_3_to_5': combined_df['practitioners_for_ages_3_to_5'].sum()
}
country_df = pd.DataFrame([country])
csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)
csv_df.loc[csv_df['geo_level'] == 'country']
csv_df['practitioners_for_ages_3_to_5'].fillna(0, inplace=True)
In [117]:
csv_df['practitioners_for_ages_3_to_5'] = csv_df['practitioners_for_ages_3_to_5'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'practitioners_per_child_aged_3_to_5']
csv_df.to_csv('ecd_practitioners.csv', columns=headers, index=False)
In [118]:
# Practitioners per child aged 3-5
geo_levels = [
'province',
'district',
'municipality',
'ward'
]
df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])
for geo_level in geo_levels:
geo_level_df = combined_df.loc[:, [geo_level, 'centres_with_grade_r_learners']].groupby(geo_level).count()
geo_level_df['geo_level'] = geo_level
geo_level_df['geo_code'] = geo_level_df.index
geo_level_df.reset_index(inplace=True, drop=True)
df_dict[geo_level] = geo_level_df
# Combine the geo_levels into a single dataframe
csv_df = pd.concat([df for df in df_dict.itervalues()])
# Sum column and append results as row for country results
country = {
'geo_level': 'country',
'geo_code': 'ZA',
'centres_with_grade_r_learners': combined_df['centres_with_grade_r_learners'].count()
}
country_df = pd.DataFrame([country])
csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)
csv_df.loc[csv_df['geo_level'] == 'country']
csv_df['centres_with_grade_r_learners'].fillna(0, inplace=True)
In [119]:
csv_df['centres_with_grade_r_learners'] = csv_df['centres_with_grade_r_learners'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'centres_with_grade_r_learners']
csv_df.to_csv('ecd_grade_r.csv', columns=headers, index=False)
In [179]:
# Practitioners per child aged 3-5
combined_df['child_support_grant'].fillna(0, inplace=True)
geo_levels = [
'province',
'district',
'municipality',
'ward'
]
df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])
for geo_level in geo_levels:
geo_level_df = combined_df.loc[:, [geo_level, 'child_support_grant']].groupby(geo_level).sum()
geo_level_df['geo_level'] = geo_level
geo_level_df['geo_code'] = geo_level_df.index
geo_level_df.reset_index(inplace=True, drop=True)
df_dict[geo_level] = geo_level_df
# Combine the geo_levels into a single dataframe
csv_df = pd.concat([df for df in df_dict.itervalues()])
# Sum column and append results as row for country results
country = {
'geo_level': 'country',
'geo_code': 'ZA',
'child_support_grant': combined_df['child_support_grant'].sum()
}
country_df = pd.DataFrame([country])
csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)
csv_df.loc[csv_df['geo_level'] == 'country']
# csv_df['child_support_grant'].fillna(0, inplace=True)
Out[179]:
In [180]:
csv_df['child_support_grant'] = csv_df['child_support_grant'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'child_support_grant']
csv_df.to_csv('ecd_grants.csv', columns=headers, index=False)
In [167]:
csv_df.dtypes
Out[167]:
In [163]:
combined_df.dtypes
Out[163]:
In [176]:
test_df.loc[:, ['province', 'child_support_grant']].groupby('province').sum()
Out[176]:
In [175]:
test_df = combined_df[
(combined_df.child_support_grant.notnull())]
In [ ]: