In [120]:
import pandas as pd
import csv

In [147]:
audit_df = pd.read_csv(
    'master_list_cleaned_wards.csv',
    sep=",", low_memory=False,
    converters={'geo_ward_num_clean': str}
)
ilifa_df = pd.read_csv(
    'kzn_nw_cleaned_wards.csv',
    sep=",", low_memory=False,
    converters={'geo_ward_num_clean': str}
)

In [236]:
# audit_df['geo_ward_num_clean'] = audit_df['geo_ward_num_clean'].astype(str)
# ilifa_df['geo_ward_num_clean'] = ilifa_df['geo_ward_num_clean'].astype(str)

In [148]:
# Calculate total children(age 3-5) enrolled, and write values to a new column
# Note column names are incorrect in that they refer to children present,
# though in reality they refer to children enrolled.

enrolled_cols = audit_df.loc[:, audit_df.columns[157:187]].columns
audit_df.loc[:,'children_enrolled_age_3_to_5'] = audit_df.loc[:,(enrolled_cols)].sum(axis=1)

enrolled_cols = ilifa_df.loc[:, ilifa_df.columns[159:189]].columns
ilifa_df.loc[:,'children_enrolled_age_3_to_5'] = ilifa_df.loc[:,(enrolled_cols)].sum(axis=1)

In [149]:
# Calculate total practitioners per child(age 3-5), and write values to a new column

practitioner_cols = audit_df.iloc[:, [246, 248, 250]].columns
audit_df.loc[:,'practitioners_for_ages_3_to_5'] = audit_df.loc[:,(practitioner_cols)].sum(axis=1)

practitioner_cols = ilifa_df.iloc[:, [248, 250, 252]].columns
ilifa_df.loc[:,'practitioners_for_ages_3_to_5'] = ilifa_df.loc[:,(practitioner_cols)].sum(axis=1)

In [150]:
filtered_audit_df = audit_df[
    (audit_df.complete == 'Yes') &
    (audit_df.geo_ward_num_clean != "") &
    (len(ilifa_df.geo_ward_num_clean) != 8,) &
    ~(audit_df.geo_ward_num_clean.str.contains('na')) &
    ~(audit_df.province.isin(['KZN', 'NW']))]

filtered_ilifa_df = ilifa_df[
    (ilifa_df.complete == 1) &
    (ilifa_df.geo_ward_num_clean.str != "",) &
    ~(audit_df.geo_ward_num_clean.str.contains('na'))
]

In [151]:
filtered_ilifa_df = ilifa_df[
    (ilifa_df.complete == 1) &
    (len(ilifa_df.geo_ward_num_clean) != 8,)
#     ~(audit_df.geo_ward_num_clean.str.contains('na'))
]

In [92]:
filtered_audit_df[filtered_audit_df.geo_ward_num_clean.isnull()]


Out[92]:
province ID date start end started complete signature signature_interviewee gps_location ... local_muni_clean geo_local_muni_name geo_local_muni_code geo_local_muni_ward_prefix geo_local_muni_district_code geo_local_muni_province_code geo_local_muni_province_number geo_ward_num_clean children_enrolled_age_3_to_5 practitioners_per_child_aged_3_to_5

0 rows × 537 columns


In [79]:
# These need to fixed.
# audit_df[(audit_df.complete == 'Yes')  & (audit_df.geo_ward_num_clean.notnull()) & (audit_df.geo_local_muni_code.isnull())]

In [152]:
# Remap the two dataframes to have matching columns

audit_df_cols = [
    'ID',
    'geo_local_muni_province_code',
    'geo_local_muni_district_code',
    'geo_local_muni_code',
    'geo_ward_num_clean',
    'children_enrolled_age_3_to_5',
    'registration_funding_status',
    'practitioners_for_ages_3_to_5',
    'type_of_site',
    'has_grade_r',
    'no_children_grant_csg'
]
 
ilifa_df_cols = [
    'ID',
    'PROVINCE',
    'DISTRICT CODE',
    'LOCAL CODE',
    'geo_ward_num_clean',
    'children_enrolled_age_3_to_5',
    'registration_funding_status',
    'practitioners_for_ages_3_to_5',
    'type_of_site',
    'has_grade_r',
    '0_children_grant_csg'
]

sub_audit_df = filtered_audit_df.loc[:,(audit_df_cols)]
sub_ilifa_df = filtered_ilifa_df.loc[:,(audit_df_cols)]

sub_audit_df.columns = [
    'id', 'province', 'district', 'municipality', 'ward',
    'children_enrolled_age_3_to_5',
    'registration_status_of_centre',
    'practitioners_for_ages_3_to_5',
    'type_of_centre',
    'centres_with_grade_r_learners',
    'child_support_grant']
sub_ilifa_df.columns = [
    'id', 'province', 'district', 'municipality', 'ward',
    'children_enrolled_age_3_to_5',
    'registration_status_of_centre',
    'practitioners_for_ages_3_to_5',
    'type_of_centre',
    'centres_with_grade_r_learners',
    'child_support_grant']

In [252]:
# Spot check results
# sub_audit_df.apply(lambda x: sum(x.isnull()),axis=0)
# sub_ilifa_df.apply(lambda x: sum(x.isnull()),axis=0)

In [153]:
# Combine the datasets

combined_df = sub_audit_df.append(sub_ilifa_df)
combined_df = combined_df[(combined_df.ward != '')]
combined_df = combined_df.reset_index(drop=True)

In [15]:
# Children enrolled

geo_levels = [
    'province',
    'district',
    'municipality',
    'ward'
]

df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])

for geo_level in geo_levels:
    geo_level_df = combined_df.loc[:, [geo_level, 'children_enrolled_age_3_to_5']].groupby(geo_level).sum()
    geo_level_df['geo_level'] = geo_level
    geo_level_df['geo_code'] = geo_level_df.index
    geo_level_df.reset_index(inplace=True, drop=True)
    
    df_dict[geo_level] = geo_level_df
#     geo_level_df.rename(columns={col:'geo_code'})
    
    

#     enrolled_by_geolevel[geo_level]['geo_code'] = enrolled_by_geolevel[geo_level].index
#     enrolled_by_geolevel[geo_level].reset_index(inplace=True, drop=True)
    
csv_df = pd.concat([df for df in df_dict.itervalues()])

# Append row for country results
country = {
    'geo_level': 'country',
    'geo_code': 'ZA',
    'children_enrolled_age_3_to_5': combined_df['children_enrolled_age_3_to_5'].sum()
}

country_df = pd.DataFrame([country])

csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)

csv_df.loc[csv_df['geo_level'] == 'country']
csv_df['children_enrolled_age_3_to_5'].fillna(0, inplace=True)

In [19]:
csv_df['children_enrolled_age_3_to_5'] = csv_df['children_enrolled_age_3_to_5'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'children_enrolled_age_3_to_5']
csv_df.to_csv('total_ecd_children_enrolled.csv', columns=headers, index=False)

In [85]:
# ECD centres by type

geo_levels = [
    'province',
    'district',
    'municipality',
    'ward',
]

df_dict = {}

for geo_level in geo_levels:
    geo_level_df = combined_df.reset_index().pivot_table(index=[geo_level], columns=['type_of_centre'], values='index', aggfunc='count')
    geo_level_df['geo_level'] = geo_level
    geo_level_df['geo_code'] = geo_level_df.index
    geo_level_df.reset_index(inplace=True, drop=True)
    
    df_dict[geo_level] = geo_level_df
    
csv_df = pd.concat([df for df in df_dict.itervalues()])

centre_type_cols = ['community_based', 'home_based', 'school_based', 'other', 'not_specified']
csv_df.rename(columns={
    '(Please make a selection)': 'not_specified',
    'Community-based': 'community_based',
    'Home-based': 'home_based',
    'School-based': 'school_based',
    'Other': 'other'}, inplace=True)

csv_df.fillna(0, inplace=True)
csv_df.loc[:,'total_ecd_centres'] = csv_df.loc[:,(centre_type_cols)].sum(axis=1)

# Append row for country results

centre_type_cols.append('total_ecd_centres')

country = {
    'geo_level': 'country',
    'geo_code': 'ZA'}

for col in centre_type_cols:
    country[col] = csv_df[col].sum()

country_df = pd.DataFrame([country])

csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)

csv_df.loc[csv_df['geo_level'] == 'country']


Out[85]:
community_based geo_code geo_level home_based not_specified other school_based total_ecd_centres
3890 46267 ZA country 15728 4786 1248 7120 75149

In [86]:
for col in centre_type_cols:
    csv_df[col] = csv_df[col].map('{:.0f}'.format)
    
headers = ['geo_level', 'geo_code'] + centre_type_cols
csv_df.to_csv('centres_by_type.csv', columns=headers, index=False)

In [98]:
# ECD centres by registration status

geo_levels = [
    'province',
    'district',
    'municipality',
    'ward',
]

df_dict = {}

for geo_level in geo_levels:
    geo_level_df = combined_df.reset_index().pivot_table(index=[geo_level], columns=['registration_status_of_centre'], values='index', aggfunc='count')
    geo_level_df['geo_level'] = geo_level
    geo_level_df['geo_code'] = geo_level_df.index
    geo_level_df.reset_index(inplace=True, drop=True)
    
    df_dict[geo_level] = geo_level_df
    
csv_df = pd.concat([df for df in df_dict.itervalues()])

centre_type_cols = [
    'registered', 'registration_conditional', 'registration_in_process',
    'registration_incomplete-access_denied', 'registration_incomplete-closed', 'registration_incomplete-not_found',
    'not_registered', 'unspecified']
csv_df.rename(columns={
    '(Please make a selection)': 'unspecified',
    'Conditional': 'registration_conditional',
    'Full': 'registered',
    'INCOMPLETE - ACCESS DENIED': 'registration_incomplete-access_denied',
    'INCOMPLETE - CLOSED': 'registration_incomplete-closed',
    'INCOMPLETE - NOT FOUND': 'registration_incomplete-not_found',
    'In process': 'registration_in_process',
    'Not Registered': 'not_registered',
    }, inplace=True)

csv_df.fillna(0, inplace=True)
csv_df.loc[:,'total_ecd_centres'] = csv_df.loc[:,(centre_type_cols)].sum(axis=1)

# Append row for country results

centre_type_cols.append('total_ecd_centres')

country = {
    'geo_level': 'country',
    'geo_code': 'ZA'}

for col in centre_type_cols:
    country[col] = csv_df[col].sum()

country_df = pd.DataFrame([country])

csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)

csv_df.loc[csv_df['geo_level'] == 'country']


Out[98]:
geo_code geo_level not_registered registered registration_conditional registration_in_process registration_incomplete-access_denied registration_incomplete-closed registration_incomplete-not_found total_ecd_centres unspecified
3897 ZA country 21226 31878 7584 10297 1074 2645 300 76052 1048

In [99]:
for col in centre_type_cols:
    csv_df[col] = csv_df[col].map('{:.0f}'.format)
    
headers = ['geo_level', 'geo_code'] + centre_type_cols
csv_df.to_csv('centres_by_registration.csv', columns=headers, index=False)

In [116]:
# Practitioners per child aged 3-5

geo_levels = [
    'province',
    'district',
    'municipality',
    'ward'
]

df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])

for geo_level in geo_levels:
    geo_level_df = combined_df.loc[:, [geo_level, 'practitioners_for_ages_3_to_5']].groupby(geo_level).sum()
    geo_level_df['geo_level'] = geo_level
    geo_level_df['geo_code'] = geo_level_df.index
    geo_level_df.reset_index(inplace=True, drop=True)
    
    df_dict[geo_level] = geo_level_df

# Combine the geo_levels into a single dataframe
csv_df = pd.concat([df for df in df_dict.itervalues()])

# Sum column and append results as row for country results
country = {
    'geo_level': 'country',
    'geo_code': 'ZA',
    'practitioners_for_ages_3_to_5': combined_df['practitioners_for_ages_3_to_5'].sum()
}

country_df = pd.DataFrame([country])

csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)

csv_df.loc[csv_df['geo_level'] == 'country']
csv_df['practitioners_for_ages_3_to_5'].fillna(0, inplace=True)

In [117]:
csv_df['practitioners_for_ages_3_to_5'] = csv_df['practitioners_for_ages_3_to_5'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'practitioners_per_child_aged_3_to_5']
csv_df.to_csv('ecd_practitioners.csv', columns=headers, index=False)

In [118]:
# Practitioners per child aged 3-5

geo_levels = [
    'province',
    'district',
    'municipality',
    'ward'
]

df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])

for geo_level in geo_levels:
    geo_level_df = combined_df.loc[:, [geo_level, 'centres_with_grade_r_learners']].groupby(geo_level).count()
    geo_level_df['geo_level'] = geo_level
    geo_level_df['geo_code'] = geo_level_df.index
    geo_level_df.reset_index(inplace=True, drop=True)
    
    df_dict[geo_level] = geo_level_df

# Combine the geo_levels into a single dataframe
csv_df = pd.concat([df for df in df_dict.itervalues()])

# Sum column and append results as row for country results
country = {
    'geo_level': 'country',
    'geo_code': 'ZA',
    'centres_with_grade_r_learners': combined_df['centres_with_grade_r_learners'].count()
}

country_df = pd.DataFrame([country])

csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)

csv_df.loc[csv_df['geo_level'] == 'country']
csv_df['centres_with_grade_r_learners'].fillna(0, inplace=True)

In [119]:
csv_df['centres_with_grade_r_learners'] = csv_df['centres_with_grade_r_learners'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'centres_with_grade_r_learners']
csv_df.to_csv('ecd_grade_r.csv', columns=headers, index=False)

In [179]:
# Practitioners per child aged 3-5
combined_df['child_support_grant'].fillna(0, inplace=True)
geo_levels = [
    'province',
    'district',
    'municipality',
    'ward'
]

df_dict = {}
# enrolled_df = pd.DataFrame(columns=['geo_level', 'children_enrolled_total'])

for geo_level in geo_levels:
    geo_level_df = combined_df.loc[:, [geo_level, 'child_support_grant']].groupby(geo_level).sum()
    geo_level_df['geo_level'] = geo_level
    geo_level_df['geo_code'] = geo_level_df.index
    geo_level_df.reset_index(inplace=True, drop=True)
    
    df_dict[geo_level] = geo_level_df

# Combine the geo_levels into a single dataframe
csv_df = pd.concat([df for df in df_dict.itervalues()])

# Sum column and append results as row for country results
country = {
    'geo_level': 'country',
    'geo_code': 'ZA',
    'child_support_grant': combined_df['child_support_grant'].sum()
}

country_df = pd.DataFrame([country])

csv_df = csv_df.append(country_df)
csv_df.reset_index(inplace=True, drop=True)

csv_df.loc[csv_df['geo_level'] == 'country']
# csv_df['child_support_grant'].fillna(0, inplace=True)


Out[179]:
child_support_grant geo_code geo_level
3901 497409 ZA country

In [180]:
csv_df['child_support_grant'] = csv_df['child_support_grant'].map('{:.0f}'.format)
headers = ['geo_level', 'geo_code', 'child_support_grant']
csv_df.to_csv('ecd_grants.csv', columns=headers, index=False)

In [167]:
csv_df.dtypes


Out[167]:
child_support_grant    object
geo_code               object
geo_level              object
dtype: object

In [163]:
combined_df.dtypes


Out[163]:
id                                object
province                          object
district                          object
municipality                      object
ward                              object
children_enrolled_age_3_to_5     float64
registration_status_of_centre     object
practitioners_for_ages_3_to_5    float64
type_of_centre                    object
centres_with_grade_r_learners     object
child_support_grant              float64
dtype: object

In [176]:
test_df.loc[:, ['province', 'child_support_grant']].groupby('province').sum()


Out[176]:
child_support_grant
province
EC 64970
FS 66765
GT 79707
KZN 0
LIM 133572
MP 72196
NC 18222
NW 967
WC 60956

In [175]:
test_df = combined_df[
    (combined_df.child_support_grant.notnull())]

In [ ]: