In [13]:
import pandas as pd
import json
import re

In [2]:
!ls ../data


'Data for Akshada monitoring tool'
 Indicators-RCH-Category-Structure.csv
 Indicators-RCH-Data.csv
'Indicators-RCH-prgress-tracking_V2.xlsx - Apr.csv'
'Indicators-RCH-prgress-tracking_V2.xlsx - Jul.csv'
'Indicators-RCH-prgress-tracking_V2.xlsx - Jun.csv'
'Indicators-RCH-prgress-tracking_V2.xlsx - May.csv'
 shc_data.json

In [3]:
apr_data = pd.read_csv('../data/Indicators-RCH-prgress-tracking_V2.xlsx - Apr.csv')
may_data = pd.read_csv('../data/Indicators-RCH-prgress-tracking_V2.xlsx - May.csv')

In [4]:
apr_data.head()


Out[4]:
S.No Block Sector SHC Unit Type Target Pregnant Women Target Live Birth Traget Full Immunization ANC registered Total Number of pregnant registered  before 12 weeks against total number of ANC ... %.7 Total number of new borns received 6 post natal HBNC visit Percentage of new borns received 6 post natal HBNC visit % Given zero dose to children against total live birth %.8 Children fully immunized Percentage of children fully immunized Unnamed: 38 IMR MMR
0 1 Kishanganj Bajranggarh Gobarcha SHC 6 5 5 6 5 ... #DIV/0! 5 NaN 0 #DIV/0! 5 100.00 NaN NaN NaN
1 2 Kishanganj Bajranggarh Kheruna SHC 5 5 5 9 4 ... #DIV/0! 0 NaN 0 #DIV/0! 7 140.00 NaN NaN NaN
2 3 Kishanganj Bhanwargarh Basthooni SHC 4 3 3 4 0 ... #DIV/0! 3 NaN 0 #DIV/0! 4 133.33 NaN NaN NaN
3 4 Kishanganj Bhanwargarh Hirapur SHC 5 4 4 7 4 ... #DIV/0! 3 NaN 0 #DIV/0! 3 75.00 NaN NaN NaN
4 5 Kishanganj Bhanwargarh Peenjana SHC 7 6 6 7 1 ... #DIV/0! 4 NaN 0 #DIV/0! 7 116.67 NaN NaN NaN

5 rows × 41 columns


In [5]:
categories = pd.read_csv('../data/Indicators-RCH-Category-Structure.csv')

In [43]:
merged_data = pd.merge(left=apr_data, right=may_data, on=['Block', 'Sector', 'SHC'], suffixes=['_apr', '_may'])

In [72]:
[col for col in merged_data.columns if 'ANC registered' in col]


Out[72]:
['ANC registered_apr', 'ANC registered_may']

In [73]:
apr_data['ANC registered'].head()


Out[73]:
0    6
1    9
2    4
3    7
4    7
Name: ANC registered, dtype: int64

In [74]:
may_data['ANC registered'].head()


Out[74]:
0     2
1    16
2     0
3     7
4     5
Name: ANC registered, dtype: int64

In [75]:
merged_data[['ANC registered_apr', 'ANC registered_may']]


Out[75]:
ANC registered_apr ANC registered_may
0 6 2
1 9 16
2 4 0
3 7 7
4 7 5
5 16 7
6 12 14
7 3 6
8 4 5
9 3 11
10 4 5
11 7 14
12 7 8
13 7 6
14 10 5
15 7 13
16 8 9
17 7 10
18 4 3
19 10 6
20 7 5
21 7 3
22 13 3
23 12 3
24 7 7
25 16 7
26 3 12
27 8 6
28 11 3
29 16 0
30 9 5
31 2 17
32 6 3
33 7 1
34 9 5
35 0 3
36 7 2
37 17 14
38 9 7
39 18 8
40 17 18
41 13 10
42 7 8
43 20 19
44 15 18

In [80]:
class GenerateJsonData(object):
    '''Given a data file and config file in csv format generate the json to be used in story generator.
    
    The Config file should contain hierarchy information in each row, with columns as :-
        - Category = This is the highest level category that will be displayed on the left nav bar of story generator.
        - Sub Cateogry = These are the name of the options that will be displayed under a category.
        - Target Column = This is the map of the column from the `data` dataframe that needs to be used as values
        - unit = The unit with which the value needs to be extended.
    
    The data file should contain in this case some base columns `S.No.`, `Block`, `Sector`, `SHC` and all the
    column names mentioned in the `Target Column`
    '''
    
    def __init__(self, data_file_paths, config_file_path):
        '''
        Read and set the file paths
        
        Args:
            - data_file_path (`dict`:{'month, year': datafilepath}): Dictionary with keys
                as month, year and value as datafilepath.
            - config_file_path (string): path to the config file.
        '''
        self.data_file_path = data_file_paths
        self.config_file_path = config_file_path
        self.data = self.config = None
    
    def load_data(self):
        '''
        Load the config file and data file.
        
        Returns:
            None
        '''
        data = None
        last_month = None
        for data_source in self.data_file_path:
            if data is None:
                data = pd.read_csv(self.data_file_path[data_source])
                data.columns = [col.strip() for col in data.columns]
            else:
                new_data = pd.read_csv(self.data_file_path[data_source])
                new_data.columns = [col.strip() for col in new_data.columns]
                new_month = data_source.split(',')[0]
                data = pd.merge(left=data, right=new_data,
                                on=['Block', 'Sector', 'SHC'],
                                suffixes=['_{0}'.format(new_month), '_{0}'.format(last_month)])
            last_month = data_source.split(',')[0]
        self.data = data
        self.config = pd.read_csv(self.config_file_path)
        return None
    
    @staticmethod
    def generate_slug(string_val):
        '''
        Convert a string value to a slug.
        
        Arg:
            string_val (string): A string object to be slugged
        
        Returns:
            A slugged string.
        '''
        return re.sub('[^0-9a-zA-Z]+', '_', string_val.lower())


    def generate_json_data(self, output_file):
        if self.data is None or self.config_file_path is None:
            self.load_data()
        data = self.data.fillna(0)
        json_data = []
        meta_cols = ['SHC', 'Sector', 'Block']
        for category, group in self.config.groupby('Category'):
            category_slug = self.generate_slug(category)
            category_json = {'category_name': category,
                             'category_slug': category_slug, 
                             'sub_records': []}
            for _, row in group.iterrows():
                record = {}
                record['record_name'] = row['Sub Category']
                record['record_slug'] = self.generate_slug(row['Sub Category'])
                record['unit'] = row['unit']
                record['record_figures'] = []
                target_cols = []
                for data_source in self.data_file_path:
                    month = data_source.split(',')[0]
                    target_cols.append(row['Target Column'] + '_{0}'.format(month))
                cols = meta_cols + target_cols
                print(target_cols)
                for index, sub_row in data[cols].iterrows():
                    be = []
                    for data_source, target_col in zip(self.data_file_path, target_cols):
                        be.append({data_source: str(sub_row[target_col])})
                    figures = {'BE': be}
                    record['record_figures'].append({
                        'figures': figures,
                        'grpby_name': sub_row['SHC'],
                        'shc': sub_row['SHC'],
                        'block': sub_row['Block'],
                        'sector': sub_row['Sector']
                    })
                category_json['sub_records'].append(record)
            json_data.append(category_json)
        with open(output_file, 'w') as json_data_file:
            json.dump(json_data, json_data_file)

In [81]:
GenerateJsonData({'Apr, 2017-18': '../data/Indicators-RCH-prgress-tracking_V2.xlsx - Apr.csv',
                  'May, 2017-18': '../data/Indicators-RCH-prgress-tracking_V2.xlsx - May.csv',
                  'Jun, 2017-18': '../data/Indicators-RCH-prgress-tracking_V2.xlsx - Jun.csv',
                  'Jul, 2017-18': '../data/Indicators-RCH-prgress-tracking_V2.xlsx - Jul.csv'}, 
                 '../data/Indicators-RCH-Category-Structure.csv').generate_json_data('../data/timeseries.json')


['Target Pregnant Women_Apr', 'Target Pregnant Women_May', 'Target Pregnant Women_Jun', 'Target Pregnant Women_Jul']
['Total Number of pregnant registered\xa0 before 12 weeks against total number of ANC_Apr', 'Total Number of pregnant registered\xa0 before 12 weeks against total number of ANC_May', 'Total Number of pregnant registered\xa0 before 12 weeks against total number of ANC_Jun', 'Total Number of pregnant registered\xa0 before 12 weeks against total number of ANC_Jul']
['Percentage of pregnant registered  before 12 weeks against total number of ANC_Apr', 'Percentage of pregnant registered  before 12 weeks against total number of ANC_May', 'Percentage of pregnant registered  before 12 weeks against total number of ANC_Jun', 'Percentage of pregnant registered  before 12 weeks against total number of ANC_Jul']
['Number of pregnant women received 3 check ups_Apr', 'Number of pregnant women received 3 check ups_May', 'Number of pregnant women received 3 check ups_Jun', 'Number of pregnant women received 3 check ups_Jul']
['Percentage of pregnant women received 3 check ups_Apr', 'Percentage of pregnant women received 3 check ups_May', 'Percentage of pregnant women received 3 check ups_Jun', 'Percentage of pregnant women received 3 check ups_Jul']
['Children fully immunized_Apr', 'Children fully immunized_May', 'Children fully immunized_Jun', 'Children fully immunized_Jul']
['Percentage of children fully immunized_Apr', 'Percentage of children fully immunized_May', 'Percentage of children fully immunized_Jun', 'Percentage of children fully immunized_Jul']
['Total number of new borns received 6 post natal HBNC visit_Apr', 'Total number of new borns received 6 post natal HBNC visit_May', 'Total number of new borns received 6 post natal HBNC visit_Jun', 'Total number of new borns received 6 post natal HBNC visit_Jul']
['Percentage of new borns received 6 post natal HBNC visit_Apr', 'Percentage of new borns received 6 post natal HBNC visit_May', 'Percentage of new borns received 6 post natal HBNC visit_Jun', 'Percentage of new borns received 6 post natal HBNC visit_Jul']

In [ ]: