In [9]:
import json
import pprint
import re
import urllib.parse
import urllib.request

from elasticsearch import Elasticsearch
from openpyxl import load_workbook
# Elasticsearch client configured for port 9500. Within this cell it is only
# referenced by the commented-out es.index(...) calls inside the row loop.
es = Elasticsearch(port=9500)
# Extracts page-tagged passages of the form "Pg. <page>/<total>: <text>".
# Capture groups (0-based positions in the tuples returned by findall):
#   0: the "Pg. <page>/<total>" prefix
#   1: page number (1-3 digits)
#   2: total page count (1-3 digits)
#   3: passage text, including the terminating blank line when present
#   4: last repetition of the inner (.|.\n) group (not used by this notebook)
#   5: the terminator itself: "\n\n" or the empty match at end of input
# A passage may span single newlines; it ends at the first blank line or at the
# end of the input. A raw string is used so that `\.` and `\/` reach the regex
# engine without being flagged as invalid string escapes.
sentenceSplit = re.compile(r'(Pg\. ([0-9]{1,3})\/([0-9]{1,3})): ((.|.\n)*(\n\n|$))')

# Worksheet 'Sheet1' of the WHS export; the row loop below iterates over it.
data = load_workbook('WHSDATA.xlsx')['Sheet1']
pp = pprint.PrettyPrinter(indent=4)

# Loop bookkeeping:
#   skip      - True until the first (header) row has been passed over
#   count     - number of rows processed so far
#   documents - accumulator; in this cell only the commented-out json.dump uses it
skip, count = True, 0
documents = list()

# Topic labels, indexed by topic number. Slot 0 is an empty placeholder so that
# the leading digit of a subtopic header can be used directly as the index
# (see the `typeNames[int(doc["subtopic"][0])]` lookup in the row loop).
# NOTE(review): entry 4 reads "protected crises" - possibly meant "protracted";
# left untouched because the string is emitted as data. Confirm before changing.
typeNames = [
    "",
    "1 - Ensuring affected people, particularly women, have a stronger voice and greater role in humanitarian action",
    "2 - Confronting international humanitarian law (“IHL”) violations and finding new ways to protect and assist people in conflict",
    "3 - Localizing preparedness and response",
    "4 - Developing new approaches for managing recurrent and protected crises",
    "5 - Adapting the humanitarian system to new contexts, actors, and challenges",
    "6 - Global action to address the finance gap",
    "7 - Creating an enabling environment and investment in innovation to better deal with current and future humanitarian challenges",
    "8 - Gender",
]

# Build one metadata document per spreadsheet row and split each of its text
# excerpts into page-tagged passages. The Elasticsearch / JSON writes are
# currently commented out, so the only visible effect is the progress print.
#target = open("WHS.json", 'w')

# The first row holds the column headers. Read it once up front instead of
# re-evaluating `data.columns[i][0]` for every cell: that lookup rebuilds the
# column view on each access and is not subscriptable on openpyxl versions
# where `columns` is a generator.
row_iter = iter(data.rows)
headers = [header_cell.value for header_cell in next(row_iter, ())]

for cell in row_iter:  # NB: `cell` is a whole row of cells
    doc = dict()
    # Column 0 holds the URL-quoted document name.
    doc['Name'] = urllib.parse.unquote(cell[0].value)

    texts = list()

    # Columns 1-68: free-text excerpts; the column header is kept as the
    # excerpt's subtopic. Empty cells come back as None (and numeric cells as
    # numbers), so only strings are length-checked.
    for i in range(1, 69):
        excerpt = cell[i].value
        if isinstance(excerpt, str) and len(excerpt) > 10:
            texts.append({'text': excerpt, 'type': headers[i]})

    # Columns 70-147: tag columns. A header is either "Title" (boolean flag)
    # or "Title: value" (the value is stored when the cell is filled in).
    for i in range(70, 148):
        field = headers[i].split(":")
        title = field[0]
        if cell[i].value is not None:
            doc[title] = True if len(field) == 1 else field[1].strip()
        elif len(field) == 1:
            # Unset flag columns are recorded explicitly as False; unset
            # "Title: value" columns leave the document untouched.
            doc[title] = False

    for text in texts:
        doc['text'] = text['text']
        doc['subtopic'] = text['type']
        #es.index(index="whs2", doc_type='document', body=doc)
        for sent in sentenceSplit.findall(text['text']):
            # sent = (prefix, page, total, passage, ...) - see sentenceSplit.
            doc['page'] = int(sent[1])
            doc['total'] = int(sent[2])
            doc['text'] = sent[3]
            # The subtopic header starts with its topic digit.
            doc['topic'] = typeNames[int(doc["subtopic"][0])]
            #es.index(index="whs4", doc_type='document', body=doc)
    count += 1
    print(str(count), doc["Name"], "Size:" + str(len(texts)))
    # NOTE(review): only the first data row is processed because of this
    # break - it looks like a debugging leftover; remove it to run the
    # whole sheet.
    break
#json.dump(documents, target)
print("done")


1 007-East Africa WHS consultation - Education in Emergencies.pdf Size:21
done

In [ ]: