# In [9]:
import json
import pprint
import re
import urllib.request

from elasticsearch import Elasticsearch
from openpyxl import load_workbook
# Matches one page-tagged paragraph of the form "Pg. <page>/<total>: <body>".
# Group 2 is the page number, group 3 the page total and group 4 the body,
# which runs up to (and includes) the next blank line, or to the end of the
# string.  Written as a raw string so the backslash escapes reach the regex
# engine untouched instead of triggering invalid-escape warnings.
sentenceSplit = re.compile(r'(Pg\. ([0-9]{1,3})\/([0-9]{1,3})): ((.|.\n)*(\n\n|$))')

pp = pprint.PrettyPrinter(indent=4)

# Topic labels, looked up by the leading digit of a text column's header.
# Index 0 is an empty placeholder so that digit N maps to the entry "N - ...".
# NOTE(review): "protected crises" in entry 4 may be a typo for "protracted
# crises"; left untouched because the string is runtime data.
typeNames = [
    "",
    "1 - Ensuring affected people, particularly women, have a stronger voice and greater role in humanitarian action",
    "2 - Confronting international humanitarian law (“IHL”) violations and finding new ways to protect and assist people in conflict",
    "3 - Localizing preparedness and response",
    "4 - Developing new approaches for managing recurrent and protected crises",
    "5 - Adapting the humanitarian system to new contexts, actors, and challenges",
    "6 - Global action to address the finance gap",
    "7 - Creating an enabling environment and investment in innovation to better deal with current and future humanitarian challenges",
    "8 - Gender",
]

# Column layout of the sheet (0-based): column 0 is the document name,
# TEXT_COLUMNS hold free text and FLAG_COLUMNS hold marker cells.
# Column 69 is not read.
TEXT_COLUMNS = range(1, 69)
FLAG_COLUMNS = range(70, 148)
# A text cell is kept only when it is strictly longer than this.
MIN_TEXT_LENGTH = 10


def collect_texts(headers, values, columns=TEXT_COLUMNS, min_length=MIN_TEXT_LENGTH):
    """Return the non-trivial text cells of one row.

    headers -- header value of every column (first row of the sheet).
    values  -- cell values of the row being processed, same indexing.
    columns -- column indexes to inspect.
    min_length -- texts must be strictly longer than this to be kept.

    Each result is ``{'text': <cell value>, 'type': <column header>}``.
    Empty cells (``None``) and non-string values are skipped rather than
    crashing on ``len()``.
    """
    return [
        {'text': values[i], 'type': headers[i]}
        for i in columns
        if isinstance(values[i], str) and len(values[i]) > min_length
    ]


def collect_flags(headers, values, columns=FLAG_COLUMNS):
    """Translate the marker columns of one row into document fields.

    A header without a colon yields a boolean field: True when the cell is
    filled, False when it is empty.  A header of the form "Title: label"
    sets ``Title`` to the stripped label when the cell is filled and leaves
    the field alone when it is empty.  Columns whose header is not a string
    are skipped.
    """
    fields = {}
    for i in columns:
        header = headers[i]
        if not isinstance(header, str):
            continue
        parts = header.split(":")
        title = parts[0]
        if values[i] is not None:
            fields[title] = True if len(parts) == 1 else parts[1].strip()
        elif len(parts) == 1:
            fields[title] = False
    return fields


def split_pages(text):
    """Return ``(page, total, body)`` for every page-tagged paragraph in *text*."""
    return [
        (int(match[1]), int(match[2]), match[3])
        for match in sentenceSplit.findall(text)
    ]


# The guard keeps the workbook and Elasticsearch I/O from running when the
# helpers above are imported; in a notebook or as a script it runs as before.
if __name__ == "__main__":
    es = Elasticsearch(port=9500)
    data = load_workbook('WHSDATA.xlsx')['Sheet1']
    documents = []
    count = 0
    #target = open("WHS.json", 'w')

    # Read the header row once.  In current openpyxl releases
    # ``Worksheet.columns`` is a generator and cannot be indexed, so the
    # headers are taken from the first row instead of ``data.columns[i][0]``.
    rows = iter(data.rows)
    header_row = next(rows, None)
    headers = [c.value for c in header_row] if header_row is not None else []

    for cell in rows:
        values = [c.value for c in cell]
        if values[0] is None:
            # A row without a name cannot be unquoted; skip it.
            continue
        doc = {'Name': urllib.request.unquote(values[0])}
        texts = collect_texts(headers, values)
        doc.update(collect_flags(headers, values))
        for text in texts:
            doc['text'] = text['text']
            doc['subtopic'] = text['type']
            #es.index(index="whs2", doc_type='document', body=doc)
            for page, total, body in split_pages(text['text']):
                # The same dict is reused; each paragraph overwrites the
                # page-specific fields of the previous one.
                doc['page'] = page
                doc['total'] = total
                doc['text'] = body
                doc['topic'] = typeNames[int(doc["subtopic"][0])]
                #es.index(index="whs4", doc_type='document', body=doc)
        count += 1
        print(str(count), doc["Name"], "Size:" + str(len(texts)))
        # NOTE(review): stops after the first processed row -- looks like a
        # dry-run leftover (indexing above is commented out too); confirm
        # before removing.
        break
    #json.dump(documents, target)
    print("done")
# In [ ]: