In [10]:
import urllib.request
import pprint
import json
import re
from openpyxl import load_workbook
from elasticsearch import Elasticsearch
es = Elasticsearch(port=9500)
sentenceSplit = re.compile('(Pg\. ([0-9]{1,3})\/([0-9]{1,3})): ((.|.\n)*(\n\n|$))')

data = load_workbook('WHSDATA.xlsx')['Sheet1']
pp = pprint.PrettyPrinter(indent=4)
skip = True
documents = []
count = 0;
#target = open("WHS.json", 'w')
for cell in data.rows:
    if(skip):
        skip = False
        continue
    doc = dict()
    doc['Name'] = urllib.request.unquote(cell[0].value)
    
    texts = list()
    
    for i in range(1, 69):
        if len(cell[i].value) > 10:
            texts.append({'text': cell[i].value, 'type': data.columns[i][0].value })
    
    for i in range(70, 148):
        header = data.columns[i][0].value
        field = str.split(header, ":")
        title = field[0]
        if cell[i].value is not None:
            value = True if len(field) == 1 else field[1].strip()
            doc[title] = value
        else:
            if len(field) == 1:
                value = False
                doc[title] = value
    for text in texts:
        doc['text'] = text['text']
        doc['type'] = text['type']
        #es.index(index="whs2", doc_type='document', body=doc)
        for sent in re.findall(sentenceSplit, text['text']):
            doc['page'] = int(sent[1])
            doc['total'] = int(sent[2])
            doc['text'] = sent[3]
            es.index(index="whs3", doc_type='document', body=doc)
    print(doc["name"], len(texts));
#json.dump(documents, target)
print("done")


done

In [ ]: