In [10]:
import urllib.request
import pprint
import json
import re
from openpyxl import load_workbook
from elasticsearch import Elasticsearch
es = Elasticsearch(port=9500)
sentenceSplit = re.compile('(Pg\. ([0-9]{1,3})\/([0-9]{1,3})): ((.|.\n)*(\n\n|$))')
data = load_workbook('WHSDATA.xlsx')['Sheet1']
pp = pprint.PrettyPrinter(indent=4)
skip = True
documents = []
count = 0;
#target = open("WHS.json", 'w')
for cell in data.rows:
if(skip):
skip = False
continue
doc = dict()
doc['Name'] = urllib.request.unquote(cell[0].value)
texts = list()
for i in range(1, 69):
if len(cell[i].value) > 10:
texts.append({'text': cell[i].value, 'type': data.columns[i][0].value })
for i in range(70, 148):
header = data.columns[i][0].value
field = str.split(header, ":")
title = field[0]
if cell[i].value is not None:
value = True if len(field) == 1 else field[1].strip()
doc[title] = value
else:
if len(field) == 1:
value = False
doc[title] = value
for text in texts:
doc['text'] = text['text']
doc['type'] = text['type']
#es.index(index="whs2", doc_type='document', body=doc)
for sent in re.findall(sentenceSplit, text['text']):
doc['page'] = int(sent[1])
doc['total'] = int(sent[2])
doc['text'] = sent[3]
es.index(index="whs3", doc_type='document', body=doc)
print(doc["name"], len(texts));
#json.dump(documents, target)
print("done")
In [ ]: