In [18]:
# INPUT FOLDER - RAW DOCUMENTS ARE READ FROM HERE
RAW_DOCS_FOLDER = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Docs"
# OUTPUT FOLDER FOR DOCUMENT PROCESSING - USED IN REMAINING STEPS
DOCS_FOLDER = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs"
# REMOVE ALL FILES IN OUTPUT FOLDER FIRST (SO YOU CAN RE-RUN AT WILL)
EMPTY_OUTPUT_FOLDER = True
# REGEX (NOT A GLOB) SELECTING INPUT FILE NAMES. WRITTEN AS A RAW STRING SO
# THAT "\." REACHES THE REGEX ENGINE INSTEAD OF BEING AN INVALID STRING ESCAPE
FILE_MASK = r".*\.txt"
# WHEN TRUE, EACH DOCUMENT IS PASSED THROUGH parse_html BEFORE SENTENCE SPLITTING
PARSE_HTML = True
# DOCUMENTS SHORTER THAN THIS MANY CHARACTERS ARE SKIPPED (IT IS A MINIMUM)
FILE_SIZE_LIMIT_CHARS = 1000
In [11]:
#Shared
import re
# Matches any run of one or more whitespace characters. Raw string so that
# "\s" is a regex class rather than an invalid string escape.
re_collapse_spaces = re.compile(r"\s+")


def collapse_spaces(s):
    """Replace every run of whitespace in s with one space and trim both ends.

    s : string
        text to normalise
    Returns the normalised string.
    """
    return re_collapse_spaces.sub(" ", s).strip()
# Separator characters (punctuation plus whitespace) that clean_str turns into
# spaces. Written as a raw string so regex escapes are not read as string
# escapes. The hyphen is literal because it is last in the class, and the
# doubled backslash matches one literal backslash.
re1 = re.compile(r"[;:'\"*/),(|\s\\-]+")


def clean_str(s):
    """Normalise a sentence: drop possessive 's and turn separators into spaces.

    s : any
        value to clean; it is converted with str() first
    Returns the cleaned string: every run of separator characters or
    whitespace becomes a single space, with no leading or trailing space.
    """
    s = str(s).replace("'s", " ")
    # The substitution replaces each maximal run (whitespace included) with
    # exactly one space, so after trimming no further collapsing is needed.
    return re1.sub(" ", s).strip()
In [17]:
import os
import re
import time

from bs4 import BeautifulSoup, Comment
from nltk.tokenize import sent_tokenize
# Sentence terminator substituted for break-like HTML tags by pre_process_text
# and used by get_text to join text nodes.
REPL = ".\n"


def strip_non_ascii(text):
    """Return text without any character whose code point is 128 or higher."""
    return ''.join(i for i in text if ord(i) < 128)


# Tags rewritten to REPL, in replacement order. The adjacent "</li><li>" pair
# goes first so that it produces one break rather than two.
_BREAK_TAGS = (
    "</li><li>", "<li>", "</li>",
    "<br>", "<br/>", "<br />",
    "<p>", "<p/>", "<p />", "</p>",
)


# Make common html tags line breaks
def pre_process_text(txt):
    """Rewrite common list, line-break and paragraph tags as sentence breaks.

    txt : string
        raw markup; tags are matched exactly as spelled in _BREAK_TAGS
        (lower case only)
    Returns the text with those tags replaced by REPL, ". ." replaced by
    REPL, ".." rewritten as ". " and runs of spaces reduced to one space.
    """
    for tag in _BREAK_TAGS:
        txt = txt.replace(tag, REPL)
    txt = txt.replace(". .", REPL)
    while ".." in txt:
        txt = txt.replace("..", ". ")
    # Collapse runs of spaces. The loop must test for a *double* space:
    # testing for a single space never becomes false, so the loop would spin
    # forever on any text that contains a space.
    while "  " in txt:
        txt = txt.replace("  ", " ")
    return txt
def visible(element):
    """Decide whether a parsed text node should be kept as page text.

    element : bs4 text node
        a node returned by the parser; its parent tag name is inspected
    Returns False when the parent is named style, script, [document], head
    or title, or when the node is an HTML comment; True otherwise.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # bs4 represents comments as Comment nodes whose string value is only the
    # text between the delimiters, so matching '<!--.*-->' against that value
    # cannot recognise them. Test the node type instead.
    return not isinstance(element, Comment)
def get_text(html):
    """Parse html and return its visible text nodes joined with REPL.

    html : string
        markup to parse
    Returns one string made of every text node accepted by visible().
    """
    # NOTE(review): no parser is named here, so bs4 picks one from whatever is
    # installed; results for HTML fragments may differ between machines.
    # Confirm which parser this pipeline expects before pinning one.
    soup = BeautifulSoup(html)
    text_nodes = soup.find_all(text=True)
    return REPL.join(node for node in text_nodes if visible(node))
def parse_html(html):
    """Convert markup to text: rewrite break tags first, then extract the visible text.

    html : string
        raw markup
    Returns the result of get_text() applied to the pre-processed markup.
    """
    return get_text(pre_process_text(html))
def split_into_sentences(txt):
    """Split txt into cleaned sentences, dropping very short ones.

    txt : string
        document text; non-ASCII characters are removed before tokenising
    Returns a list of sentences, each passed through clean_str, keeping only
    those longer than 5 characters.
    """
    ascii_txt = strip_non_ascii(txt)
    cleaned = (clean_str(sent) for sent in sent_tokenize(ascii_txt))
    # Build a real list: a lazy map/filter chain can be consumed only once on
    # Python 3, so a second pass over the result would silently see nothing.
    return [sent for sent in cleaned if len(sent.strip()) > 5]
def find_files(folder, regex, remove_empty = False):
    """
    Find all files matching the [regex] pattern in [folder]

    folder : string
        folder to search (not recursive)
    regex : string (NOT regex object)
        pattern to match; applied to the bare file name with a
        case-insensitive search, so it matches anywhere in the name unless
        the pattern itself is anchored
    remove_empty : bool
        when True, zero-byte files are left out of the result

    Returns the absolute paths of the matching regular files, sorted.
    """
    pattern = re.compile(regex, re.IGNORECASE)
    matches = []
    for name in os.listdir(folder):
        if not pattern.search(name):
            continue
        full_path = os.path.abspath(os.path.join(folder, name))
        # os.listdir also returns sub-directories; callers open or delete
        # every result, which fails on a directory, so keep regular files only.
        if not os.path.isfile(full_path):
            continue
        if remove_empty and os.path.getsize(full_path) == 0:
            continue
        matches.append(full_path)
    matches.sort()
    return matches
def delete_files(folder, regex):
    r""" Deletes files in [folder] that match [regex]
        e.g. delete_files("C:/Dice Data/DelTest", ".*\.txt")

    folder : string
        folder to search
    regex : string
        file pattern to match (same matching rules as find_files)
    """
    for full_path in find_files(folder, regex):
        os.remove(full_path)
In [20]:
import ntpath
# Scratch call to ntpath.basename; its return value is not assigned or used.
ntpath.basename("a/b/c")
def get_file_name(path):
    """Return the last component of path, ignoring one trailing separator.

    path : string
        file path; split with ntpath, so both "/" and "\\" act as separators
    Returns the final path component, or the component before a trailing
    separator when the path ends with one.
    """
    directory, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # The path ended with a separator: fall back to the name before it.
    return ntpath.basename(directory)
# Batch step: read every raw document, optionally strip its HTML, split it
# into cleaned sentences and write one sentence per line to DOCS_FOLDER.
start = time.time()

# The output folder is listed and written to below, so make sure it exists.
if not os.path.isdir(DOCS_FOLDER):
    os.makedirs(DOCS_FOLDER)

if EMPTY_OUTPUT_FOLDER:
    if DOCS_FOLDER == RAW_DOCS_FOLDER:
        print("ERROR - Can't empty output folder if the same as the input folder")
    else:
        delete_files(DOCS_FOLDER,".*")

files = find_files(RAW_DOCS_FOLDER, FILE_MASK, True)
for i, fpath in enumerate(files):
    # Report progress before any skip below, so that a skipped document does
    # not suppress the marker for its position.
    if i % 1000 == 0:
        print(i)

    with open(fpath) as f:
        contents = f.read()
    # Skip documents that are too short to be useful.
    if len(contents) < FILE_SIZE_LIMIT_CHARS:
        continue

    if PARSE_HTML:
        contents = parse_html(contents)
        # Check again: removing the markup may have left too little text.
        if len(contents) < FILE_SIZE_LIMIT_CHARS:
            continue

    sents = split_into_sentences(contents)
    doc = "\n".join(sents)

    file_name = get_file_name(fpath)
    # Strip only the final extension. Cutting at the first "." would map
    # "a.1.txt" and "a.2.txt" to the same output file.
    base_name = os.path.splitext(file_name)[0]
    fout_name = os.path.join(DOCS_FOLDER, base_name + "_proc.txt")
    with open(fout_name, "w") as fout:
        fout.write(doc)

end = time.time()
print("Loading and processing documents took %s seconds" % str(end - start))