In [ ]:
import pandas as pd
import re
In [ ]:
import lxml.etree
import zipfile
def get_text(file_path):
    """Extract the visible text from a .docx by streaming its main document part."""
    all_text = []
    with zipfile.ZipFile(str(file_path)) as word_zip:
        with word_zip.open('word/document.xml') as xml_file:
            # Stream every <w:t> (text run) element instead of parsing the whole tree.
            for event, element in lxml.etree.iterparse(
                    xml_file,
                    tag='{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t'):
                all_text.append(element.text or '')  # guard against empty runs
                element.clear()  # release parsed elements to keep memory flat
    return '\n'.join(all_text)
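As a quick smoke test of the extractor, the cell below pulls the text out of a single document; the file name here is only an illustrative placeholder.
In [ ]:
# Hypothetical example file; substitute any .docx from the wiki export folder.
sample_path = r'C:\Workspace\root\DuckDuckGo\wiki\Example.docx'
print(get_text(sample_path)[:300])  # each <w:t> run lands on its own line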
In [ ]:
import pathlib
wiki = pathlib.Path(r'C:\Workspace\root\DuckDuckGo\wiki')
docs = []
# Build one record per Word document in the export folder.
for file_path in wiki.iterdir():
    if file_path.suffix == '.docx':
        doc = {
            'name': file_path.name,
            'text': get_text(file_path)
        }
        docs.append(doc)
In [ ]:
len(docs)
In [ ]:
df = pd.DataFrame(docs)
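A quick peek at the assembled frame serves as a sanity check that there is one row per document.
In [ ]:
df.head()  # columns: raw file name and extracted text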
In [ ]:
# Remove the `.docx` extension, keeping everything before the last dot.
df['name'] = df.name.apply(lambda name: name.rpartition('.')[0])
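rpartition splits on the last dot, so dots inside a page title survive; a made-up name illustrates the behaviour.
In [ ]:
# Only the final '.docx' is dropped; interior dots are kept.
'St. Louis.docx'.rpartition('.')[0]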
In [ ]:
# Patterns for MediaWiki markup left behind in the extracted text.
templates = re.compile(r'Template:[^\n]*?\n')    # template transclusion lines
category = re.compile(r'Cat\n?egory:[^\n]*?\n')  # category links, sometimes split by a newline
edit = '[\nedit\n]'                              # "[edit]" section links (each run on its own line)
refs = re.compile(r'\[\d+?\]')                   # numeric footnote markers such as [12]
other_refs = re.compile(r'<ref.*?/>')            # self-closing <ref .../> tags
thumb = re.compile(r'thumb\|.*?\|')              # image thumbnail markup

def clean_text(syn):
    syn = templates.sub('', syn)
    syn = category.sub('', syn)
    syn = syn.replace(edit, '')
    syn = refs.sub('', syn)
    syn = other_refs.sub('', syn)
    syn = thumb.sub('', syn)
    return syn.replace('\n', ' ')
df['text'] = df.text.apply(clean_text)
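To see what each pattern strips, the cell below runs clean_text over a small synthetic string; the sample text is made up for illustration.
In [ ]:
# Synthetic sample exercising every cleanup rule.
sample = ('DuckDuckGo\n[\nedit\n]\nis a search engine.[1]\n'
          'Template:Infobox website\n'
          'thumb|300px|\n'
          'Category:Search engines\n')
clean_text(sample)  # markup lines disappear and newlines collapse to spaces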
In [ ]:
df.to_csv('../data/wiki/wiki.csv', encoding='utf8', index=False)
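Reading the file back is a cheap round-trip check; the relative path assumes the same working directory used for the write above.
In [ ]:
pd.read_csv('../data/wiki/wiki.csv', encoding='utf8').head()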
In [ ]: