In [ ]:
import pandas as pd
import re

Text extraction from docx

Each .docx file is a zip archive whose visible text lives in word/document.xml, inside <w:t> elements, so zipfile plus lxml's streaming iterparse is enough to pull it out.


In [ ]:
import lxml.etree
import zipfile

def get_text(file_path):
    """Extract the visible text from a .docx file via its word/document.xml part."""
    all_text = []

    with zipfile.ZipFile(str(file_path)) as word_zip:
        with word_zip.open('word/document.xml') as xml_file:
            # Stream over every <w:t> (text run) element in the WordprocessingML namespace.
            for event, element in lxml.etree.iterparse(
                    xml_file,
                    tag='{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t'):
                all_text.append(element.text or '')  # empty <w:t/> elements have text=None
                element.clear()

    return '\n'.join(all_text)

In [ ]:
import pathlib

wiki = pathlib.Path(r'C:\Workspace\root\DuckDuckGo\wiki')

docs = []

# Extract the text of every .docx file in the wiki folder.
for file_path in wiki.iterdir():
    if file_path.suffix == '.docx':
        doc = {
            'name': file_path.name,
            'text': get_text(file_path)
        }

        docs.append(doc)

In [ ]:
len(docs)
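
As a quick sanity check, peek at the first extracted document before building the DataFrame (assuming at least one .docx file was found):

In [ ]:
# Name and the first few hundred characters of raw extracted text.
docs[0]['name'], docs[0]['text'][:300]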

In [ ]:
df = pd.DataFrame(docs)

In [ ]:
# Remove the `.docx` extension from the file names.
df['name'] = df.name.apply(lambda name: name.rpartition('.')[0])

In [ ]:
# Patterns for wiki markup fragments left behind in the exported text.
templates = re.compile(r'Template:[^\n]*?\n')    # "Template:..." lines
category = re.compile(r'Cat\n?egory:[^\n]*?\n')  # "Category:..." lines (sometimes split by a newline)
edit = '[\nedit\n]'                              # literal "[ edit ]" section links
refs = re.compile(r'\[\d+?\]')                   # numbered footnote markers such as [12]
other_refs = re.compile(r'<ref.*?/>')            # self-closing <ref .../> tags
thumb = re.compile(r'thumb\|.*?\|')              # image thumbnail markup

def clean_text(syn):
    syn = templates.sub('', syn)
    syn = category.sub('', syn)
    syn = syn.replace(edit, '')
    syn = refs.sub('', syn)
    syn = other_refs.sub('', syn)
    syn = thumb.sub('', syn)
    return syn.replace('\n', ' ')

df['text'] = df.text.apply(clean_text)
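
A minimal check on a synthetic snippet (made up here purely for illustration) shows what the cleaning strips out:

In [ ]:
# Run the cleaner on a small hand-written example containing each kind of leftover markup.
sample = 'The duck[1] is a bird.\n[\nedit\n]\nthumb|right|A duck.\nCategory:Birds\n'
clean_text(sample)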

In [ ]:
df.to_csv('../data/wiki/wiki.csv', encoding='utf8', index=False)
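
The CSV can be read back to confirm the export round-trips cleanly (same path as above):

In [ ]:
# Reload the saved CSV and inspect the first few rows.
pd.read_csv('../data/wiki/wiki.csv').head()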
