In [ ]:
    
import pandas as pd
import re
    
In [ ]:
    
import lxml.etree
import zipfile
def get_text(file_path):
    """Pull the readable text out of a .docx file.

    A .docx is a zip archive whose body lives in word/document.xml; every
    text run is wrapped in a w:t element, so iterating over that tag
    collects the document text in order.
    """
    all_text = []
    with zipfile.ZipFile(str(file_path)) as word_zip:
        with word_zip.open('word/document.xml') as xml_file:
            tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t'
            for event, element in lxml.etree.iterparse(xml_file, tag=tag):
                # Empty w:t elements have text of None; keep the join safe.
                all_text.append(element.text or '')
                element.clear()
    return '\n'.join(all_text)
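    
In [ ]:
    
# Quick spot check of get_text on a single export. The file name below is a
# placeholder -- substitute any .docx from the wiki folder used in the next cell.
sample_path = r'C:\Workspace\root\DuckDuckGo\wiki\Example.docx'
print(get_text(sample_path)[:500])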
    
In [ ]:
    
import pathlib
wiki = pathlib.Path(r'C:\Workspace\root\DuckDuckGo\wiki')
docs = []
# Collect the text of every .docx export in the wiki folder
# (iterdir does not recurse into subdirectories).
for file_path in wiki.iterdir():
    if file_path.suffix == '.docx':
        doc = {
            'name': file_path.name,
            'text': get_text(file_path)
        }
        docs.append(doc)
    
In [ ]:
    
len(docs)
    
In [ ]:
    
df = pd.DataFrame(docs)
    
In [ ]:
    
# Drop the `.docx` extension so the name column holds just the page title.
df['name'] = df.name.apply(lambda name: name.rpartition('.')[0])
    
In [ ]:
    
# Strip the wiki markup fragments that survive the docx export:
# template transclusions, category links, "[edit]" section links,
# numbered references, raw <ref .../> tags, and image thumbnail captions.
templates = re.compile(r'Template:[^\n]*?\n')
category = re.compile(r'Cat[\n]?egory:[^\n]*?\n')  # "Category:" is sometimes split across a line break
edit = '[\nedit\n]'
refs = re.compile(r'\[\d+?\]')
other_refs = re.compile(r'<ref.*?/>')
thumb = re.compile(r'thumb\|.*?\|')

def clean_text(text):
    text = templates.sub('', text)
    text = category.sub('', text)
    text = text.replace(edit, '')
    text = refs.sub('', text)
    text = other_refs.sub('', text)
    text = thumb.sub('', text)
    # Collapse the one-run-per-line extraction into a single flat string.
    return text.replace('\n', ' ')

df['text'] = df.text.apply(clean_text)
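    
In [ ]:
    
# Sanity check (optional): none of the markup fragments targeted above should
# survive in the cleaned text. The pattern here simply mirrors the regexes
# used in clean_text; it is not a full MediaWiki grammar.
df.text.str.contains(r'Template:|Category:|thumb\|').sum()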
    
In [ ]:
    
df.to_csv('../data/wiki/wiki.csv', encoding='utf8', index=False)
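    
In [ ]:
    
# Optional round trip: reload the CSV to confirm the utf8 encoding and the
# flattened text survive, and that no rows were lost on the way out
# (assumes the ../data/wiki/ directory already exists).
reloaded = pd.read_csv('../data/wiki/wiki.csv', encoding='utf8')
len(reloaded) == len(df)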
    
In [ ]: