In [1]:
import os
import re
# join and listdir are used unqualified in the cells below.
from os import listdir
from os.path import join
In [3]:
# Path to one part of the dump. It does not matter which part we pick,
# since the articles are shuffled in random order.
data_path = r'wikiextraction\text\AA'
# Root of the output storage.
result_path = r'corpus'
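The path above assumes WikiExtractor's default output layout: the extracted text is split across directories named AA, AB, and so on, each holding plain-text parts (an assumption about how the extractor was run; adjust the path to yours).

In [ ]:
# Peek at the first few files of the chosen part (purely a sanity check).
print(listdir(data_path)[:5])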
In [58]:
# Matches the header line WikiExtractor writes before each article:
# <doc id="..." url="..." title="...">
reg_text = '<doc .* url="(?P<url>.*)".*title="(?P<title>.*)">.*'
reg = re.compile(reg_text)
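A quick sanity check of the header regex on a hand-made sample line (the id, url, and title values below are invented for illustration):

In [ ]:
sample = '<doc id="7" url="https://ru.wikipedia.org/wiki?curid=7" title="Литва">'
print(re.match(reg, sample).groupdict())
# {'url': 'https://ru.wikipedia.org/wiki?curid=7', 'title': 'Литва'}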
In [80]:
# Clean the data: split each dump file into per-article text files,
# grouped into directories by the first character of the title.
def process_data(content):
    # Articles are separated by closing </doc> tags.
    content_parts = re.split('</doc>', content)
    for content_part in content_parts:
        content_part = content_part.strip()
        result = re.match(reg, content_part)
        if result is None:
            continue
        result = result.groupdict()
        title = result['title']
        metainfo[title] = result['url']
        try:
            os.makedirs(join(result_path, title[0]))
        except OSError:
            # The directory already exists.
            pass
        try:
            result_file = open(join(result_path, title[0], title + '.txt'), 'w', encoding='utf-8')
        except OSError:
            # The title contains characters that are invalid in a file name.
            continue
        # Everything after the closing '>' of the header line is the article body.
        header_end = content_part.find('>')
        result_file.write(content_part[header_end + 1:].strip())
        result_file.close()
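Note that the second try/except silently skips any article whose title contains characters that are illegal in file names (such as '/' or '"'). A minimal alternative sketch, where sanitize_title is a hypothetical helper not part of the original pipeline, would keep those articles by replacing the offending characters instead:

In [ ]:
def sanitize_title(title):
    # Hypothetical helper: replace characters invalid in file names.
    return re.sub(r'[<>:"/\\|?*]', '_', title)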
In [81]:
# Walk over every file of the dump part, extracting articles and
# collecting the title -> url metadata.
metainfo = dict()
data_files = listdir(data_path)
for file_name in data_files:
    with open(join(data_path, file_name), 'r', encoding='utf-8') as data_file:
        content = data_file.read()
    process_data(content)
In [87]:
# Save the title -> url mapping. Writing in UTF-8 ensures non-ASCII titles
# are saved instead of raising UnicodeEncodeError under a platform-default
# encoding.
with open(join(result_path, 'metainfo.txt'), 'w', encoding='utf-8') as metainfo_file:
    for key in sorted(metainfo):
        metainfo_file.write(key + ' : ' + metainfo[key] + '\n')
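A minimal sketch of reading the mapping back, assuming the 'title : url' line format written above (a title that itself contains ' : ' would need a more robust separator):

In [ ]:
loaded = dict()
with open(join(result_path, 'metainfo.txt'), 'r', encoding='utf-8') as f:
    for line in f:
        title, _, url = line.rstrip('\n').partition(' : ')
        loaded[title] = url
print(len(loaded))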
In [88]:
# Number of article titles processed.
print(len(metainfo))