In [9]:
from __future__ import print_function  # must precede all other imports (Python 2)

import io
from os import listdir, makedirs
from os.path import join, isdir

import pandas as pd
import pymorphy2
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
In [2]:
# Input corpus: one sub-folder of plain-text files per category under data_path.
# Output: a mirrored folder tree of per-file token CSVs under markup_path.
data_path = './data'
markup_path = './markup'
In [3]:
# Shared pymorphy2 analyzer instance (construction is expensive, so build once).
morph = pymorphy2.MorphAnalyzer()

def norm(x):
    """Return the normal form (lemma) of token `x`, taking the most
    probable parse produced by pymorphy2."""
    return morph.parse(x)[0].normal_form
In [18]:
# Walk every sub-folder of data_path, tokenize each text file via process_file
# (defined in another cell — that cell must be executed first), and write one
# CSV per input file into the mirrored folder under markup_path.
total_sentences = 0
data_folders = listdir(data_path)
for folder in data_folders:
    if isdir(join(data_path, folder)):
        files = listdir(join(data_path, folder))
        try:
            # Create the mirror output folder; ignore "already exists".
            makedirs(join(markup_path, folder))
        except OSError:
            pass
        for fl in files:
            # Swap a trailing '.txt' for '.csv'. The original used
            # fl[:fl.find('.txt')], which returns -1 for names without '.txt'
            # and silently chopped the last character via fl[:-1].
            stem = fl[:-len('.txt')] if fl.endswith('.txt') else fl
            markup_file_path = join(markup_path, folder, stem + '.csv')
            data_file_path = join(data_path, folder, fl)
            total_sentences += process_file(data_file_path, markup_file_path)
        # Progress indicator: one folder name per completed folder.
        print(folder, end=' ')
In [17]:
def process_file(result_file_path, markup_file_path):
    """Tokenize a UTF-8 text file and write a token-level markup CSV.

    Each output row holds: running token id, sentence index, the token,
    its character offset in the original text (-1 when the tokenizer
    transformed the token so it cannot be found verbatim), and its
    normal form from `norm`.

    Parameters
    ----------
    result_file_path : str
        Path of the input plain-text file (read as UTF-8).
    markup_file_path : str
        Path of the CSV file to write.

    Returns
    -------
    int
        Number of sentences found in the file.
    """
    # `with` guarantees the handle is closed; the original leaked it by
    # calling .read() on an anonymous io.open(...) result.
    with io.open(result_file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    pos = 0            # search cursor into `text` for locating offsets
    sentence_num = 0
    Id = 0             # running token id across the whole file
    dataframe = {'id': [], 'sentence': [], 'token': [], 'pos': [], 'norm': []}
    sentences = sent_tokenize(text)
    for sentence in sentences:
        words = word_tokenize(sentence)
        # Advance the cursor to this sentence's start when it occurs
        # verbatim; keep the old cursor otherwise.
        new_pos = text.find(sentence, pos)
        if new_pos != -1:
            pos = new_pos
        for word in words:
            new_pos = text.find(word, pos)
            dataframe['id'].append(Id)
            dataframe['sentence'].append(sentence_num)
            dataframe['token'].append(word)
            dataframe['pos'].append(new_pos)
            dataframe['norm'].append(norm(word))
            if new_pos != -1:
                pos = new_pos
            Id += 1
        sentence_num += 1
    pd.DataFrame(data=dataframe).to_csv(markup_file_path, encoding='utf-8')
    return sentence_num
In [39]:
# Report the total number of sentences accumulated by the processing loop.
print(total_sentences)
In [15]:
# One-time download of the NLTK 'punkt' models required by
# sent_tokenize/word_tokenize; must be run before the processing cells.
import nltk
nltk.download('punkt')
Out[15]: