In [9]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
import pandas as pd
from os.path import join, isdir
from os import listdir, makedirs
from __future__ import print_function
import io

In [2]:
data_path = './data'
markup_path = './markup'

In [3]:
morph = pymorphy2.MorphAnalyzer()
def norm(x):    
    p = morph.parse(x)[0]
    return p.normal_form

In [18]:
total_sentences = 0
data_folders = listdir(data_path)
for folder in data_folders:
    if isdir(join(data_path, folder)):        
        files = listdir(join(data_path, folder))
        try:
            makedirs(join(markup_path, folder))
        except OSError:
            pass
        for fl in files:
            markup_file_path = join(markup_path, folder, fl[:fl.find('.txt')]+'.csv')
            data_file_path = join(data_path, folder, fl)
            total_sentences += process_file(data_file_path, markup_file_path)
    print(folder, end=' ')


I H M Р 5 F A 4 P metainfo.txt В А Т Щ Ч N И О Е E S G U 7 6 Н 9 Ш Z Й 8 М X З 1 Ё O У .NET Framework.txt Х С C W V Б J B П 3 Д Г Э Л 2 К K L T Q D Ц Ф Я Ж R ( Ю 

In [17]:
def process_file(result_file_path, markup_file_path):
    text = io.open(result_file_path, 'r', encoding='utf-8').read()
    pos = 0
    sentence_num = 0
    Id = 0
    
    dataframe = {'id':[], 'sentence':[], 'token':[],'pos':[], 'norm':[]}
    sentences = sent_tokenize(text)
    for sentence in sentences:
        words = word_tokenize(sentence)
        new_pos = text.find(sentence, pos)
        if new_pos != -1:
            pos = new_pos
        for word in words:
            new_pos = text.find(word, pos)
            
            dataframe['id'].append(Id)
            dataframe['sentence'].append(sentence_num)
            dataframe['token'].append(word)
            dataframe['pos'].append(new_pos)
            dataframe['norm'].append(norm(word))
            
            if new_pos != -1:
                pos = new_pos
            Id+=1
        sentence_num+=1

    pd.DataFrame(data=dataframe).to_csv(markup_file_path, encoding='utf-8')
    return sentence_num

In [39]:
print(total_sentences)


419086

In [15]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/sdernal/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Out[15]:
True