Example of preprocessing interim data (interim = texts after syntactic parsing)


In [1]:
# Standard-library and numerical imports.
import os
import sys
import numpy as np
import pickle
    
# Make the project's `src` directory (a sibling of the notebook's cwd)
# importable so project-local modules resolve on a fresh kernel.
src = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src)

# Project-local feature-processing helpers (expected under ../src/features).
from features import feature_proc

In [2]:
# Location of the interim (syntactically parsed) texts.
data_source = '../data/interim/'
# Destination for the serialized, fully processed dataset.
feature_dist_path = '../data/processed/SentiRuEval2016.pickle'

# Predefined index patterns used to select sibling-node features.
sibling_patterns = [
    np.array([0, 1]),
    np.array([0, 1, 2]),
    np.array([0, 2, 3]),
    np.array([0, 1, 2, 3]),
    np.array([0, 2, 3, 4]),
]

In [4]:
# Load parsed sentences plus labels and the token->id vocabulary,
# then map every sentence to id sequences and pad to fixed lengths.
data, labels, token2id = feature_proc.load_interim_data(data_source)
id_streams = (feature_proc.sentences_to_ids(feature, token2id) for feature in data)
transformed_data = feature_proc.set_padding(id_streams, [50, 250, 250])

# Model inputs: the second padded feature, plus sibling-pattern views of the
# third one. NOTE(review): assumes set_padding returns an indexable sequence
# of at least three feature arrays — confirm against feature_proc.
X = [transformed_data[1]] + feature_proc.get_patterns(transformed_data[2], sibling_patterns)
# One-hot encode the third field of each label tuple into 3 classes.
y = feature_proc.nums_to_one_hot(np.array([label[2] for label in labels]), 3)

# Persist everything needed downstream, including vocabulary size.
with open(feature_dist_path, 'wb') as out_file:
    pickle.dump((X, y, labels, len(token2id)), out_file)
    print("Processed data serialized")


Processed data serialized

In [ ]: