Example of preprocessing interim data (interim = texts that have already been syntactically parsed)
In [1]:
import os
import sys
import numpy as np
import pickle
src = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src)
from features import feature_proc
In [2]:
# Input/output locations for this preprocessing step.
data_source = '../data/interim/'
feature_dist_path = '../data/processed/SentiRuEval2016.pickle'

# Predefined sibling patterns: each entry is an index pattern used later
# by feature_proc.get_patterns to expand the padded token-id matrix.
sibling_patterns = [
    np.array(pattern)
    for pattern in ([0, 1], [0, 1, 2], [0, 2, 3], [0, 1, 2, 3], [0, 2, 3, 4])
]
In [4]:
# Build padded token-id features and one-hot labels from the interim data,
# then serialize everything for the downstream modelling step.
data, labels, token2id = feature_proc.load_interim_data(data_source)

# Lazily map every sample's sentences to vocabulary ids, then pad the
# three channels to fixed lengths 50 / 250 / 250.
id_streams = (feature_proc.sentences_to_ids(sample, token2id) for sample in data)
transformed_data = feature_proc.set_padding(id_streams, [50, 250, 250])

# Feature set: the second padded channel plus sibling-pattern expansions of
# the third. NOTE(review): channel 0 (length-50) is not used here — confirm
# this is intentional.
pattern_features = feature_proc.get_patterns(transformed_data[2], sibling_patterns)
X = [transformed_data[1]] + pattern_features

# One-hot encode the third element of each label record into 3 classes.
y = feature_proc.nums_to_one_hot(np.array([record[2] for record in labels]), 3)

with open(feature_dist_path, 'wb') as out_file:
    pickle.dump((X, y, labels, len(token2id)), out_file)
print("Processed data serialized")
In [ ]: