In [ ]:
from srs.predictor import loadTrainedPredictor
from srs.utilities import loadTrainingData
from srs import settings
from srs.Model_Word2Vec import AspectPatterns, distill_dynamic, static_aspect_to_vec
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score,classification_report
from sklearn.utils import column_or_1d
from srs.predictor import MaxEntropy_Predictor
In [ ]:
w2v_predictor = loadTrainedPredictor('Word2Vec')
In [ ]:
static_training_data_dir = settings["static_training_data"]
sentences = loadTrainingData(static_training_data_dir)
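In [ ]:
# Quick sanity check (optional): how many labeled sentences were loaded and what a
# label looks like. Assumes loadTrainingData returns an indexable sequence, as the
# rest of the notebook does.
print len(sentences)
print sentences[0].labeled_aspects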
In [ ]:
aspectPattern_names = ['adj_nn','nn']
aspectPatterns = AspectPatterns(aspectPattern_names)
In [ ]:
df = pd.DataFrame(columns=w2v_predictor.static_aspects_all['static_aspect_list_fortraining'])
target = pd.DataFrame(columns=['Prod_Feat'])
In [ ]:
for sen0 in sentences:
    distill_dynamic(sen0, aspectPatterns)
    # flatten the sentence's word2vec feature lists into one list of aspect phrases
    word2vec_features = []
    for item in sen0.word2vec_features_list:
        word2vec_features = word2vec_features + item
    # look up a word2vec vector for every word of every aspect phrase
    vec_list = []
    for dynamic_aspect in word2vec_features:
        dynamic_aspect_splitted = dynamic_aspect.split(' ')
        aspect_phrase_vec = []
        for word in dynamic_aspect_splitted:
            if word in w2v_predictor.model:
                aspect_word_vec = w2v_predictor.model[word]
                aspect_phrase_vec.append(aspect_word_vec)
        if aspect_phrase_vec:
            vec_list.append(aspect_phrase_vec)
    # loop-invariant, but recomputed here to keep the cell self-contained
    num_static_aspect = len(w2v_predictor.static_aspects_all['static_aspect_list_fortraining'])
    static_wordlist_vec = static_aspect_to_vec(w2v_predictor.static_aspects_all, w2v_predictor.model)
    if vec_list:
        # similarity between every dynamic aspect phrase and every static aspect:
        # best-matching static word per phrase word, summed over the phrase
        similarity_matrix = np.zeros([len(vec_list), num_static_aspect])
        for i in range(len(vec_list)):
            for j in range(num_static_aspect):
                similarity_item_matrix = np.zeros([len(vec_list[i]), len(static_wordlist_vec[j])])
                for kk in range(len(vec_list[i])):
                    for ll in range(len(static_wordlist_vec[j])):
                        similarity_item_matrix[kk][ll] = np.dot(vec_list[i][kk], static_wordlist_vec[j][ll])
                similarity_item_row = np.max(similarity_item_matrix, axis=1)
                similarity_item = np.sum(similarity_item_row)
                similarity_matrix[i][j] = similarity_item
        # keep, per static aspect, the best score over all dynamic aspects in the sentence
        useful_features_vec = np.max(similarity_matrix, axis=0)
        row = pd.DataFrame([useful_features_vec],
                           columns=w2v_predictor.static_aspects_all['static_aspect_list_fortraining'])
        df = df.append(row, ignore_index=True)
        row_target = pd.DataFrame([sen0.labeled_aspects], columns=['Prod_Feat'])
        target = target.append(row_target, ignore_index=True)
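In [ ]:
# Minimal toy illustration of the similarity used above: every word vector of a
# dynamic-aspect phrase is matched against every word vector of a static aspect,
# the best match per phrase word is kept, and the matches are summed. The vectors
# below are made up and assumed to be pre-normalized, so the dot product behaves
# like cosine similarity.
phrase_vecs = np.array([[1.0, 0.0], [0.0, 1.0]])   # two words of a dynamic aspect
static_vecs = np.array([[0.8, 0.6], [0.6, 0.8]])   # two words of a static aspect
pairwise = np.dot(phrase_vecs, static_vecs.T)      # word-by-word dot products
print np.max(pairwise, axis=1).sum()               # same quantity as similarity_matrix[i][j]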
In [ ]:
df.head(5)
In [ ]:
target.head(5)
In [ ]:
train_idx, test_idx = train_test_split(df.index, test_size=0.25, random_state=42)
X_train = df.iloc[train_idx]
X_test = df.iloc[test_idx]
y_train = target.iloc[train_idx].values.ravel()
y_test = target.iloc[test_idx].values.ravel()
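In [ ]:
# Optional sanity check: label distribution in the train and test splits.
print pd.Series(y_train).value_counts()
print pd.Series(y_test).value_counts()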
In [ ]:
for kernel in ('linear', 'rbf'):
    print "================kernel: {0}=========================".format(kernel)
    # gamma only affects the rbf kernel; it is ignored by the linear kernel
    w2v_svm = svm.SVC(kernel=kernel, gamma=10)
    w2v_svm.fit(X_train, y_train)
    y_predicted = w2v_svm.predict(X_test)
    target_names = target.Prod_Feat.unique()
    # pass labels explicitly so the report rows line up with target_names
    print(classification_report(y_test, y_predicted, labels=target_names, target_names=target_names))
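In [ ]:
# Optional: a small grid search over C and gamma for the rbf kernel. This is only a
# sketch; the parameter grid is an assumption, not tuned values. sklearn.grid_search
# matches the older sklearn.cross_validation import used above.
from sklearn.grid_search import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=3)
grid.fit(X_train, y_train)
print grid.best_params_, grid.best_score_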
In [ ]:
from sklearn.externals import joblib
joblib.dump(w2v_svm, 'w2v_svm.pkl')
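In [ ]:
# Load the pickled model back and confirm it reproduces the test-set predictions.
w2v_svm_loaded = joblib.load('w2v_svm.pkl')
print accuracy_score(y_test, w2v_svm_loaded.predict(X_test))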
In [ ]:
# collect the raw sentences for the test split; this assumes one df row was
# appended per sentence, so positional indices in df line up with sentences
sentences_w2v = []
for idx in test_idx:
    sentences_w2v.append(sentences[idx])
In [ ]:
y_predicted = w2v_predictor.predict_for_sentences(sentences_w2v)
In [ ]:
print "================kernel: {0}=========================".format('pure word2vec')
print(classification_report(y_test, y_predicted, target_names=np.insert(target_names, 0, 'useless')))
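In [ ]:
# Overall accuracy of the pure word2vec predictor on the same held-out sentences,
# for a rough comparison with the SVM numbers above.
print accuracy_score(y_test, y_predicted)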