LTR Model for Salient Concepts

In this approach, we use learning to rank (LTR) to select candidate phrases for generating summaries.

Data Construction

This step reads tokens produced by BookNLP and constructs the training set.
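The parsing below assumes each line of 0.part.tokens.LTR_FT carries three tab-separated fields: two identifier columns that together name a character, and a space-separated block of "relevance score, then feature values". An illustrative (not real) line:

1342<TAB>12<TAB>0.75 0.10 0.30 0.00 1.20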


In [6]:
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
#####################################################################
# DataSet
#####################################################################
# Group lines by their first two tab-separated fields (the character
# identifier); the third field holds the relevance score followed by
# the feature values.
character = {}
with open('../../0.part.tokens.LTR_FT', 'r') as train_file:
    for line in train_file:
        terms = line.rstrip('\n').split('\t')
        key = terms[0] + ' ' + terms[1]
        if key not in character:
            character[key] = []
        character[key].append(terms[2])
samples = list(character.values())  # indexable list of per-character groups
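A quick sanity check on the constructed set (purely illustrative, not part of the original run):

print(len(samples))                          # number of character groups
print(sum(len(group) for group in samples))  # total training descriptions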

In [14]:
def sample2Xy(samples, indexes):
    # Flatten the selected groups into a feature matrix X and target vector y.
    X = []
    y = []
    for index in indexes:
        for des in samples[index]:
            terms = des.split(' ')
            X.append([float(x) for x in terms[1:]])  # feature values
            y.append(float(terms[0]))                # relevance score
    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)
    return X, y
kf = KFold(n_splits=5, shuffle=True)
mse_linear_model = []
mae_linear_model = []
mse_tree_model = []
mae_tree_model = []
for train, test in kf.split(samples):
    X_train, y_train = sample2Xy(samples, train)
    X_test, y_test = sample2Xy(samples, test)
    # Linear Regression
    clf = linear_model.LinearRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_linear_model.append(mean_squared_error(y_test, y_pred))
    mae_linear_model.append(mean_absolute_error(y_test, y_pred))
    # Gradient Boosting Regressor
    clf = GradientBoostingRegressor(learning_rate=0.05, random_state=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_tree_model.append(mean_squared_error(y_test, y_pred))
    mae_tree_model.append(mean_absolute_error(y_test, y_pred))
print('Linear Regression MSE ' + str(np.mean(mse_linear_model)))
print('Linear Regression MAE ' + str(np.mean(mae_linear_model)))
print('Gradient Boosting Regressor MSE ' + str(np.mean(mse_tree_model)))
print('Gradient Boosting Regressor MAE ' + str(np.mean(mae_tree_model)))


Linear Regression MSE 0.0421227189918
Linear Regression MAE 0.164648788162
Gradient Boosting Regressor MSE 0.0323571346827
Gradient Boosting Regressor MAE 0.141880088677
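On both metrics the gradient-boosted model beats the linear baseline, so its predictions are the more promising basis for ranking candidate phrases.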

In [65]:
# Compare gold relevance scores with model predictions on a slice of the test fold
i = 495
print(y_test[i: i + 10])
print(y_pred[i: i + 10])


[ 0.44574937  0.32700461  1.          1.          0.48753181  1.          1.
  0.47809133  1.          0.88545078]
[ 0.8824008   0.48452059  0.90852744  0.84650243  0.56198991  0.94819097
  0.93412086  0.6356297   0.99729832  0.6399198 ]
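Since the goal is to select candidate phrases, the predicted scores would then be sorted into a per-character ranking. A minimal sketch over the slice above (top_k is an illustrative parameter, not from the notebook):

top_k = 5
group_scores = y_pred[i: i + 10]                  # predictions for one slice
ranking = np.argsort(group_scores)[::-1][:top_k]  # indices of top-k phrases
print(ranking)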

Next Steps

Beyond regression error, evaluate the rankings themselves with rank-aware metrics (a sketch follows below):

nDCG

MAP
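As a sketch of that evaluation (the helper names and the MAP binarisation threshold are assumptions; the gold scores here are graded, while MAP needs binary relevance):

def dcg_at_k(rels, k):
    # Discounted cumulative gain over the top-k relevance values.
    rels = np.asarray(rels, dtype=np.float64)[:k]
    return np.sum(rels / np.log2(np.arange(2, rels.size + 2)))

def ndcg_at_k(y_true, y_score, k=10):
    # DCG of the predicted ordering, normalised by the ideal (gold) ordering.
    order = np.argsort(y_score)[::-1]
    idcg = dcg_at_k(np.sort(y_true)[::-1], k)
    return dcg_at_k(y_true[order], k) / idcg if idcg > 0 else 0.0

def average_precision(y_true, y_score, threshold=0.5):
    # AP with graded gold scores binarised at `threshold` (an assumption).
    order = np.argsort(y_score)[::-1]
    relevant = (y_true[order] >= threshold).astype(np.float64)
    if relevant.sum() == 0:
        return 0.0
    precision_at_hits = np.cumsum(relevant) / np.arange(1, relevant.size + 1)
    return np.sum(precision_at_hits * relevant) / relevant.sum()

# These should be computed per character group and then averaged; applying
# them to a whole fold, as below, is only a smoke test.
print(ndcg_at_k(y_test, y_pred, k=10))
print(average_precision(y_test, y_pred))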