Extract rudimentary statistical features, such as question lengths (in words and characters) and the differences and ratios of these lengths.
This utility package imports numpy, pandas, matplotlib, and a helper kg module into the root namespace.
In [1]:
from pygoose import *
Automatically discover the paths to various data folders and compose the project structure.
In [2]:
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [3]:
feature_list_id = 'simple_summaries'
Original question datasets.
In [4]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')
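As an optional sanity check (not part of the original pipeline), peek at the loaded data:
In [ ]:
df_train.head()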
Preprocessed and tokenized questions.
In [5]:
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')
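The token lists should line up row-for-row with the original datasets; an optional assertion catches mismatches early:
In [ ]:
assert len(tokens_train) == len(df_train)
assert len(tokens_test) == len(df_test)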
Ratio of words that appear in only one of the two questions.
In [6]:
def word_difference_ratio(q1_tokens, q2_tokens):
    # Share of unique words that occur in only one of the two questions.
    total_unique = len(set(q1_tokens)) + len(set(q2_tokens))
    return len(set(q1_tokens) ^ set(q2_tokens)) / total_unique if total_unique else 0.0
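For intuition, a toy example (made-up tokens): the pair shares two words, so the symmetric difference holds 2 of the 3 + 3 = 6 unique-word slots.
In [ ]:
word_difference_ratio(['what', 'is', 'ai'], ['what', 'is', 'ml'])  # -> 0.333...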
Character-level length features from the raw question text.
In [7]:
def extract_original_question_features(row):
    q1 = row[0]
    q2 = row[1]
    shorter_char_length = min(len(q1), len(q2))
    longer_char_length = max(len(q1), len(q2))
    return [
        # Log-scaled lengths compress the long tail of question sizes.
        np.log(shorter_char_length + 1),
        np.log(longer_char_length + 1),
        np.log(abs(longer_char_length - shorter_char_length) + 1),
        # Guard against pairs where both questions are empty strings.
        shorter_char_length / longer_char_length if longer_char_length else 0.0,
    ]
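An illustrative call on a made-up pair returns the four character-level features (two log lengths, the log length difference, and the length ratio):
In [ ]:
extract_original_question_features(['How do I learn Python?', 'What is the best way to learn Python?'])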
Analogous length features on the token lists, plus the word difference ratio.
In [8]:
def extract_tokenized_features(pair):
    q1 = pair[0]
    q2 = pair[1]
    shorter_token_length = min(len(q1), len(q2))
    longer_token_length = max(len(q1), len(q2))
    return [
        np.log(shorter_token_length + 1),
        np.log(longer_token_length + 1),
        np.log(abs(longer_token_length - shorter_token_length) + 1),
        # Guard against pairs where both token lists are empty (e.g., stopword-only questions).
        shorter_token_length / longer_token_length if longer_token_length else 0.0,
        word_difference_ratio(q1, q2),
    ]
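The tokenized variant returns five floats; for this made-up pair they are log(3), log(5), log(3), a 0.5 length ratio, and a 0.333... word difference ratio.
In [ ]:
extract_tokenized_features([['learn', 'python'], ['best', 'way', 'learn', 'python']])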
Extract the features from the raw and tokenized questions in parallel batches.
In [9]:
features_original_train = kg.jobs.map_batch_parallel(
    # DataFrame.as_matrix() was removed in newer pandas; select the columns and take .values instead.
    df_train[['question1', 'question2']].values,
    item_mapper=extract_original_question_features,
    batch_size=1000,
)
In [10]:
features_original_test = kg.jobs.map_batch_parallel(
    df_test[['question1', 'question2']].values,
    item_mapper=extract_original_question_features,
    batch_size=1000,
)
In [11]:
features_tokenized_train = kg.jobs.map_batch_parallel(
    tokens_train,
    item_mapper=extract_tokenized_features,
    batch_size=1000,
)
In [12]:
features_tokenized_test = kg.jobs.map_batch_parallel(
    tokens_test,
    item_mapper=extract_tokenized_features,
    batch_size=1000,
)
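If pygoose's parallel mapper is unavailable, the same lists can be built sequentially; a minimal single-process equivalent (slower, but identical output):
In [ ]:
features_tokenized_train = [extract_tokenized_features(pair) for pair in tokens_train]
features_tokenized_test = [extract_tokenized_features(pair) for pair in tokens_test]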
Combine the character-level and token-level features into the final matrices.
In [13]:
X_train = np.hstack([features_original_train, features_tokenized_train])
In [14]:
X_test = np.hstack([features_original_test, features_tokenized_test])
In [15]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)
Human-readable names for each feature column, in stacking order.
In [16]:
feature_names = [
    # Character features.
    'shorter_char_len_log',
    'longer_char_len_log',
    'char_len_diff_log',
    'char_len_ratio',
    # Token features.
    'shorter_token_len_log',
    'longer_token_len_log',
    'token_len_diff_log',
    'token_len_ratio',
    'word_diff_ratio',
]
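An optional consistency check: the name list must match the width of the stacked matrices.
In [ ]:
assert X_train.shape[1] == len(feature_names) == X_test.shape[1]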
Save the combined features to disk under the identifier defined earlier.
In [17]:
project.save_features(X_train, X_test, feature_names, feature_list_id)