Feature: Simple Summary Statistics

Extract rudimentary statistical features, such as question lengths (in characters and words), as well as the differences and ratios of these lengths.

Imports

This utility package imports numpy, pandas, matplotlib and a helper kg module into the root namespace.


In [1]:
from pygoose import *

Config

Automatically discover the paths to various data folders and compose the project structure.


In [2]:
project = kg.Project.discover()

Identifier for storing these features on disk and referring to them later.


In [3]:
feature_list_id = 'simple_summaries'

Read Data

Original question datasets.


In [4]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')

Preprocessed and tokenized questions.


In [5]:
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')
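
Each pickle is assumed here to hold one entry per question pair, where each entry is a pair of token lists (an assumption based on how extract_tokenized_features consumes these entries below):

# Illustrative peek, assuming the pair-of-token-lists layout.
q1_tokens, q2_tokens = tokens_train[0]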

Build Features


In [6]:
def word_difference_ratio(q1_tokens, q2_tokens):
    # Share of unique words occurring in only one question; max(..., 1) guards against empty inputs.
    set_1, set_2 = set(q1_tokens), set(q2_tokens)
    return len(set_1 ^ set_2) / max(len(set_1) + len(set_2), 1)
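
For illustration (made-up tokens, not from the dataset): the symmetric difference of {'what', 'is', 'ai'} and {'what', 'is', 'ml'} is {'ai', 'ml'}, so the ratio is 2 / (3 + 3) ≈ 0.33.

word_difference_ratio(['what', 'is', 'ai'], ['what', 'is', 'ml'])  # -> 0.3333...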

In [7]:
def extract_original_question_features(row):
    # A row holds the raw question1 and question2 strings.
    q1 = row[0]
    q2 = row[1]
    
    shorter_char_length = min(len(q1), len(q2))
    longer_char_length = max(len(q1), len(q2))
    
    return [
        # Log scaling dampens the effect of very long questions.
        np.log(shorter_char_length + 1),
        np.log(longer_char_length + 1),
        np.log(abs(longer_char_length - shorter_char_length) + 1),
        # max(..., 1) guards against two empty questions (possible after fillna('')).
        shorter_char_length / max(longer_char_length, 1),
    ]
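
As a quick worked example (a made-up pair, not from the dataset): for an 11-character and a 25-character question, the features come out to log(12), log(26), log(15), and 11 / 25 = 0.44.

extract_original_question_features(['What is AI?', 'What is machine learning?'])
# -> [log(12), log(26), log(15), 0.44]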

In [8]:
def extract_tokenized_features(pair):
    # A pair holds the preprocessed token lists of both questions.
    q1 = pair[0]
    q2 = pair[1]
    
    shorter_token_length = min(len(q1), len(q2))
    longer_token_length = max(len(q1), len(q2))
    
    return [
        np.log(shorter_token_length + 1),
        np.log(longer_token_length + 1),
        np.log(abs(longer_token_length - shorter_token_length) + 1),
        # max(..., 1) guards against two empty token lists (possible after stopword removal).
        shorter_token_length / max(longer_token_length, 1),
        word_difference_ratio(q1, q2),
    ]
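
Likewise for tokens (again made up for illustration): token lists of lengths 2 and 3 that differ in one word yield log(3), log(4), log(2), 2 / 3, and a word difference ratio of 1 / (2 + 3) = 0.2.

extract_tokenized_features([['learn', 'python'], ['learn', 'python', 'fast']])
# -> [log(3), log(4), log(2), 0.6667, 0.2]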

Extract character-based features


In [9]:
features_original_train = kg.jobs.map_batch_parallel(
    df_train[['question1', 'question2']].values,
    item_mapper=extract_original_question_features,
    batch_size=1000,
)


Batches: 100%|██████████| 405/405 [00:01<00:00, 282.02it/s]

In [10]:
features_original_test = kg.jobs.map_batch_parallel(
    df_test[['question1', 'question2']].values,
    item_mapper=extract_original_question_features,
    batch_size=1000,
)


Batches: 100%|██████████| 2346/2346 [00:17<00:00, 133.19it/s]

Extract token-based features


In [11]:
features_tokenized_train = kg.jobs.map_batch_parallel(
    tokens_train,
    item_mapper=extract_tokenized_features,
    batch_size=1000,
)


Batches: 100%|██████████| 405/405 [00:03<00:00, 110.48it/s]

In [12]:
features_tokenized_test = kg.jobs.map_batch_parallel(
    tokens_test,
    item_mapper=extract_tokenized_features,
    batch_size=1000,
)


Batches: 100%|██████████| 2346/2346 [00:17<00:00, 135.39it/s]

Combine features


In [13]:
X_train = np.hstack([features_original_train, features_tokenized_train])

In [14]:
X_test = np.hstack([features_original_test, features_tokenized_test])

In [15]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)


X_train: (404290, 9)
X_test:  (2345796, 9)
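
Because the ratio features involve division, an optional finiteness check (not part of the original pipeline) can catch degenerate rows before the features are persisted:

# Optional sanity check: every feature value should be a finite number.
assert np.isfinite(X_train).all()
assert np.isfinite(X_test).all()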

Save features


In [16]:
feature_names = [
    # Character features.
    'shorter_char_len_log',
    'longer_char_len_log',
    'char_len_diff_log',
    'char_len_ratio',
    
    # Token features.
    'shorter_token_len_log',
    'longer_token_len_log',
    'token_len_diff_log',
    'token_len_ratio',
    'word_diff_ratio',
]
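
The nine names correspond one-to-one to the nine feature columns; an optional consistency check (not in the original notebook) makes this explicit:

# Optional: the names must match the feature matrix width.
assert len(feature_names) == X_train.shape[1] == X_test.shape[1]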

In [17]:
project.save_features(X_train, X_test, feature_names, feature_list_id)