Feature: Simple Summary Statistics

Extract rudimentary statistical features, such as question lengths (in characters and words), as well as the differences and ratios of these lengths.

Imports

This utility package imports numpy, pandas, matplotlib and a helper kg module into the root namespace.


In [1]:
from pygoose import *

Config

Automatically discover the paths to various data folders and compose the project structure.


In [2]:
project = kg.Project.discover()

Identifier for storing these features on disk and referring to them later.


In [3]:
feature_list_id = 'simple_summaries'

Read Data

Original question datasets.


In [4]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')

Preprocessed and tokenized questions.


In [5]:
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')
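
Each pickle is assumed here to hold one entry per question pair, where each entry is a pair of token lists (an assumption based on how extract_tokenized_features consumes these entries below):

# Illustrative peek, assuming the pair-of-token-lists layout.
q1_tokens, q2_tokens = tokens_train[0]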

Build Features


In [6]:
def word_difference_ratio(q1_tokens, q2_tokens):
    # Share of unique words occurring in only one question; max(..., 1) guards against empty inputs.
    set_1, set_2 = set(q1_tokens), set(q2_tokens)
    return len(set_1 ^ set_2) / max(len(set_1) + len(set_2), 1)
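
For illustration (made-up tokens, not from the dataset): the symmetric difference of {'what', 'is', 'ai'} and {'what', 'is', 'ml'} is {'ai', 'ml'}, so the ratio is 2 / (3 + 3) ≈ 0.33.

word_difference_ratio(['what', 'is', 'ai'], ['what', 'is', 'ml'])  # -> 0.3333...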

In [7]:
def extract_original_question_features(row):
    # A row holds the raw question1 and question2 strings.
    q1 = row[0]
    q2 = row[1]
    
    shorter_char_length = min(len(q1), len(q2))
    longer_char_length = max(len(q1), len(q2))
    
    return [
        # Log scaling dampens the effect of very long questions.
        np.log(shorter_char_length + 1),
        np.log(longer_char_length + 1),
        np.log(abs(longer_char_length - shorter_char_length) + 1),
        # max(..., 1) guards against two empty questions (possible after fillna('')).
        shorter_char_length / max(longer_char_length, 1),
    ]
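
As a quick worked example (a made-up pair, not from the dataset): for an 11-character and a 25-character question, the features come out to log(12), log(26), log(15), and 11 / 25 = 0.44.

extract_original_question_features(['What is AI?', 'What is machine learning?'])
# -> [log(12), log(26), log(15), 0.44]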

In [8]:
def extract_tokenized_features(pair):
    # A pair holds the preprocessed token lists of both questions.
    q1 = pair[0]
    q2 = pair[1]
    
    shorter_token_length = min(len(q1), len(q2))
    longer_token_length = max(len(q1), len(q2))
    
    return [
        np.log(shorter_token_length + 1),
        np.log(longer_token_length + 1),
        np.log(abs(longer_token_length - shorter_token_length) + 1),
        # max(..., 1) guards against two empty token lists (possible after stopword removal).
        shorter_token_length / max(longer_token_length, 1),
        word_difference_ratio(q1, q2),
    ]
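
Likewise for tokens (again made up for illustration): token lists of lengths 2 and 3 that differ in one word yield log(3), log(4), log(2), 2 / 3, and a word difference ratio of 1 / (2 + 3) = 0.2.

extract_tokenized_features([['learn', 'python'], ['learn', 'python', 'fast']])
# -> [log(3), log(4), log(2), 0.6667, 0.2]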

Extract character-based features


In [9]:
features_original_train = kg.jobs.map_batch_parallel(
    df_train[['question1', 'question2']].values,
    item_mapper=extract_original_question_features,
    batch_size=1000,
)


Batches: 100%|██████████| 405/405 [00:01<00:00, 282.02it/s]

In [10]:
features_original_test = kg.jobs.map_batch_parallel(
    df_test[['question1', 'question2']].values,
    item_mapper=extract_original_question_features,
    batch_size=1000,
)


Batches: 100%|██████████| 2346/2346 [00:17<00:00, 133.19it/s]

Extract token-based features


In [11]:
features_tokenized_train = kg.jobs.map_batch_parallel(
    tokens_train,
    item_mapper=extract_tokenized_features,
    batch_size=1000,
)


Batches: 100%|██████████| 405/405 [00:03<00:00, 110.48it/s]

In [12]:
features_tokenized_test = kg.jobs.map_batch_parallel(
    tokens_test,
    item_mapper=extract_tokenized_features,
    batch_size=1000,
)


Batches: 100%|██████████| 2346/2346 [00:17<00:00, 135.39it/s]

Combine features


In [13]:
X_train = np.hstack([features_original_train, features_tokenized_train])

In [14]:
X_test = np.hstack([features_original_test, features_tokenized_test])

In [15]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)


X_train: (404290, 9)
X_test:  (2345796, 9)
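
Because the ratio features involve division, an optional finiteness check (not part of the original pipeline) can catch degenerate rows before the features are persisted:

# Optional sanity check: every feature value should be a finite number.
assert np.isfinite(X_train).all()
assert np.isfinite(X_test).all()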

Save features


In [16]:
feature_names = [
    # Character features.
    'shorter_char_len_log',
    'longer_char_len_log',
    'char_len_diff_log',
    'char_len_ratio',
    
    # Token features.
    'shorter_token_len_log',
    'longer_token_len_log',
    'token_len_diff_log',
    'token_len_ratio',
    'word_diff_ratio',
]
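
The nine names correspond one-to-one to the nine feature columns; an optional consistency check (not in the original notebook) makes this explicit:

# Optional: the names must match the feature matrix width.
assert len(feature_names) == X_train.shape[1] == X_test.shape[1]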

In [17]:
project.save_features(X_train, X_test, feature_names, feature_list_id)