Feature Engineering

We extract features that capture the similarity between two given questions. The feature types are as follows:

  • Various distance similarities
  • Word-level features
  • Character-level features
  • TF-IDF features
  • Leaky features that were disclosed during the competition
  • Miscellaneous features

Note: This notebook is Python 3 compatible.

All the related information about these features can be found in this great post.

Importing Packages


In [1]:
# packages
import argparse
import functools
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split



Features

We build more features on top of the features generated by Abhishek on Kaggle, which can be downloaded from here.


In [2]:
# Functions
def word_match_share(row, stops=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

def jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]  # avoid division by zero when both questions are empty
    return len(wic) / len(uw)

def common_words(row):
    return len(set(row['question1']).intersection(set(row['question2'])))

def total_unique_words(row):
    return len(set(row['question1']).union(row['question2']))

def total_unq_words_stop(row, stops):
    return len([x for x in set(row['question1']).union(row['question2']) if x not in stops])

def wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))

def wc_ratio(row):
    l1 = len(row['question1']) * 1.0
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    # guard against division by zero when question1 is empty
    return l2 / l1 if l1 else 0.0

def wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))

def wc_ratio_unique(row):
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    # guard against division by zero when question1 has no unique words
    return l2 / l1 if l1 else 0.0

def wc_diff_unique_stop(row, stops=None):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len([x for x in set(row['question2']) if x not in stops]))

def wc_ratio_unique_stop(row, stops=None):
    l1 = len([x for x in set(row['question1']) if x not in stops]) * 1.0
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    # guard against division by zero when question1 is all stopwords
    return l2 / l1 if l1 else 0.0

def same_start_word(row):
    if not row['question1'] or not row['question2']:
        return np.nan
    return int(row['question1'][0] == row['question2'][0])

def char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))

def char_ratio(row):
    l1 = len(''.join(row['question1']))
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    # guard against division by zero when question1 is empty
    return l2 / l1 if l1 else 0.0

def char_diff_unique_stop(row, stops=None):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(''.join([x for x in set(row['question2']) if x not in stops])))


def get_weight(count, eps=10000, min_count=2):
    # rare words (below min_count) get zero weight; eps damps very frequent words
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    # yields NaN (with a RuntimeWarning) when every word falls below min_count,
    # i.e. the total weight is zero; this is the warning seen in the outputs below
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R


def build_features(data, stops, weights):
    X = pd.DataFrame()
    f = functools.partial(word_match_share, stops=stops)
    # note: with axis=1, raw=True passes plain ndarrays in newer pandas versions,
    # which breaks the row['...'] lookups above; drop raw=True there if needed
    X['word_match'] = data.apply(f, axis=1, raw=True) #1

    f = functools.partial(tfidf_word_match_share, weights=weights)
    X['tfidf_wm'] = data.apply(f, axis=1, raw=True) #2

    f = functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)
    X['tfidf_wm_stops'] = data.apply(f, axis=1, raw=True) #3

    X['jaccard'] = data.apply(jaccard, axis=1, raw=True) #4
    X['wc_diff'] = data.apply(wc_diff, axis=1, raw=True) #5
    X['wc_ratio'] = data.apply(wc_ratio, axis=1, raw=True) #6
    X['wc_diff_unique'] = data.apply(wc_diff_unique, axis=1, raw=True) #7
    X['wc_ratio_unique'] = data.apply(wc_ratio_unique, axis=1, raw=True) #8

    f = functools.partial(wc_diff_unique_stop, stops=stops)    
    X['wc_diff_unq_stop'] = data.apply(f, axis=1, raw=True) #9
    f = functools.partial(wc_ratio_unique_stop, stops=stops)    
    X['wc_ratio_unique_stop'] = data.apply(f, axis=1, raw=True) #10

    X['same_start'] = data.apply(same_start_word, axis=1, raw=True) #11
    X['char_diff'] = data.apply(char_diff, axis=1, raw=True) #12

    f = functools.partial(char_diff_unique_stop, stops=stops) 
    X['char_diff_unq_stop'] = data.apply(f, axis=1, raw=True) #13
    
    X['common_words'] = data.apply(common_words, axis=1, raw=True)  #14
    X['total_unique_words'] = data.apply(total_unique_words, axis=1, raw=True)  #15

    f = functools.partial(total_unq_words_stop, stops=stops)
    X['total_unq_words_stop'] = data.apply(f, axis=1, raw=True)  #16
    
    X['char_ratio'] = data.apply(char_ratio, axis=1, raw=True) #17    

    return X

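As a quick sanity check, the helpers above can be exercised on a hand-made token pair. The toy sentences, stopword set and weights below are made up purely for illustration; the real pipeline tokenises the Quora questions and uses NLTK's stopword list.

In [ ]:
# toy sanity check of the helpers above (hypothetical data, not part of the pipeline)
toy = {'question1': 'what is machine learning'.split(),
       'question2': 'what is deep learning'.split()}
demo_stops = {'what', 'is'}

print(word_match_share(toy, stops=demo_stops))     # 2 shared / 4 non-stopwords = 0.5
print(jaccard(toy))                                # 3 shared / 5 unique words = 0.6
print(common_words(toy), total_unique_words(toy))  # 3 5

# words below min_count get zero weight, so only the repeated words count here
demo_weights = {w: get_weight(c) for w, c in Counter(toy['question1'] + toy['question2']).items()}
print(tfidf_word_match_share(toy, weights=demo_weights))  # 1.0: every weighted word is shared
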
In [5]:
# processing
print ('loading features')
df_train = pd.read_csv('train_features.csv', encoding="ISO-8859-1")
X_train_ab = df_train.iloc[:, 2:-1]
X_train_ab = X_train_ab.drop('euclidean_distance', axis=1)
X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)

print ('loading actual data')
df_train = pd.read_csv('train.csv')
df_train = df_train.fillna(' ')

df_test = pd.read_csv('test.csv')
ques = pd.concat([df_train[['question1', 'question2']],
                  df_test[['question1', 'question2']]], axis=0).reset_index(drop=True)
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
    q_dict[ques.question1[i]].add(ques.question2[i])
    q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    # how many distinct questions question1 was ever paired with
    return len(q_dict[row['question1']])

def q2_freq(row):
    # how many distinct questions question2 was ever paired with
    return len(q_dict[row['question2']])

def q1_q2_intersect(row):
    # how many paired questions question1 and question2 have in common
    return len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']])))

print ('making leaky features')
df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)

test_leaky = df_test.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]
del df_test

train_leaky = df_train.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]

# explore

stops = set(stopwords.words("english"))

df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())

train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())

words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

print('Building Features')
X_train = build_features(df_train, stops, weights)
print ('concatenating')
X_train = pd.concat((X_train, X_train_ab, train_leaky), axis=1)
y_train = df_train['is_duplicate'].values


loading features
loading actual data
making leaky features
Building Features
/home/alibaba/anaconda2/envs/py/lib/python3.5/site-packages/ipykernel_launcher.py:118: RuntimeWarning: invalid value encountered in double_scalars
concatenating

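The leak features deserve a quick illustration: q1_freq and q2_freq count how many distinct questions each side was ever paired with, and q1_q2_intersect counts the paired questions the two sides have in common. A toy example with made-up pairs, mirroring the q_dict construction above:

In [ ]:
# toy illustration of the leak features (hypothetical pairs, not part of the pipeline)
toy_pairs = [('a', 'b'), ('a', 'c'), ('b', 'c')]
toy_dict = defaultdict(set)
for q1, q2 in toy_pairs:
    toy_dict[q1].add(q2)
    toy_dict[q2].add(q1)

# 'a' was paired with {b, c} and 'b' with {a, c}; their shared neighbour is 'c'
print(len(toy_dict['a']), len(toy_dict['b']))  # q1_freq, q2_freq -> 2 2
print(len(toy_dict['a'] & toy_dict['b']))      # q1_q2_intersect -> 1
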
In [ ]:
# test processing
print('Building Test Features')
df_test = pd.read_csv('test_features.csv', encoding="ISO-8859-1")
x_test_ab = df_test.iloc[:, 2:-1]
x_test_ab = x_test_ab.drop('euclidean_distance', axis=1)
x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)
print('reading test data')
df_test = pd.read_csv('test.csv')
df_test = df_test.fillna(' ')

df_test['question1'] = df_test['question1'].map(lambda x: str(x).lower().split())
df_test['question2'] = df_test['question2'].map(lambda x: str(x).lower().split())
print('going to function')
x_test = build_features(df_test, stops, weights)
print('concatenating')
x_test = pd.concat((x_test, x_test_ab, test_leaky), axis=1)


Building Test Features
reading test data
going to function
/home/alibaba/anaconda2/envs/py/lib/python3.5/site-packages/ipykernel_launcher.py:118: RuntimeWarning: invalid value encountered in double_scalars
/home/alibaba/anaconda2/envs/py/lib/python3.5/site-packages/ipykernel_launcher.py:118: RuntimeWarning: invalid value encountered in long_scalars
concatenating

In [ ]:
X_train.to_csv('X_train.csv', index=False)
x_test.to_csv('X_test.csv', index=False)

In [2]:
trn_features = pd.read_csv('X_train.csv')

OWL's Features


In [2]:
import datetime
import difflib
import multiprocessing
import operator
from collections import Counter

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from scipy.optimize import minimize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import log_loss

stops = set(stopwords.words("english"))

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
cvect = CountVectorizer(stop_words='english', ngram_range=(1, 1))

tfidf_txt = pd.Series(train['question1'].tolist() + train['question2'].tolist() + test['question1'].tolist() + test['question2'].tolist()).astype(str)
# fit on the combined corpus; only the fitted vocabulary/idf is needed later,
# so the transformed matrices are not kept
tfidf.fit(tfidf_txt)
cvect.fit(tfidf_txt)

def diff_ratios(st1, st2):
    seq = difflib.SequenceMatcher()
    seq.set_seqs(str(st1).lower(), str(st2).lower())
    return seq.ratio()

def add_word_count(x, df, word):
    # binary flags: does each question contain `word`, and do both?
    x['q1_' + word] = df['question1'].apply(lambda q: (word in str(q).lower()) * 1)
    x['q2_' + word] = df['question2'].apply(lambda q: (word in str(q).lower()) * 1)
    x[word + '_both'] = x['q1_' + word] * x['q2_' + word]
    
def get_features(df_features):
    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])

    print('tfidf...')
    # np.mean over an empty .data array (a question made entirely of stopwords)
    # raises the "Mean of empty slice" RuntimeWarning seen below and yields NaN,
    # which the final fillna(0.0) cleans up
    df_features['z_tfidf_sum1'] = df_features.question1.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_sum2'] = df_features.question2.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean1'] = df_features.question1.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean2'] = df_features.question2.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len1'] = df_features.question1.map(lambda x: len(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len2'] = df_features.question2.map(lambda x: len(tfidf.transform([str(x)]).data))
    
    print ('Caps features')
    df_features['caps_count_q1'] = df_features['question1'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
    df_features['caps_count_q2'] = df_features['question2'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
    df_features['diff_caps'] = df_features['caps_count_q1'] - df_features['caps_count_q2']
    
    df_features['exactly_same'] = (df_features['question1'] == df_features['question2']).astype(int)
    df_features['duplicated'] = df_features.duplicated(['question1','question2']).astype(int)
    
    print('some more')
    add_word_count(df_features, df_features,'how')
    add_word_count(df_features, df_features,'what')
    add_word_count(df_features, df_features,'which')
    add_word_count(df_features, df_features,'who')
    add_word_count(df_features, df_features,'where')
    add_word_count(df_features, df_features,'when')
    add_word_count(df_features, df_features,'why')
    return df_features.fillna(0.0)

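Before running get_features on the full frames, the smaller helpers can be tried on a made-up pair. The toy frame below is purely illustrative; the tfidf call relies on the vectoriser fitted above.

In [ ]:
# toy check of diff_ratios, add_word_count and the z_tfidf_* inputs (hypothetical data)
toy = pd.DataFrame({'question1': ['How do I learn Python?'],
                    'question2': ['How can I learn Python?']})
print(diff_ratios(toy.question1[0], toy.question2[0]))  # difflib ratio in [0, 1], close to 1 here
add_word_count(toy, toy, 'how')
print(toy[['q1_how', 'q2_how', 'how_both']].values)     # [[1 1 1]]

vals = tfidf.transform([toy.question1[0]]).data  # per-word tf-idf values for one question
print(vals.sum(), vals.mean(), len(vals))
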
In [5]:
trn_features = get_features(train)


nouns...
tfidf...
/home/alibaba/anaconda2/envs/py/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2889: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/home/alibaba/anaconda2/envs/py/lib/python3.5/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
Caps features
some more

In [6]:
tst_features = get_features(test)


nouns...
tfidf...
/home/alibaba/anaconda2/envs/py/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2889: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/home/alibaba/anaconda2/envs/py/lib/python3.5/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
Caps features
some more

In [22]:
def get_noun(df_features):
    print('shared nouns')
    df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)
    return df_features.fillna(0.0)

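z_noun_match simply counts the question1 nouns that also appear among question2's nouns. A toy check on made-up noun lists:

In [ ]:
# toy check of the shared-noun count (hypothetical data)
toy = pd.DataFrame({'question1_nouns': [['python', 'course']],
                    'question2_nouns': [['python', 'tutorial']]})
print(get_noun(toy)['z_noun_match'][0])  # 1: only 'python' is shared
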
In [23]:
trn_features = get_noun(trn_features)
tst_features = get_noun(tst_features)


shared nouns
shared nouns

Saving Features


In [ ]:
# removing unnecessary columns from train and test data
trn_features = trn_features.iloc[:, 8:]
tst_features = tst_features.iloc[:, 5:]

In [3]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')

In [34]:
X_train.shape, X_test.shape


Out[34]:
((404290, 46), (2345796, 46))

In [35]:
X_train = pd.concat((X_train, trn_features), axis=1)
X_test = pd.concat((X_test, tst_features), axis=1)
X_train.shape, X_test.shape


Out[35]:
((404290, 79), (2345796, 79))

In [ ]:
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

In [5]:
X_train.columns.values, X_train.shape


Out[5]:
(array(['word_match', 'tfidf_wm', 'tfidf_wm_stops', 'jaccard', 'wc_diff',
        'wc_ratio', 'wc_diff_unique', 'wc_ratio_unique', 'wc_diff_unq_stop',
        'wc_ratio_unique_stop', 'same_start', 'char_diff',
        'char_diff_unq_stop', 'total_unique_words', 'total_unq_words_stop',
        'char_ratio', 'len_q1', 'len_q2', 'diff_len', 'len_char_q1',
        'len_char_q2', 'len_word_q1', 'len_word_q2', 'common_words',
        'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
        'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
        'fuzz_token_set_ratio', 'fuzz_token_sort_ratio', 'wmd', 'norm_wmd',
        'cosine_distance', 'cityblock_distance', 'canberra_distance',
        'minkowski_distance', 'braycurtis_distance', 'skew_q1vec',
        'skew_q2vec', 'kur_q1vec', 'q1_q2_intersect', 'q1_freq', 'q2_freq',
        'q1_pr', 'q2_pr', 'z_tfidf_sum1', 'z_tfidf_sum2', 'z_tfidf_mean1',
        'z_tfidf_mean2', 'z_tfidf_len1', 'z_tfidf_len2', 'caps_count_q1',
        'caps_count_q2', 'diff_caps', 'exactly_same', 'duplicated',
        'q1_how', 'q2_how', 'how_both', 'q1_what', 'q2_what', 'what_both',
        'q1_which', 'q2_which', 'which_both', 'q1_who', 'q2_who',
        'who_both', 'q1_where', 'q2_where', 'where_both', 'q1_when',
        'q2_when', 'when_both', 'q1_why', 'q2_why', 'why_both',
        'z_noun_match'], dtype=object), (404290, 79))

In [6]:
X_train.shape


Out[6]:
(404290, 79)

In total, 79 features are generated, including the PageRank features (q1_pr and q2_pr), which are created in page_rank.ipynb.
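
page_rank.ipynb itself is not reproduced here. As a rough idea of where q1_pr and q2_pr come from, a common approach during the competition was to treat every question as a node, add an edge for each question pair, and run PageRank over that graph. A minimal sketch assuming networkx follows; the actual notebook may compute these features differently.

In [ ]:
# minimal sketch of the PageRank leak features; the real page_rank.ipynb may differ
import networkx as nx

g = nx.Graph()
g.add_edges_from(zip(train['question1'].astype(str), train['question2'].astype(str)))
g.add_edges_from(zip(test['question1'].astype(str), test['question2'].astype(str)))

pr = nx.pagerank(g)  # power iteration with damping factor 0.85 by default

train['q1_pr'] = train['question1'].astype(str).map(pr)
train['q2_pr'] = train['question2'].astype(str).map(pr)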