Feature: "Abhishek's Features"

Based on Abhishek Thakur's feature set, published on GitHub and the Kaggle forums.

Imports

The pygoose utility package imports numpy, pandas, matplotlib, and the helper kg module into the root namespace.


In [1]:
from pygoose import *
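
For reference, the star import is roughly equivalent to the following. This is a sketch based on the description above, not pygoose's actual `__init__`; in particular, the tqdm-backed `progressbar` alias is an assumption.

    # Approximately what `from pygoose import *` exposes (sketch only):
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from pygoose import kg                 # project discovery and feature I/O helpers
    from tqdm import tqdm as progressbar   # assumption: progressbar is a tqdm wrapper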

In [2]:
import os
import warnings

In [3]:
import gensim

In [4]:
from fuzzywuzzy import fuzz

In [5]:
from nltk import word_tokenize
from nltk.corpus import stopwords

In [6]:
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

Config

Automatically discover the paths to various data folders and compose the project structure.


In [7]:
project = kg.Project.discover()

Identifier for storing these features on disk and referring to them later.


In [8]:
feature_list_id = '3rdparty_abhishek'

The path to the saved GoogleNews Word2Vec model.


In [9]:
google_news_model_path = os.path.join(project.aux_dir, 'word2vec', 'GoogleNews-vectors-negative300.bin.gz')
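
Loading this model takes several minutes, and a bad path would only surface deep inside `extend_with_features`, so an optional guard (not in the original notebook) can fail fast:

    # Optional: fail fast if the pretrained model file is missing.
    assert os.path.isfile(google_news_model_path), \
        'GoogleNews model not found at {}'.format(google_news_model_path)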

Read data

Original question datasets.


In [10]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('').drop(['id', 'qid1', 'qid2'], axis=1)
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('').drop(['test_id'], axis=1)

In [11]:
stop_words = stopwords.words('english')

Build features

Raw implementations from Abhishek below (excluding the features we already have in other notebooks):


In [12]:
def wmd(model, s1, s2):
    # Word Mover's Distance between two questions, computed on
    # lowercased tokens with English stop words removed.
    s1 = [w for w in str(s1).lower().split() if w not in stop_words]
    s2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(s1, s2)

In [13]:
def norm_wmd(model, s1, s2):
    # Identical to wmd(); the "normalized" variant comes from passing a
    # model whose vectors were L2-normalized via init_sims(replace=True).
    s1 = [w for w in str(s1).lower().split() if w not in stop_words]
    s2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(s1, s2)

In [14]:
def sent2vec(model, s):
    # Bag-of-words sentence embedding: sum the vectors of all alphabetic,
    # non-stopword tokens, then L2-normalize the result.
    words = word_tokenize(s.lower())
    words = [w for w in words if w not in stop_words and w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            # Skip out-of-vocabulary words.
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    # If every token was filtered out or OOV, v is 0 and this yields NaNs,
    # which are zeroed out by np.nan_to_num downstream.
    return v / np.sqrt((v ** 2).sum())
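
A toy sanity check of `sent2vec` with a small in-memory model. This is a sketch that assumes gensim 4.x, where `KeyedVectors.add_vectors` exists; the notebook itself passes the 300-d GoogleNews model loaded later.

    # Hypothetical toy model: two 3-d word vectors instead of GoogleNews' 300-d.
    toy = gensim.models.KeyedVectors(vector_size=3)
    toy.add_vectors(['hello', 'world'], np.random.rand(2, 3).astype(np.float32))

    v = sent2vec(toy, 'Hello, world!')   # punctuation and case are stripped
    print(v.shape)                       # (3,)
    print(np.linalg.norm(v))             # ~1.0: the sentence vector is unit-length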

In [15]:
def extend_with_features(data):
    # Token overlap and fuzzy string-matching features.
    data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
    data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)

    # Word Mover's Distance on the raw GoogleNews vectors.
    model = gensim.models.KeyedVectors.load_word2vec_format(google_news_model_path, binary=True)
    data['wmd'] = data.apply(lambda x: wmd(model, x['question1'], x['question2']), axis=1)

    # WMD again, on a second copy of the model whose vectors are
    # L2-normalized in place by init_sims(replace=True).
    norm_model = gensim.models.KeyedVectors.load_word2vec_format(google_news_model_path, binary=True)
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)

    # 300-d sentence vectors for both questions.
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in progressbar(enumerate(data.question1.values), total=len(data)):
        question1_vectors[i, :] = sent2vec(model, q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in progressbar(enumerate(data.question2.values), total=len(data)):
        question2_vectors[i, :] = sent2vec(model, q)

    # Zero out the NaNs produced by sent2vec for all-OOV questions.
    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)

    # Pairwise distances between the two sentence vectors.
    # NB: scipy's jaccard treats its inputs as boolean, so on dense
    # real-valued vectors it is ~1.0 unless the vectors coincide.
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]

    # Per-vector distribution statistics.
    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]

In [16]:
# Distance computations on all-zero sentence vectors emit RuntimeWarnings;
# suppress them for the long feature-building runs below.
warnings.filterwarnings('ignore')

In [ ]:
extend_with_features(df_train)


100%|██████████| 404290/404290 [01:00<00:00, 6721.99it/s]
100%|██████████| 404290/404290 [01:03<00:00, 6407.20it/s]

In [ ]:
extend_with_features(df_test)


100%|██████████| 2345796/2345796 [05:55<00:00, 6598.74it/s]
100%|██████████| 2345796/2345796 [05:56<00:00, 6571.32it/s]

In [ ]:
df_train.drop(['is_duplicate', 'question1', 'question2'], axis=1, inplace=True)
df_test.drop(['question1', 'question2'], axis=1, inplace=True)

Build final features


In [ ]:
X_train = np.array(df_train.values, dtype='float64')
X_test = np.array(df_test.values, dtype='float64')

In [ ]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)


X_train: (404290, 16)
X_test:  (2345796, 16)

In [ ]:
df_train.describe().T


Out[ ]:
count mean std min 25% 50% 75% max
common_words 404290.000000 4.511586 3.098622 0.000000 2.000000 4.000000 6.000000 41.000000
fuzz_qratio 404290.000000 61.951211 18.374683 0.000000 47.000000 61.000000 77.000000 100.000000
fuzz_WRatio 404290.000000 76.500294 15.293315 0.000000 66.000000 85.000000 86.000000 100.000000
wmd 404290.000000 inf nan 0.000000 1.307707 2.068217 2.883125 inf
norm_wmd 404290.000000 inf nan 0.000000 0.470532 0.764727 1.053355 inf
cosine_distance 402512.000000 0.268459 0.210211 -0.000000 0.113929 0.221445 0.366980 1.120409
cityblock_distance 404290.000000 9.180780 4.318149 0.000000 6.603080 9.215482 11.884680 20.709869
jaccard_distance 403768.000000 0.929334 0.256263 0.000000 1.000000 1.000000 1.000000 1.000000
canberra_distance 404290.000000 131.849568 48.919598 0.000000 112.812653 138.671858 161.453470 300.000000
euclidean_distance 404290.000000 0.663583 0.311963 0.000000 0.477137 0.666145 0.858636 1.496936
minkowski_distance 404290.000000 0.299345 0.140795 0.000000 0.215221 0.300419 0.387252 0.682163
braycurtis_distance 403768.000000 0.370722 0.200331 0.000000 0.245803 0.352763 0.474900 1.136902
skew_q1vec 404290.000000 0.010074 0.135183 -0.669451 -0.080360 0.011263 0.101130 0.675833
skew_q2vec 404290.000000 0.010129 0.134776 -0.669451 -0.080048 0.011207 0.100332 0.781670
kur_q1vec 404290.000000 -0.060688 0.308915 -3.000000 -0.239163 -0.082287 0.101569 2.288984
kur_q2vec 404290.000000 -0.061901 0.306479 -3.000000 -0.240417 -0.083331 0.100070 2.227666
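
Note the inf mean and nan std for wmd and norm_wmd above: gensim's WMD is infinite whenever one question has no in-vocabulary tokens left after stop-word filtering. If the downstream model cannot digest infinities, one option (a post-hoc sketch, not part of the original pipeline) is to cap the non-finite values at the largest finite training value:

    # Optional: cap non-finite values in the wmd / norm_wmd columns
    # (indices 3 and 4 in the feature order below) at the largest finite
    # training value, so scalers and linear models don't choke.
    for col in (3, 4):
        finite_max = X_train[np.isfinite(X_train[:, col]), col].max()
        for X in (X_train, X_test):
            X[~np.isfinite(X[:, col]), col] = finite_max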

Save features


In [ ]:
feature_names = [
    'abh_common_words',
    'abh_fuzz_qratio',
    'abh_fuzz_WRatio',
    'abh_wmd',
    'abh_norm_wmd',
    'abh_cosine_distance',
    'abh_cityblock_distance',
    'abh_jaccard_distance',
    'abh_canberra_distance',
    'abh_euclidean_distance',
    'abh_minkowski_distance',
    'abh_braycurtis_distance',
    'abh_skew_q1vec',
    'abh_skew_q2vec',
    'abh_kur_q1vec',
    'abh_kur_q2vec',
]
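
A quick consistency check (not in the original notebook) that the name list matches the feature matrices:

    # One name per feature column in both matrices.
    assert len(feature_names) == X_train.shape[1] == X_test.shape[1] == 16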

In [ ]:
project.save_features(X_train, X_test, feature_names, feature_list_id)