Feature: "Jaccard with WHQ" (@dasolmar)

Based on the kernel XGB with whq jaccard by David Solis.

Imports

This utility package imports numpy, pandas, matplotlib and a helper kg module into the root namespace.


In [1]:
from pygoose import *

NLTK tools


In [2]:
import nltk

In [3]:
from collections import Counter
from nltk.corpus import stopwords

In [4]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yuriyguts/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[4]:
True

Config

Automatically discover the paths to various data folders and compose the project structure.


In [5]:
project = kg.Project.discover()

Identifier for storing these features on disk and referring to them later.


In [6]:
feature_list_id = '3rdparty_dasolmar_whq'

Read data

Original question sets.


In [7]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')

NLTK built-in stopwords.


In [8]:
stops = set(stopwords.words("english"))

Build features


In [9]:
# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    return 0 if count < min_count else 1 / (count + eps)

In [10]:
def add_word_count(x, df, word):
    x['das_q1_' + word] = df['question1'].apply(lambda x: (word in str(x).lower())*1)
    x['das_q2_' + word] = df['question2'].apply(lambda x: (word in str(x).lower())*1)
    x['das_' + word + '_both'] = x['das_q1_' + word] * x['das_q2_' + word]

In [11]:
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [12]:
def word_shares(row):
    q1_list = str(row['question1']).lower().split()
    q1 = set(q1_list)
    q1words = q1.difference(stops)
    if len(q1words) == 0:
        return '0:0:0:0:0:0:0:0'

    q2_list = str(row['question2']).lower().split()
    q2 = set(q2_list)
    q2words = q2.difference(stops)
    if len(q2words) == 0:
        return '0:0:0:0:0:0:0:0'

    words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max(len(q1_list), len(q2_list))

    q1stops = q1.intersection(stops)
    q2stops = q2.intersection(stops)

    q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])
    q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])

    shared_2gram = q1_2gram.intersection(q2_2gram)

    shared_words = q1words.intersection(q2words)
    shared_weights = [weights.get(w, 0) for w in shared_words]
    q1_weights = [weights.get(w, 0) for w in q1words]
    q2_weights = [weights.get(w, 0) for w in q2words]
    total_weights = q1_weights + q1_weights

    R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share
    R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words)) #count share
    R31 = len(q1stops) / len(q1words) #stops in q1
    R32 = len(q2stops) / len(q2words) #stops in q2
    Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))
    Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator
    if len(q1_2gram) + len(q2_2gram) == 0:
        R2gram = 0
    else:
        R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))
    return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)

In [13]:
df = pd.concat([df_train, df_test])
df['word_shares'] = df.apply(word_shares, axis=1, raw=True)

x = pd.DataFrame()

x['das_word_match']       = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
x['das_word_match_2root'] = np.sqrt(x['das_word_match'])
x['das_tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
x['das_shared_count']     = df['word_shares'].apply(lambda x: float(x.split(':')[2]))

x['das_stops1_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
x['das_stops2_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
x['das_shared_2gram']     = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
x['das_cosine']           = df['word_shares'].apply(lambda x: float(x.split(':')[6]))
x['das_words_hamming']    = df['word_shares'].apply(lambda x: float(x.split(':')[7]))
x['das_diff_stops_r']     = np.abs(x['das_stops1_ratio'] - x['das_stops2_ratio'])

x['das_len_q1'] = df['question1'].apply(lambda x: len(str(x)))
x['das_len_q2'] = df['question2'].apply(lambda x: len(str(x)))
x['das_diff_len'] = np.abs(x['das_len_q1'] - x['das_len_q2'])

x['das_caps_count_q1'] = df['question1'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
x['das_caps_count_q2'] = df['question2'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
x['das_diff_caps'] = np.abs(x['das_caps_count_q1'] - x['das_caps_count_q2'])

x['das_len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
x['das_len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
x['das_diff_len_char'] = np.abs(x['das_len_char_q1'] - x['das_len_char_q2'])

x['das_len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
x['das_len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
x['das_diff_len_word'] = np.abs(x['das_len_word_q1'] - x['das_len_word_q2'])

x['das_avg_word_len1'] = x['das_len_char_q1'] / x['das_len_word_q1']
x['das_avg_word_len2'] = x['das_len_char_q2'] / x['das_len_word_q2']
x['das_diff_avg_word'] = np.abs(x['das_avg_word_len1'] - x['das_avg_word_len2'])

# x['exactly_same'] = (df['question1'] == df['question2']).astype(int)
# x['duplicated'] = df.duplicated(['question1','question2']).astype(int)

whq_words = ['how', 'what', 'which', 'who', 'where', 'when', 'why']
for whq in whq_words:
    add_word_count(x, df, whq)


/home/yuriyguts/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:30: RuntimeWarning: invalid value encountered in double_scalars
/home/yuriyguts/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:35: RuntimeWarning: invalid value encountered in true_divide
/home/yuriyguts/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:30: RuntimeWarning: invalid value encountered in long_scalars

In [14]:
whq_columns_q1 = ['das_q1_' + whq for whq in whq_words]
whq_columns_q2 = ['das_q2_' + whq for whq in whq_words]

In [15]:
x['whq_count_q1'] = x[whq_columns_q1].sum(axis=1)
x['whq_count_q2'] = x[whq_columns_q2].sum(axis=1)
x['whq_count_diff'] = np.abs(x['whq_count_q1'] - x['whq_count_q2'])

In [16]:
feature_names = list(x.columns.values)
print("Features: {}".format(feature_names))


Features: ['das_word_match', 'das_word_match_2root', 'das_tfidf_word_match', 'das_shared_count', 'das_stops1_ratio', 'das_stops2_ratio', 'das_shared_2gram', 'das_cosine', 'das_words_hamming', 'das_diff_stops_r', 'das_len_q1', 'das_len_q2', 'das_diff_len', 'das_caps_count_q1', 'das_caps_count_q2', 'das_diff_caps', 'das_len_char_q1', 'das_len_char_q2', 'das_diff_len_char', 'das_len_word_q1', 'das_len_word_q2', 'das_diff_len_word', 'das_avg_word_len1', 'das_avg_word_len2', 'das_diff_avg_word', 'das_q1_how', 'das_q2_how', 'das_how_both', 'das_q1_what', 'das_q2_what', 'das_what_both', 'das_q1_which', 'das_q2_which', 'das_which_both', 'das_q1_who', 'das_q2_who', 'das_who_both', 'das_q1_where', 'das_q2_where', 'das_where_both', 'das_q1_when', 'das_q2_when', 'das_when_both', 'das_q1_why', 'das_q2_why', 'das_why_both', 'whq_count_q1', 'whq_count_q2', 'whq_count_diff']

In [17]:
X_train = x[:df_train.shape[0]].values
X_test  = x[df_train.shape[0]:].values

Save features


In [18]:
project.save_features(X_train, X_test, feature_names, feature_list_id)