Feature: Intersections Weighted by Word Match

Question intersections weighted by word match ratio (based on the kernel by @skihikingkevin).

Imports

This utility package imports numpy, pandas, matplotlib and a helper kg module into the root namespace.



In [1]:

    
from pygoose import *



In [2]:

    
from collections import defaultdict



In [3]:

    
import seaborn as sns



In [4]:

    
import nltk



In [5]:

    
nltk.download('stopwords')









    



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yuriyguts/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!






    Out[5]:





True

Config

Automatically discover the paths to various data folders and compose the project structure.



In [6]:

    
project = kg.Project.discover()

Identifier for storing these features on disk and referring to them later.



In [7]:

    
feature_list_id = 'wm_intersect'

Load Data

Original question datasets.



In [8]:

    
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('none')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('none')

Build features



In [9]:

    
df_all_pairs = pd.concat([
    df_train[['question1', 'question2']],
    df_test[['question1', 'question2']]
], axis=0).reset_index(drop='index')



In [10]:

    
stops = set(nltk.corpus.stopwords.words('english'))



In [11]:

    
def word_match_share(pair):
    q1 = str(pair[0]).lower().split()
    q2 = str(pair[1]).lower().split()
    q1words = {}
    q2words = {}
    
    for word in q1:
        if word not in stops:
            q1words[word] = 1
    for word in q2:
        if word not in stops:
            q2words[word] = 1
    
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2)) / (len(q1words) + len(q2words))

    return R



In [12]:

    
wms = kg.jobs.map_batch_parallel(
    df_all_pairs[['question1', 'question2']].as_matrix(),
    item_mapper=word_match_share,
    batch_size=1000,
)









    



Batches: 100%|██████████| 2751/2751 [00:03<00:00, 830.27it/s]



In [13]:

    
q_dict = defaultdict(dict)
for i in progressbar(range(len(wms))):
    q_dict[df_all_pairs.question1[i]][df_all_pairs.question2[i]] = wms[i]
    q_dict[df_all_pairs.question2[i]][df_all_pairs.question1[i]] = wms[i]









    



100%|██████████| 2750086/2750086 [03:06<00:00, 14750.65it/s]



In [14]:

    
def q1_q2_intersect(row):
    return len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']])))



In [15]:

    
def q1_q2_wm_ratio(row):
    q1 = q_dict[row['question1']]
    q2 = q_dict[row['question2']]
    
    inter_keys = set(q1.keys()).intersection(set(q2.keys()))
    if len(inter_keys) == 0:
        return 0
    
    inter_wm = 0
    total_wm = 0
    
    for q, wm in q1.items():
        if q in inter_keys:
            inter_wm += wm
        total_wm += wm
    
    for q, wm in q2.items():
        if q in inter_keys:
            inter_wm += wm
        total_wm += wm
    
    if total_wm == 0:
        return 0
    
    return inter_wm / total_wm



In [16]:

    
df_train['q1_q2_wm_ratio'] = df_train.apply(q1_q2_wm_ratio, axis=1, raw=True)
df_test['q1_q2_wm_ratio'] = df_test.apply(q1_q2_wm_ratio, axis=1, raw=True)



In [17]:

    
df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)

Visualize



In [18]:

    
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
intersect_counts = df_train.q1_q2_intersect.value_counts()
sns.barplot(intersect_counts.index[:20], intersect_counts.values[:20])

plt.subplot(1, 2, 2)
df_train['q1_q2_wm_ratio'].plot.hist()









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fdd694ff828>



In [19]:

    
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.violinplot(x='is_duplicate', y='q1_q2_wm_ratio', data=df_train)

plt.subplot(1, 2, 2)
sns.violinplot(x='is_duplicate', y='q1_q2_intersect', data=df_train)









    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fdd6b71b2b0>



In [20]:

    
df_train.plot.scatter(x='q1_q2_intersect', y='q1_q2_wm_ratio', figsize=(12, 6))
print(df_train[['q1_q2_intersect', 'q1_q2_wm_ratio']].corr())









    



                 q1_q2_intersect  q1_q2_wm_ratio
q1_q2_intersect         1.000000        0.684574
q1_q2_wm_ratio          0.684574        1.000000

Build final features



In [21]:

    
columns_to_keep = [
    'q1_q2_intersect',
    'q1_q2_wm_ratio',
]



In [22]:

    
X_train = df_train[columns_to_keep].values
X_test = df_test[columns_to_keep].values

Save features



In [23]:

    
feature_names = columns_to_keep



In [24]:

    
project.save_features(X_train, X_test, feature_names, feature_list_id)