In [21]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from __future__ import division
import re, time, os, gc, datetime
import sys
import string

import numpy as np
import pandas as pd
import scipy
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils
from utils import logging_utils, time_utils, pkl_utils
import argparse
import functools
from collections import defaultdict
from nltk.corpus import stopwords
from collections import Counter

from multiprocessing import Pool, cpu_count
# Query the CPU count once and derive both settings from it: one worker
# process per core and one DataFrame chunk per worker.
num_cores = cpu_count()  # number of worker processes to start
num_partitions = num_cores  # number of chunks the DataFrame is split into
# The parenthesised form prints the same text on Python 2 and is valid Python 3.
print(num_cores)

def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    ``df`` is cut into ``num_partitions`` chunks with ``np.array_split``; the
    chunks are mapped over a pool of ``num_cores`` processes and the frames
    returned by ``func`` are concatenated in chunk order.

    Args:
        df: DataFrame to process.
        func: Callable that takes one DataFrame chunk and returns a DataFrame.
            It is sent to the worker processes, so it must be picklable.

    Returns:
        The concatenation of ``func(chunk)`` over all chunks.
    """
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        result = pd.concat(pool.map(func, df_split))
    finally:
        # Release the workers even when ``func`` raises in a child; otherwise
        # the processes stay alive for the lifetime of the notebook kernel.
        pool.close()
        pool.join()
    return result


# df =  pd.read_csv(config.RAW_PATH+'train.csv',nrows=200000)
# df['question1'] = df['question1'].astype(str)
# df['question2'] = df['question2'].astype(str)
# train = df.reset_index()


28

In [22]:
# Read both competition splits (append .sample(n=1000) to either read for a
# quick dry run). Test rows have no label, so they get the sentinel -1.
train_orig = pd.read_csv(config.RAW_PATH + 'train.csv', header=0)
test_orig = pd.read_csv(config.RAW_PATH + 'test.csv', header=0)
test_orig['is_duplicate'] = -1

# Stack train on top of test, keeping only the question text and the label.
pair_columns = ['question1', 'question2', 'is_duplicate']
train = pd.concat(
    [train_orig[pair_columns], test_orig[pair_columns]],
    axis=0,
).reset_index(drop=True)

# Force both text columns to str; missing questions become the string 'nan'.
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)

# Drop the per-split frames and reclaim their memory before feature building.
del train_orig, test_orig
gc.collect()
train.head()


Out[22]:
165
Out[22]:
question1 question2 is_duplicate
0 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh... 0
1 What is the story of Kohinoor (Koh-i-Noor) Dia... What would happen if the Indian government sto... 0
2 How can I increase the speed of my internet co... How can Internet speed be increased by hacking... 0
3 Why am I mentally very lonely? How can I solve... Find the remainder when [math]23^{24}[/math] i... 0
4 Which one dissolve in water quikly sugar, salt... Which fish would survive in salt water? 0

In [23]:
def word_match_share(row, stops=None):
    """Share of the non-stopword vocabulary that both questions have in common.

    Computes ``2 * |A & B| / (|A| + |B|)`` where ``A`` and ``B`` are the sets
    of distinct tokens of ``row['question1']`` and ``row['question2']`` after
    dropping every token found in ``stops``.

    Args:
        row: Object indexable by ``'question1'`` and ``'question2'``, each
            yielding an iterable of hashable tokens.
        stops: Container of tokens to ignore. ``None`` (the default) disables
            stopword filtering.

    Returns:
        The share as a float, or ``0`` when either question has no tokens left
        after filtering.
    """
    if stops is None:
        # ``word not in None`` raises TypeError, so map the default to "no stopwords".
        stops = ()
    q1words = set(word for word in row['question1'] if word not in stops)
    q2words = set(word for word in row['question2'] if word not in stops)
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    # Every shared word is counted once per question, hence the factor of two.
    shared = len(q1words & q2words)
    return (2 * shared) / (len(q1words) + len(q2words))

def jaccard(row):
    """Jaccard similarity of the two questions' distinct tokens.

    Returns the size of the intersection divided by the size of the union of
    the token sets of ``row['question1']`` and ``row['question2']``. When both
    questions are empty the denominator is taken as 1, giving ``0.0``.
    """
    first = set(row['question1'])
    second = set(row['question2'])
    union_size = len(first | second) or 1
    return len(first & second) / union_size

def common_words(row):
    """Count the distinct tokens that occur in both questions of ``row``."""
    first = set(row['question1'])
    second = set(row['question2'])
    return len(first & second)

def total_unique_words(row):
    """Count the distinct tokens across both questions of ``row`` combined."""
    vocabulary = set(row['question1'])
    vocabulary.update(row['question2'])
    return len(vocabulary)

def total_unq_words_stop(row, stops):
    """Count the distinct tokens across both questions that are not in ``stops``."""
    vocabulary = set(row['question1']) | set(row['question2'])
    return sum(1 for token in vocabulary if token not in stops)

def wc_diff(row):
    """Absolute difference between the two questions' token counts."""
    n_first = len(row['question1'])
    n_second = len(row['question2'])
    return abs(n_first - n_second)

def wc_ratio(row):
    """Ratio of question2's token count to question1's token count.

    Returns:
        ``np.nan`` when question2 is empty, ``0.0`` when only question1 is
        empty, otherwise ``len(question2) / len(question1)`` as a float.
    """
    n_first = float(len(row['question1']))
    n_second = len(row['question2'])
    if n_second == 0:
        return np.nan
    if n_first == 0:
        # An empty question1 yields 0.0 instead of a division by zero.
        return 0.0
    return n_second / n_first

def wc_diff_unique(row):
    """Absolute difference between the questions' distinct-token counts."""
    distinct_first = set(row['question1'])
    distinct_second = set(row['question2'])
    return abs(len(distinct_first) - len(distinct_second))

def wc_ratio_unique(row):
    """Ratio of question2's distinct-token count to question1's.

    Returns:
        ``np.nan`` when question2 has no tokens, ``0.0`` when only question1
        has none, otherwise the ratio as a float.
    """
    distinct_first = float(len(set(row['question1'])))
    distinct_second = len(set(row['question2']))
    if distinct_second == 0:
        return np.nan
    if distinct_first == 0:
        # An empty question1 yields 0.0 instead of a division by zero.
        return 0.0
    return distinct_second / distinct_first

def wc_diff_unique_stop(row, stops=None):
    """Absolute difference between the questions' distinct non-stopword counts.

    Args:
        row: Object indexable by ``'question1'`` and ``'question2'``, each
            yielding an iterable of hashable tokens.
        stops: Container of tokens to ignore. ``None`` (the default) disables
            stopword filtering.

    Returns:
        The non-negative integer difference.
    """
    if stops is None:
        # ``x not in None`` raises TypeError, so map the default to "no stopwords".
        stops = ()
    n_first = sum(1 for token in set(row['question1']) if token not in stops)
    n_second = sum(1 for token in set(row['question2']) if token not in stops)
    return abs(n_first - n_second)

def wc_ratio_unique_stop(row, stops=None):
    """Ratio of question2's distinct non-stopword count to question1's.

    Args:
        row: Object indexable by ``'question1'`` and ``'question2'``, each
            yielding an iterable of hashable tokens.
        stops: Container of tokens to ignore. ``None`` (the default) disables
            stopword filtering.

    Returns:
        ``np.nan`` when question2 has no tokens left after filtering, ``0.0``
        when only question1 has none, otherwise the ratio as a float.
    """
    if stops is None:
        # ``x not in None`` raises TypeError, so map the default to "no stopwords".
        stops = ()
    n_first = float(sum(1 for token in set(row['question1']) if token not in stops))
    n_second = sum(1 for token in set(row['question2']) if token not in stops)
    if n_second == 0:
        return np.nan
    if n_first == 0:
        # An all-stopword question1 yields 0.0 instead of a division by zero.
        return 0.0
    return n_second / n_first

def same_start_word(row):
    """Flag whether both questions begin with the same token.

    Returns:
        ``1`` if the first tokens are equal, ``0`` if they differ, and
        ``np.nan`` when either question is empty.
    """
    first = row['question1']
    second = row['question2']
    if first and second:
        return int(first[0] == second[0])
    return np.nan

def char_diff(row):
    """Absolute difference in character count, with tokens joined without separators."""
    chars_first = len(''.join(row['question1']))
    chars_second = len(''.join(row['question2']))
    return abs(chars_first - chars_second)

def char_ratio(row):
    """Ratio of question2's character count to question1's.

    Characters are counted on the tokens joined without separators.

    Returns:
        ``np.nan`` when question2 has no characters, ``0.0`` when only
        question1 has none, otherwise the ratio as a float.
    """
    chars_first = len(''.join(row['question1']))
    chars_second = len(''.join(row['question2']))
    if chars_second == 0:
        return np.nan
    if chars_first == 0:
        # An empty question1 yields 0.0 instead of a division by zero.
        return 0.0
    return chars_second / chars_first

def char_diff_unique_stop(row, stops=None):
    """Absolute difference in characters over distinct non-stopword tokens.

    For each question the distinct tokens not in ``stops`` are joined without
    separators and the lengths of the two joined strings are compared.

    Args:
        row: Object indexable by ``'question1'`` and ``'question2'``, each
            yielding an iterable of string tokens.
        stops: Container of tokens to ignore. ``None`` (the default) disables
            stopword filtering.

    Returns:
        The non-negative integer difference.
    """
    if stops is None:
        # ``x not in None`` raises TypeError, so map the default to "no stopwords".
        stops = ()
    chars_first = len(''.join(token for token in set(row['question1']) if token not in stops))
    chars_second = len(''.join(token for token in set(row['question2']) if token not in stops))
    return abs(chars_first - chars_second)


def get_weight(count, eps=10000, min_count=2):
    """Inverse-frequency weight for a word seen ``count`` times.

    Words seen fewer than ``min_count`` times get weight ``0``; every other
    word gets ``1 / (count + eps)``, with ``eps`` added to the denominator as
    a smoothing constant.
    """
    return 0 if count < min_count else 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    """Weighted share of the non-stopword vocabulary common to both questions.

    Distinct tokens of each question that are not in ``stops`` are looked up in
    ``weights`` (tokens without an entry weigh 0). The result is the summed
    weight of the shared tokens, counted once per question, divided by the
    summed weight of both questions' tokens.

    Args:
        row: Object indexable by ``'question1'`` and ``'question2'``, each
            yielding an iterable of hashable tokens.
        stops: Container of tokens to ignore. ``None`` (the default) disables
            stopword filtering.
        weights: Mapping with a ``.get(token, default)`` method. Must be
            supplied; passing ``None`` fails as soon as a weight is looked up.

    Returns:
        ``0`` when either question has no tokens left after filtering;
        otherwise the weighted share, which is NaN when every remaining token
        weighs 0.
    """
    if stops is None:
        # ``word not in None`` raises TypeError, so map the default to "no stopwords".
        stops = ()
    q1words = set(word for word in row['question1'] if word not in stops)
    q2words = set(word for word in row['question2'] if word not in stops)
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = [weights.get(w, 0) for w in q1words if w in q2words] + \
                     [weights.get(w, 0) for w in q2words if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    # A pair made only of zero-weight tokens divides 0 by 0; keep the NaN
    # result but do not emit a RuntimeWarning for every such row.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.sum(shared_weights) / np.sum(total_weights)

def tfidf_word_match_share(row, weights=None):
    """Weighted share of the vocabulary common to both questions.

    Distinct tokens of each question are looked up in ``weights`` (tokens
    without an entry weigh 0). The result is the summed weight of the shared
    tokens, counted once per question, divided by the summed weight of both
    questions' tokens.

    Args:
        row: Object indexable by ``'question1'`` and ``'question2'``, each
            yielding an iterable of hashable tokens.
        weights: Mapping with a ``.get(token, default)`` method. Must be
            supplied; passing ``None`` fails as soon as a weight is looked up.

    Returns:
        ``0`` when either question is empty; otherwise the weighted share,
        which is NaN when every token weighs 0.
    """
    q1words = set(row['question1'])
    q2words = set(row['question2'])
    if not q1words or not q2words:
        # Nothing to compare when either side has no tokens at all.
        return 0

    shared_weights = [weights.get(w, 0) for w in q1words if w in q2words] + \
                     [weights.get(w, 0) for w in q2words if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    # A pair made only of zero-weight tokens divides 0 by 0; keep the NaN
    # result but do not emit a RuntimeWarning for every such row.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.sum(shared_weights) / np.sum(total_weights)

In [24]:
def build_features(train):
    """Append the pairwise question-similarity feature columns to ``train``.

    Every feature is computed row by row from the tokenised ``question1`` and
    ``question2`` columns. The stopword-aware and weighted features read the
    module-level ``stops`` and ``weights`` objects at call time.

    Args:
        train: DataFrame with ``question1`` and ``question2`` columns holding
            token lists. It is modified in place.

    Returns:
        The same ``train`` object with the feature columns added.
    """
    # (column name, row function) pairs, in the order the columns are appended.
    feature_specs = [
        ('word_match', functools.partial(word_match_share, stops=stops)),  #1
        ('tfidf_wm', functools.partial(tfidf_word_match_share, weights=weights)),  #2
        ('tfidf_wm_stops', functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)),  #3
        ('jaccard', jaccard),  #4
        ('wc_diff', wc_diff),  #5
        ('wc_ratio', wc_ratio),  #6
        ('wc_diff_unique', wc_diff_unique),  #7
        ('wc_ratio_unique', wc_ratio_unique),  #8
        ('wc_diff_unq_stop', functools.partial(wc_diff_unique_stop, stops=stops)),  #9
        ('wc_ratio_unique_stop', functools.partial(wc_ratio_unique_stop, stops=stops)),  #10
        ('same_start', same_start_word),  #11
        ('char_diff', char_diff),  #12
        ('char_diff_unq_stop', functools.partial(char_diff_unique_stop, stops=stops)),  #13
        # ('common_words', common_words),  #14 (disabled)
        ('total_unique_words', total_unique_words),  #15
        ('total_unq_words_stop', functools.partial(total_unq_words_stop, stops=stops)),  #16
        ('char_ratio', char_ratio),  #17
    ]
    for column, row_func in feature_specs:
        # No raw=True here: the row functions index by column name, which
        # needs a labelled Series; raw mode hands them a bare ndarray.
        train[column] = train.apply(row_func, axis=1)

    return train

In [25]:
from text_clean import *
# Run the two text_clean passes over both question columns, one pass at a
# time (question1 before question2), re-casting to str before each pass.
for cleaner in (text_to_wordlist, substitute_thousands):
    for column in ('question1', 'question2'):
        train[column] = train[column].astype(str).apply(cleaner)
# train = abbr_clean(train)
# Lower-case and split on whitespace: from here on each question is a list of
# tokens, so this cell must not be re-run on its own output.
for column in ('question1', 'question2'):
    train[column] = train[column].map(lambda text: str(text).lower().split())


# Three candidate stopword sets are assigned in turn; only the last assignment
# (NLTK's English list) is in effect for the feature functions. The first two
# are superseded and kept for quick switching.
stops = set(["http","www","img","border","home","body","a","about","above","after","again","against","all","am","an",
"and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't",
"cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from",
"further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers",
"herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its",
"itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought",
"our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such",
"than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're",
"they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were",
"weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would",
"wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves" ])
stops = set(['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','what','which',
              'is','if','while','this'])
# Effective stopword set.
stops = set(stopwords.words("english"))

# One Series holding every question's token list: all of question1 first,
# then all of question2.
train_qs = pd.Series(train['question1'].tolist() + train['question2'].tolist())

# Flatten the token lists into a single corpus-wide list of tokens.
words = []
for token_list in train_qs:
    words.extend(token_list)

# Corpus frequency of each token, turned into a weight by get_weight.
counts = Counter(words)
weights = dict((token, get_weight(n_seen)) for token, n_seen in counts.items())

train.head()


Out[25]:
question1 question2 is_duplicate
0 [what, is, the, step, by, step, guide, to, inv... [what, is, the, step, by, step, guide, to, inv... 0
1 [what, is, the, story, of, kohinoor, (koh-i-no... [what, would, happen, if, the, indian, governm... 0
2 [how, can, i, increase, the, speed, of, my, in... [how, can, internet, speed, be, increased, by,... 0
3 [why, am, i, mentally, very, lonely?, how, can... [find, the, remainder, when, [math]23^{24}[/ma... 0
4 [which, one, dissolve, in, water, quickly, sug... [which, fish, would, survive, in, salt, water?] 0

In [26]:
# Progress marker in the cell output.
print('Building Features')
# Single-process alternative to the parallel call below:
# train = build_features(train)
# Run build_features on chunks of the frame in worker processes and rebind
# ``train`` to the re-assembled result, which carries the feature columns.
train = parallelize_dataframe(train, build_features)


Building Features

In [27]:
# Write the whole frame (every column of ``train``) to the feature directory,
# leaving out the row index.
train.to_csv(config.FEAT_PATH+'feat_158_stpf_clean.csv',index=False)

In [7]:
# Correlation matrix on labelled rows only (rows labelled -1 are excluded).
# The numeric columns are selected explicitly: the token-list columns cannot
# be correlated, and current pandas raises on them instead of skipping them.
train[train['is_duplicate'] != -1].select_dtypes(include=[np.number]).corr()


Out[7]:
is_duplicate word_match tfidf_wm tfidf_wm_stops jaccard wc_diff wc_ratio wc_diff_unique wc_ratio_unique wc_diff_unq_stop wc_ratio_unique_stop same_start char_diff char_diff_unq_stop total_unique_words total_unq_words_stop char_ratio
is_duplicate 1.000000 0.456740 0.426720 0.424473 0.344256 -0.201759 -0.072018 -0.207670 -0.065854 -0.214178 -0.066107 0.203637 -0.211784 -0.222334 -0.289463 -0.302564 -0.044462
word_match 0.456740 1.000000 0.933074 0.929834 0.845310 -0.393913 -0.138327 -0.417275 -0.130146 -0.426188 -0.140378 0.365201 -0.402031 -0.423160 -0.506505 -0.517122 -0.098176
tfidf_wm 0.426720 0.933074 1.000000 0.997003 0.779260 -0.368303 -0.126823 -0.389549 -0.119108 -0.392971 -0.125137 0.312027 -0.377275 -0.397022 -0.481682 -0.493831 -0.090756
tfidf_wm_stops 0.424473 0.929834 0.997003 1.000000 0.755014 -0.360898 -0.124247 -0.380887 -0.116534 -0.386744 -0.123681 0.295058 -0.370341 -0.391700 -0.476174 -0.491886 -0.089168
jaccard 0.344256 0.845310 0.779260 0.755014 1.000000 -0.398219 -0.137928 -0.431076 -0.131276 -0.412190 -0.127560 0.568942 -0.394437 -0.397736 -0.484933 -0.446700 -0.094975
wc_diff -0.201759 -0.393913 -0.368303 -0.360898 -0.398219 1.000000 0.382017 0.961980 0.334620 0.853699 0.316312 -0.257883 0.922882 0.779298 0.642812 0.611834 0.281046
wc_ratio -0.072018 -0.138327 -0.126823 -0.124247 -0.137928 0.382017 1.000000 0.378106 0.985102 0.321502 0.823967 -0.089547 0.335791 0.276684 0.182496 0.173595 0.814486
wc_diff_unique -0.207670 -0.417275 -0.389549 -0.380887 -0.431076 0.961980 0.378106 1.000000 0.350310 0.870158 0.325153 -0.273255 0.903732 0.795976 0.627535 0.595260 0.280352
wc_ratio_unique -0.065854 -0.130146 -0.119108 -0.116534 -0.131276 0.334620 0.985102 0.350310 1.000000 0.293189 0.831735 -0.083595 0.295596 0.251192 0.160382 0.152093 0.804711
wc_diff_unq_stop -0.214178 -0.426188 -0.392971 -0.386744 -0.412190 0.853699 0.321502 0.870158 0.293189 1.000000 0.351880 -0.260475 0.874144 0.899551 0.620451 0.634554 0.255916
wc_ratio_unique_stop -0.066107 -0.140378 -0.125137 -0.123681 -0.127560 0.316312 0.823967 0.325153 0.831735 0.351880 1.000000 -0.078022 0.301632 0.298304 0.159025 0.153951 0.682637
same_start 0.203637 0.365201 0.312027 0.295058 0.568942 -0.257883 -0.089547 -0.273255 -0.083595 -0.260475 -0.078022 1.000000 -0.249823 -0.244585 -0.338091 -0.293970 -0.059776
char_diff -0.211784 -0.402031 -0.377275 -0.370341 -0.394437 0.922882 0.335791 0.903732 0.295596 0.874144 0.301632 -0.249823 1.000000 0.916845 0.625181 0.612403 0.281230
char_diff_unq_stop -0.222334 -0.423160 -0.397022 -0.391700 -0.397736 0.779298 0.276684 0.795976 0.251192 0.899551 0.298304 -0.244585 0.916845 1.000000 0.596425 0.614189 0.249474
total_unique_words -0.289463 -0.506505 -0.481682 -0.476174 -0.484933 0.642812 0.182496 0.627535 0.160382 0.620451 0.159025 -0.338091 0.625181 0.596425 1.000000 0.952161 0.131561
total_unq_words_stop -0.302564 -0.517122 -0.493831 -0.491886 -0.446700 0.611834 0.173595 0.595260 0.152093 0.634554 0.153951 -0.293970 0.612403 0.614189 0.952161 1.000000 0.126143
char_ratio -0.044462 -0.098176 -0.090756 -0.089168 -0.094975 0.281046 0.814486 0.280352 0.804711 0.255916 0.682637 -0.059776 0.281230 0.249474 0.131561 0.126143 1.000000

In [10]:
# Display the frame (the notebook shows a truncated rendering of it).
train


Out[10]:
question1 question2 is_duplicate word_match tfidf_wm tfidf_wm_stops jaccard wc_diff wc_ratio wc_diff_unique wc_ratio_unique wc_diff_unq_stop wc_ratio_unique_stop same_start char_diff char_diff_unq_stop total_unique_words total_unq_words_stop char_ratio
0 [what, is, the, step, by, step, guide, to, inv... [what, is, the, step, by, step, guide, to, inv... 0 0.727273 0.821476 0.815193 0.769231 2 0.857143 1 0.916667 1 0.833333 1 7 5 13 7 0.867925
1 [what, is, the, story, of, kohinoor, (koh-i-no... [what, would, happen, if, the, indian, governm... 0 0.307692 0.444512 0.445868 0.250000 5 1.625000 4 1.500000 5 2.250000 1 32 31 16 11 1.727273
2 [how, can, i, increase, the, speed, of, my, in... [how, can, internet, speed, be, increased, by,... 0 0.363636 0.245211 0.263744 0.200000 4 0.714286 4 0.714286 1 0.833333 1 10 7 20 9 0.833333
3 [why, am, i, mentally, very, lonely?, how, can... [find, the, remainder, when, [math]23^{24}[/ma... 0 0.000000 0.000000 0.000000 0.000000 2 0.818182 1 0.900000 1 1.250000 0 17 23 19 9 1.425000
4 [which, one, dissolve, in, water, quickly, sug... [which, fish, would, survive, in, salt, water?] 0 0.000000 0.006965 0.000000 0.111111 6 0.538462 6 0.538462 5 0.500000 1 32 29 18 15 0.507692
5 [astrology:, i, am, a, capricorn, sun, cap, mo... [i, am, a, triple, capricorn, (sun,, moon, and... 1 0.500000 0.454088 0.440662 0.454545 1 1.062500 2 1.133333 0 1.000000 0 4 1 22 12 1.056338
6 [should, i, buy, tiago?] [what, keeps, childern, active, and, far, from... 0 0.000000 0.000000 0.000000 0.000000 7 2.750000 6 2.500000 5 3.500000 0 36 29 14 9 3.250000
7 [how, can, i, be, a, good, geologist?] [what, should, i, do, to, be, a, great, geolog... 1 0.500000 0.801124 0.818731 0.333333 2 1.285714 2 1.285714 0 1.000000 0 9 1 12 3 1.375000
8 [when, do, you, use, シ, instead, of, し?] [when, do, you, use, "&", instead, of, "and"?] 0 0.500000 0.243085 0.215713 0.600000 0 1.000000 0 1.000000 0 1.000000 1 2 2 10 6 1.071429
9 [motorola, (company):, can, i, hack, my, chart... [how, do, i, hack, motorola, dcx3400, for, fre... 0 0.363636 0.495962 0.500431 0.200000 0 1.000000 0 1.000000 1 0.833333 0 11 14 15 9 0.788462
10 [method, to, find, separation, of, slits, usin... [what, are, some, of, the, things, technicians... 0 0.000000 0.002016 0.000000 0.041667 10 2.111111 7 1.777778 0 1.000000 0 49 15 24 14 2.000000
11 [how, do, i, read, and, find, my, youtube, com... [how, can, i, see, all, my, youtube, comments?] 1 0.571429 0.767913 0.800469 0.416667 1 0.888889 1 0.888889 1 0.750000 1 4 5 12 5 0.885714
12 [what, can, make, physics, easy, to, learn?] [how, can, you, make, physics, easy, to, learn?] 1 1.000000 0.991732 1.000000 0.666667 1 1.142857 1 1.142857 0 1.000000 0 2 0 9 4 1.066667
13 [what, was, your, first, sexual, experience, l... [what, was, your, first, sexual, experience?] 1 0.571429 0.589627 0.552714 0.625000 1 0.857143 1 0.857143 1 0.750000 1 4 4 8 5 0.891892
14 [what, are, the, laws, to, change, your, statu... [what, are, the, laws, to, change, your, statu... 0 0.818182 0.780104 0.771194 0.833333 0 1.000000 0 1.000000 0 1.000000 1 1 1 24 13 0.991304
15 [what, would, a, trump, presidency, mean, for,... [how, will, a, trump, presidency, affect, the,... 1 0.315789 0.293317 0.297928 0.148148 2 1.133333 1 1.066667 1 0.900000 0 6 13 27 16 0.925926
16 [what, does, manipulation, mean?] [what, does, manipulation, means?] 1 0.500000 0.634968 0.627995 0.600000 0 1.000000 0 1.000000 0 1.000000 1 1 1 5 3 1.040000
17 [why, do, girls, want, to, be, friends, with, ... [how, do, guys, feel, after, rejecting, a, girl?] 0 0.000000 0.004048 0.000000 0.052632 4 0.666667 4 0.666667 1 0.800000 0 13 4 19 9 0.717391
18 [why, are, so, many, quora, users, posting, qu... [why, do, people, ask, quora, questions, which... 1 0.533333 0.473425 0.494390 0.238095 1 0.928571 0 1.000000 1 0.875000 1 8 8 21 11 0.884058
19 [which, is, the, best, digital, marketing, ins... [which, is, the, best, digital, marketing, ins... 0 0.600000 0.431952 0.421760 0.636364 0 1.000000 0 1.000000 0 1.000000 1 6 6 11 7 0.884615
20 [why, do, rockets, look, white?] [why, are, rockets, and, boosters, painted, wh... 1 0.571429 0.614905 0.615649 0.333333 2 1.400000 2 1.400000 1 1.333333 1 15 11 9 5 1.681818
21 [what, causing, someone, to, be, jealous?] [what, can, i, do, to, avoid, being, jealous, ... 0 0.000000 0.005708 0.000000 0.142857 4 1.666667 4 1.666667 0 1.000000 1 9 2 14 6 1.300000
22 [what, are, the, questions, should, not, ask, ... [which, question, should, i, ask, on, quora?] 0 0.666667 0.625907 0.636520 0.333333 2 0.777778 2 0.777778 0 1.000000 0 8 1 12 4 0.794872
23 [how, much, is, 30, kv, in, hp?] [where, can, i, find, a, conversion, chart, fo... 0 0.000000 0.000000 0.000000 0.000000 4 1.571429 4 1.571429 1 1.250000 0 29 21 18 9 2.611111
24 [what, does, it, mean, that, every, time, i, l... [how, many, times, a, day, do, a, clock’s, han... 0 0.000000 0.000000 0.000000 0.000000 7 0.588235 6 0.600000 1 0.857143 0 22 0 24 13 0.650794
25 [what, are, some, tips, on, making, it, throug... [what, are, some, tips, on, making, it, throug... 0 0.769231 0.619936 0.570454 0.812500 1 1.071429 1 1.071429 1 1.166667 1 9 9 16 8 1.136364
26 [what, is, web, application?] [what, is, the, web, application, framework?] 0 0.400000 0.236355 0.231387 0.428571 2 1.500000 2 1.500000 1 1.500000 1 12 9 7 4 1.571429
27 [does, society, place, too, much, importance, ... [how, do, sports, contribute, to, the, society?] 0 0.000000 0.000000 0.000000 0.000000 1 0.875000 1 0.875000 2 0.600000 0 8 9 15 8 0.809524
28 [what, is, best, way, to, make, money, online?] [what, is, best, way, to, ask, for, money, onl... 0 0.800000 0.740553 0.741457 0.700000 1 1.125000 1 1.125000 0 1.000000 1 2 1 10 6 1.064516
29 [how, should, i, prepare, for, ca, final, law?] [how, one, should, know, that, he/she, complet... 1 0.500000 0.514736 0.509197 0.428571 4 1.500000 4 1.500000 4 2.000000 1 27 24 14 9 1.870968
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2750056 [how, should, i, by, start, product, design, w... [what, are, the, essential, facts, water, know... -1 0.000000 0.000000 0.000000 0.000000 10 1.909091 10 1.909091 9 2.500000 0 69 65 32 21 2.078125
2750057 [at, absorb, mileage, would, front, brake, pad... [what, tax, some, tips, for, replacing, brake,... -1 0.133333 0.161921 0.163882 0.052632 4 0.666667 4 0.666667 5 0.500000 0 23 30 19 14 0.616667
2750058 [what, are, ways, to, prevent, over, fitting, ... [why, should, pca, only, be, fit, on, the, tra... -1 0.333333 0.252571 0.275781 0.086957 4 1.363636 3 1.272727 0 1.000000 0 3 9 23 10 1.058824
2750059 [what, all, the, job, levels, in, apple, techn... [nit, srinagar, or, silchar, what, agartala?, ... -1 0.000000 0.002139 0.000000 0.095238 3 1.300000 3 1.300000 1 1.166667 0 10 3 21 13 1.212766
2750060 [does, it, is, pokemon, go, going, to, be, rel... [what, are, some, go, release, in, india?] -1 0.500000 0.172739 0.177472 0.200000 4 0.636364 4 0.636364 2 0.600000 0 14 13 15 6 0.666667
2750061 [how, do, you, charge, a, laptop, without, a, ... [does, a, 90, watt, laptop, charger, charge, a... -1 0.500000 0.450046 0.458757 0.266667 6 1.666667 3 1.375000 4 2.000000 0 25 14 15 9 1.675676
2750062 [how, like, the, word, "incredulous", used, in... [how, nowadays, the, word, "incredulity", used... -1 0.600000 0.366095 0.359210 0.636364 0 1.000000 0 1.000000 0 1.000000 1 4 4 11 7 1.093023
2750063 [how, does] [what, is, peer, to, they, peer, replication?] -1 0.000000 0.000000 0.000000 0.000000 5 3.500000 4 3.000000 2 0.000000 0 25 16 8 2 4.571429
2750064 [tagline, ideas, ways, to, earn, money, from, ... [what, jobs, you, can, do, helmets, home?] -1 0.222222 0.250548 0.255656 0.071429 1 0.875000 1 0.875000 3 0.500000 0 8 14 14 8 0.777778
2750065 [are, move, to, the, america, ?] [what, would, a, canadian, have, not, to, move... -1 0.750000 0.647240 0.668680 0.416667 6 2.000000 5 1.833333 2 1.666667 0 24 13 12 5 2.200000
2750066 [how, do, i, get, more, traffic, in, along, u.... [who, can, i, get, more, traffic, for, a, webs... -1 0.500000 0.352990 0.321774 0.266667 1 0.900000 1 0.900000 2 0.600000 0 7 10 15 6 0.825000
2750067 [how, does, high, useless?] [why, do, rainforests, polymers, high, biodive... -1 0.333333 0.133446 0.135273 0.111111 2 1.500000 2 1.500000 2 2.000000 0 22 24 9 5 2.157895
2750068 [what, are, the, measures, to, correct, would,... [why, was, the, trade, deficit, so, high, used... -1 0.166667 0.207532 0.213680 0.100000 1 1.090909 0 1.000000 0 1.000000 0 7 9 20 11 0.870370
2750069 [how, do, i, fax, referring, document, from, m... [how, can, i, send, a, where, fax, from, austr... -1 0.200000 0.273038 0.264339 0.294118 2 1.200000 2 1.200000 0 1.000000 1 1 8 17 9 0.979167
2750070 [why, is, the, cost, of, living, in, namibia?] [how, much, in, us, dollars, do, i, need, to, ... -1 0.153846 0.122120 0.135535 0.076923 13 2.625000 12 2.500000 7 3.333333 0 49 37 26 12 2.633333
2750071 [what, is, a, diet, plan, for, a, 21, year, ai... [what, is, healthy, diet, chat, test, 22, year... -1 0.428571 0.406626 0.406231 0.333333 1 0.909091 0 1.000000 2 1.333333 1 0 5 15 11 1.000000
2750072 [is, there, any, need, of, who, reservation, i... [why, do, low, caste, people, get, india, stil... -1 0.428571 0.561919 0.585162 0.150000 3 0.769231 3 0.769231 2 1.333333 0 17 3 20 11 0.738462
2750073 [i, have, this, belief, that, even, if, i, hav... [i, hate, my, parents, because, they, do, not,... -1 0.066667 0.081393 0.032820 0.160000 16 0.644444 12 0.657143 8 0.578947 1 82 63 50 29 0.592040
2750074 [can, i, change, myself?] [how, can, i, change, myself?] -1 1.000000 0.996406 1.000000 0.800000 1 1.250000 1 1.250000 0 1.000000 0 3 0 5 2 1.176471
2750075 [what, are, the, best, mortgage, companies, to... [what, does, a, quora, bnbr, asian, look, like?] -1 0.000000 0.002065 0.000000 0.062500 1 0.888889 1 0.888889 0 1.000000 1 9 6 16 10 0.780488
2750076 [do, you, dye, indian, your, hair?] [what, are, the, hair, for, my, birthday?] -1 0.000000 0.000000 0.000000 0.000000 1 1.166667 1 1.166667 1 0.666667 0 5 1 13 5 1.217391
2750077 [why, has, not, trump, gotten, rid, of, latin,... [what, computer, science, department, require,... -1 0.000000 0.000000 0.000000 0.000000 0 1.000000 0 1.000000 2 1.333333 0 31 36 20 14 1.688889
2750078 [what, industries, create, the, top, 100, rich... [why, are, sheikhs, not, considered, wall, the... -1 0.222222 0.195993 0.201207 0.160000 7 1.583333 7 1.636364 2 1.250000 0 28 11 25 16 1.466667
2750079 [if, i, step, 240, volts, ac, to, 120, volts, ... [i, am, working, in, an, it, company, with, 9,... -1 0.000000 0.004990 0.000000 0.102564 1 0.960000 1 1.047619 1 0.916667 0 7 14 39 23 1.077778
2750080 [what, should, is, the, average, cost, for, a,... [what, are, the, types, of, models, used, in, ... -1 0.133333 0.089639 0.087134 0.160000 17 0.370370 9 0.526316 5 0.500000 1 62 34 25 14 0.398058
2750081 [how, do, peaks, (tv, series):, why, did, lela... [what, is, the, most, study, scene, in, twin, ... -1 0.000000 0.000000 0.000000 0.000000 2 0.818182 2 0.818182 3 0.571429 0 14 18 20 11 0.714286
2750082 [what, does, be, "in, transit", mean, on, fede... [how, question, fedex, packages, delivered?] -1 0.222222 0.265210 0.268891 0.076923 4 0.555556 4 0.555556 1 0.800000 0 7 2 13 8 0.829268
2750083 [what, are, some, famous, romanian, drinks, (a... [can, a, non-alcoholic, restaurant, be, a, hug... -1 0.000000 0.000000 0.000000 0.000000 1 0.888889 2 0.777778 2 0.666667 0 15 11 16 10 0.736842
2750084 [what, were, the, best, and, worst, things, ab... [what, are, the, best, and, worst, things, exa... -1 0.869565 0.767616 0.780372 0.739130 0 1.000000 0 1.000000 1 1.090909 1 9 15 23 13 1.090000
2750085 [what, is, the, best, medication, equation, er... [how, do, i, out, get, rid, of, erectile, dysf... -1 0.444444 0.620378 0.641802 0.133333 1 1.125000 1 1.125000 1 0.800000 0 14 16 15 7 0.725490

2750086 rows × 19 columns


In [ ]:


In [19]:
# Reload the raw splits for the question co-occurrence ("magic") features.
df_train = pd.read_csv(config.RAW_PATH+'train.csv')
df_train = df_train.fillna(' ')

# NOTE(review): only df_train has missing values replaced with ' '; missing
# test questions stay NaN and are used as dictionary keys - confirm intended.
df_test = pd.read_csv(config.RAW_PATH+'test.csv')
df_test['is_duplicate'] = -1
ques = pd.concat([df_train[['question1', 'question2','is_duplicate']], \
    df_test[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop=True)

# Map every question to the set of questions it is paired with in either
# split. Iterating the two columns together avoids a scalar Series lookup
# per row and builds the same dictionary.
q_dict = defaultdict(set)
for q1, q2 in zip(ques['question1'], ques['question2']):
    q_dict[q1].add(q2)
    q_dict[q2].add(q1)

def q1_freq(row):
    """Number of distinct questions recorded in ``q_dict`` for this row's question1."""
    partners = q_dict[row['question1']]
    return len(partners)

def q2_freq(row):
    """Number of distinct questions recorded in ``q_dict`` for this row's question2."""
    partners = q_dict[row['question2']]
    return len(partners)

def q1_q2_intersect(row):
    """Number of questions that ``q_dict`` records for both question1 and question2."""
    partners_first = q_dict[row['question1']]
    partners_second = q_dict[row['question2']]
    return len(partners_first & partners_second)

# Compute the three co-occurrence features per row. No raw=True: the row
# functions index by column name, which needs a labelled Series; raw mode
# hands them a bare ndarray.
df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1)

df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1)

# Keep only the label and the new features, train rows first and test rows
# after, and write them out without the index.
test_leaky = df_test.loc[:, ['is_duplicate','q1_q2_intersect','q1_freq','q2_freq']]
del df_test
train_leaky = df_train.loc[:, ['is_duplicate','q1_q2_intersect','q1_freq','q2_freq']]
leaky = pd.concat([train_leaky, test_leaky], axis=0).reset_index(drop=True)
leaky.to_csv(config.FEAT_PATH+'magic_feature_1.csv',index=False)

In [20]:
# Display the combined co-occurrence feature frame (truncated rendering).
leaky


Out[20]:
is_duplicate q1_q2_intersect q1_freq q2_freq
0 0 0 1 2
1 0 0 8 3
2 0 0 2 1
3 0 0 1 1
4 0 0 3 1
5 1 0 1 1
6 0 0 1 1
7 1 0 1 1
8 0 1 2 3
9 0 0 1 1
10 0 0 1 1
11 1 0 1 2
12 1 1 2 2
13 1 1 2 3
14 0 3 8 6
15 1 6 11 20
16 1 0 1 1
17 0 0 1 3
18 1 22 23 70
19 0 0 3 1
20 1 0 1 4
21 0 0 2 1
22 0 1 4 15
23 0 0 1 1
24 0 0 5 8
25 0 1 7 2
26 0 1 6 2
27 0 0 1 1
28 0 0 66 1
29 1 8 11 10
... ... ... ... ...
2750056 -1 0 1 1
2750057 -1 0 1 1
2750058 -1 0 2 1
2750059 -1 0 1 1
2750060 -1 0 1 1
2750061 -1 0 8 3
2750062 -1 0 1 1
2750063 -1 0 109 1
2750064 -1 0 1 1
2750065 -1 0 1 1
2750066 -1 0 1 1
2750067 -1 0 1 1
2750068 -1 0 1 1
2750069 -1 0 1 1
2750070 -1 0 1 1
2750071 -1 0 1 1
2750072 -1 0 1 1
2750073 -1 0 1 1
2750074 -1 0 1 3
2750075 -1 0 1 1
2750076 -1 0 1 1
2750077 -1 0 1 1
2750078 -1 0 1 1
2750079 -1 0 1 1
2750080 -1 0 1 1
2750081 -1 0 1 1
2750082 -1 0 1 1
2750083 -1 0 1 1
2750084 -1 0 1 1
2750085 -1 0 1 1

2750086 rows × 4 columns