In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from __future__ import division
import re, time, os, gc, datetime
import sys
import string

import numpy as np
import pandas as pd
import scipy
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils
from utils import logging_utils, time_utils, pkl_utils
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix,hstack
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy import sparse as ssp
from sklearn.datasets import dump_svmlight_file,load_svmlight_file
from sklearn.utils import resample,shuffle
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
import distance
from sklearn.model_selection import KFold
from nltk.stem.wordnet import WordNetLemmatizer
from multiprocessing import Pool, cpu_count


cpu_num = cpu_count()
num_partitions = cpu_num #number of partitions to split dataframe
num_cores = cpu_num #number of cores on your machine
print cpu_num

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


28

In [23]:
train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)#.sample(n=1000)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)#.sample(n=1000)
test_orig['is_duplicate'] = -1

train = pd.concat([train_orig[['question1', 'question2','is_duplicate']], \
        test_orig[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop=True)
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)
print train.shape
del train_orig, test_orig


(2750086, 3)

In [24]:
from text_clean import *

train.head()
train['question1'] = train['question1'].apply(lambda x:substitute_thousands(x))
train['question2'] = train['question2'].apply(lambda x:substitute_thousands(x))
train['question1'] = train['question1'].apply(lambda x:text_to_wordlist(x))
train['question2'] = train['question2'].apply(lambda x:text_to_wordlist(x))
# train = abbr_clean(train)

train.head()
train['question1'] = train['question1'].astype(str).apply(lambda x:stem_str(x.lower(), 
                                                        lemmatize=True, stem=False, stops=stops_eng))
train['question2'] = train['question2'].astype(str).apply(lambda x:stem_str(x.lower(), 
                                                        lemmatize=True, stem=False, stops=stops_eng))
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)
train.head()


Out[24]:
question1 question2 is_duplicate
0 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh... 0
1 What is the story of Kohinoor (Koh-i-Noor) Dia... What would happen if the Indian government sto... 0
2 How can I increase the speed of my internet co... How can Internet speed be increased by hacking... 0
3 Why am I mentally very lonely? How can I solve... Find the remainder when [math]23^{24}[/math] i... 0
4 Which one dissolve in water quikly sugar, salt... Which fish would survive in salt water? 0
Out[24]:
question1 question2 is_duplicate
0 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh... 0
1 What is the story of Kohinoor (Koh - i - Noor)... What would happen if the Indian government sto... 0
2 How can I increase the speed of my internet co... How can Internet speed be increased by hacking... 0
3 Why am I mentally very lonely? How can I solve... find the remainder when [math]23 ^ {24}[ math]... 0
4 Which one dissolve in water quickly sugar salt... Which fish would survive in salt water? 0
Out[24]:
question1 question2 is_duplicate
0 step step guide invest share market india step step guide invest share market 0
1 story kohinoor koh noor diamond would happen indian government stole kohinoor ... 0
2 increase speed internet connection using vpn internet speed increased hacking dns 0
3 mentally lonely solve find remainder math 23 24 math divided 24 23 0
4 one dissolve water quickly sugar salt methane ... fish would survive salt water 0

In [4]:
def jaccard_ngram(obs, target, ngram=1, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    return dist_utils._jaccard_coef(obs_ngrams, target_ngrams)

def dicedistence_ngram(obs, target, ngram=1, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    return dist_utils._dice_dist(obs_ngrams, target_ngrams)

def compression_dist(obs, target):
    return dist_utils._compression_dist(obs, target)

def edit_dist(obs, target):
    return dist_utils._edit_dist(obs, target)

def compression_dist_ngram(obs, target, ngram=2, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []
    for w1 in obs_ngrams:
        _val_list = []
        for w2 in target_ngrams:
            s = dist_utils._compression_dist(w1, w2)
            _val_list.append(s)
        if len(_val_list) == 0:
            _val_list = [-1]
        val_list.append( max(_val_list) )
    if len(val_list) == 0:
        val_list = [-1]
    return min(val_list)

def edit_dist_ngram(obs, target, ngram=2, token_pattern=" ", agg=[np.min, np.max]):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []

    for w1 in obs_ngrams:
        _val_list = []
        for w2 in target_ngrams:
            s = dist_utils._edit_dist(w1, w2)
            _val_list.append(s)
        if len(_val_list) == 0:
            _val_list = [-1]
        val_list.append( agg[0](_val_list) )
    if len(val_list) == 0:
        val_list = [-1]
    return float(agg[1](val_list))

In [5]:
np_dict = { 'mean':np.mean, 'min':np.mean, 'max':np.max, 'median':np.median, 'std':np.std }

def multiply_columns(train):
    for NGRAMS in [1,2,3]:
        train['jaccard_n%s'%NGRAMS] = train.apply(lambda x: jaccard_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)
        train['dicedistence_n%s'%NGRAMS] = train.apply(lambda x: dicedistence_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)

    train['compression_dist'] = train.apply(lambda x: compression_dist(x['question1'],x['question2']), axis=1)
    train['edit_dist'] = train.apply(lambda x: edit_dist(x['question1'],x['question2']), axis=1)
    return train                 

train = parallelize_dataframe(train, multiply_columns)
print train.shape

def multiply_columns(train):
    for AGG_NGRAMS in [1,2,3]:
        for agg1 in ["mean", "max", "min", "median"]:
            for agg2 in np_dict.keys():
                AGG_BY = agg1 + '_' + agg2
                AGG_FUNC = [np_dict[agg1],np_dict[agg2]]
                # train['compression_dist_agg_n%s'%AGG_NGRAMS] = train.apply(lambda x: compression_dist_ngram(x['question1'],x['question2'],ngram=AGG_NGRAMS), axis=1)
                train['edit_dist_agg_n%s_%s'%(AGG_NGRAMS,AGG_BY)] = train.apply(lambda x: 
                    edit_dist_ngram(x['question1'],x['question2'], ngram=AGG_NGRAMS, agg=AGG_FUNC), axis=1)
    return train

train = parallelize_dataframe(train, multiply_columns)
print train.shape
print datetime.datetime.now()


(2750086, 11)
(2750086, 71)
2017-06-02 17:29:44.978151

In [14]:
train.corr()


Out[14]:
is_duplicate jaccard_n1 dicedistence_n1 jaccard_n2 dicedistence_n2 jaccard_n3 dicedistence_n3 compression_dist edit_dist edit_dist_agg_n1_std_std ... edit_dist_agg_n3_median_std edit_dist_agg_n3_median_max edit_dist_agg_n3_median_min edit_dist_agg_n3_median_median edit_dist_agg_n3_median_mean edit_dist_agg_n3_mean_std edit_dist_agg_n3_mean_max edit_dist_agg_n3_mean_min edit_dist_agg_n3_mean_median edit_dist_agg_n3_mean_mean
is_duplicate 1.000000 0.399697 0.383131 0.329080 0.327099 0.264573 0.263297 -0.360152 -0.339477 -0.039252 ... -0.016777 -0.283176 -0.293514 -0.289252 -0.293514 -0.015715 -0.304435 -0.322020 -0.316129 -0.322020
jaccard_n1 0.399697 1.000000 0.978360 0.813812 0.823480 0.659944 0.671698 -0.879651 -0.788190 -0.013644 ... 0.085874 -0.536519 -0.593418 -0.588957 -0.593418 0.076215 -0.600236 -0.672360 -0.667208 -0.672360
dicedistence_n1 0.383131 0.978360 1.000000 0.736134 0.769428 0.556850 0.580012 -0.873225 -0.777030 0.111737 ... 0.126440 -0.500932 -0.569743 -0.563632 -0.569743 0.121130 -0.564974 -0.651599 -0.645253 -0.651599
jaccard_n2 0.329080 0.813812 0.736134 1.000000 0.979313 0.891471 0.897000 -0.729741 -0.737911 -0.176453 ... 0.073059 -0.488975 -0.541644 -0.537867 -0.541644 0.078950 -0.550262 -0.622252 -0.616733 -0.622252
dicedistence_n2 0.327099 0.823480 0.769428 0.979313 1.000000 0.800541 0.830115 -0.738369 -0.743548 -0.117109 ... 0.123693 -0.442300 -0.510096 -0.505781 -0.510096 0.136000 -0.507818 -0.598043 -0.591705 -0.598043
jaccard_n3 0.264573 0.659944 0.556850 0.891471 0.800541 1.000000 0.983398 -0.589671 -0.616648 -0.245261 ... 0.022048 -0.402485 -0.435692 -0.434516 -0.435692 0.024909 -0.456936 -0.505975 -0.503958 -0.505975
dicedistence_n3 0.263297 0.671698 0.580012 0.897000 0.830115 0.983398 1.000000 -0.596430 -0.628217 -0.215362 ... 0.071490 -0.348502 -0.396816 -0.396033 -0.396816 0.081255 -0.409410 -0.476869 -0.475113 -0.476869
compression_dist -0.360152 -0.879651 -0.873225 -0.729741 -0.738369 -0.589671 -0.596430 1.000000 0.840066 -0.071826 ... -0.062998 0.602621 0.650153 0.648976 0.650153 -0.051870 0.660838 0.724608 0.725285 0.724608
edit_dist -0.339477 -0.788190 -0.777030 -0.737911 -0.743548 -0.616648 -0.628217 0.840066 1.000000 -0.007200 ... -0.097859 0.635964 0.708222 0.706628 0.708222 -0.076123 0.703880 0.789244 0.788511 0.789244
edit_dist_agg_n1_std_std -0.039252 -0.013644 0.111737 -0.176453 -0.117109 -0.245261 -0.215362 -0.071826 -0.007200 1.000000 ... 0.345354 0.127540 0.018259 0.030867 0.018259 0.374805 0.148420 0.024160 0.034807 0.024160
edit_dist_agg_n1_std_max 0.215749 0.534268 0.615537 0.316734 0.345663 0.187996 0.193765 -0.523801 -0.446420 0.588839 ... 0.220615 -0.330149 -0.422030 -0.408766 -0.422030 0.229033 -0.344188 -0.450943 -0.437319 -0.450943
edit_dist_agg_n1_std_min 0.362963 0.822917 0.840942 0.614546 0.634337 0.445200 0.450541 -0.759987 -0.676246 0.040377 ... -0.007057 -0.579708 -0.600750 -0.597819 -0.600750 -0.017404 -0.630313 -0.662741 -0.658755 -0.662741
edit_dist_agg_n1_std_median 0.356162 0.791610 0.812826 0.573487 0.612090 0.383008 0.403003 -0.728355 -0.652899 0.077385 ... -0.003304 -0.531418 -0.551623 -0.551413 -0.551623 -0.013543 -0.585999 -0.617735 -0.617668 -0.617735
edit_dist_agg_n1_std_mean 0.362963 0.822917 0.840942 0.614546 0.634337 0.445200 0.450541 -0.759987 -0.676246 0.040377 ... -0.007057 -0.579708 -0.600750 -0.597819 -0.600750 -0.017404 -0.630313 -0.662741 -0.658755 -0.662741
edit_dist_agg_n1_max_std -0.038427 -0.024286 -0.017533 -0.081958 -0.086144 -0.078928 -0.084497 0.005228 -0.003195 -0.019873 ... 0.129641 -0.098223 -0.153521 -0.147927 -0.153521 0.123176 -0.070826 -0.121976 -0.114661 -0.121976
edit_dist_agg_n1_max_max -0.065736 -0.142607 -0.113377 -0.139845 -0.106902 -0.140800 -0.109582 0.203503 0.191909 0.167923 ... 0.114137 0.372716 0.348068 0.344189 0.348068 0.114832 0.361913 0.338891 0.333637 0.338891
edit_dist_agg_n1_max_min -0.018380 -0.093733 -0.078623 -0.052616 -0.026606 -0.047897 -0.020177 0.153824 0.154363 0.132355 ... -0.001324 0.360127 0.378317 0.370368 0.378317 0.003990 0.333960 0.350994 0.341024 0.350994
edit_dist_agg_n1_max_median -0.018945 -0.095762 -0.084981 -0.050078 -0.028490 -0.033941 -0.008763 0.152408 0.157574 0.114198 ... 0.020934 0.351014 0.360213 0.352211 0.360213 0.025002 0.328751 0.337920 0.327941 0.337920
edit_dist_agg_n1_max_mean -0.018380 -0.093733 -0.078623 -0.052616 -0.026606 -0.047897 -0.020177 0.153824 0.154363 0.132355 ... -0.001324 0.360127 0.378317 0.370368 0.378317 0.003990 0.333960 0.350994 0.341024 0.350994
edit_dist_agg_n1_min_std -0.045657 -0.016185 0.057889 -0.152152 -0.123443 -0.203102 -0.191826 -0.049595 0.010332 0.526719 ... 0.275909 0.000967 -0.092538 -0.080212 -0.092538 0.285547 0.032896 -0.065185 -0.052819 -0.065185
edit_dist_agg_n1_min_max -0.224318 -0.474300 -0.423531 -0.421754 -0.377885 -0.373726 -0.329632 0.463059 0.453955 0.425851 ... 0.225354 0.680935 0.631795 0.630898 0.631795 0.241496 0.692136 0.642765 0.638936 0.642765
edit_dist_agg_n1_min_min -0.257545 -0.589399 -0.582418 -0.437937 -0.421047 -0.327620 -0.298588 0.597623 0.575551 0.093992 ... -0.008183 0.746086 0.785346 0.775777 0.785346 0.004521 0.746826 0.790885 0.778337 0.790885
edit_dist_agg_n1_min_median -0.270977 -0.611119 -0.607015 -0.446539 -0.444221 -0.304541 -0.286323 0.612967 0.594340 0.030866 ... -0.006079 0.731194 0.768331 0.760506 0.768331 0.007212 0.738003 0.780153 0.769583 0.780153
edit_dist_agg_n1_min_mean -0.257545 -0.589399 -0.582418 -0.437937 -0.421047 -0.327620 -0.298588 0.597623 0.575551 0.093992 ... -0.008183 0.746086 0.785346 0.775777 0.785346 0.004521 0.746826 0.790885 0.778337 0.790885
edit_dist_agg_n1_median_std -0.005800 -0.022634 0.008446 -0.096884 -0.084857 -0.132019 -0.123868 0.021304 0.029929 0.193602 ... 0.127225 -0.004515 -0.051173 -0.043998 -0.051173 0.126860 0.011880 -0.035047 -0.027980 -0.035047
edit_dist_agg_n1_median_max -0.098525 -0.237987 -0.201517 -0.226877 -0.180865 -0.229242 -0.181623 0.276964 0.291238 0.250501 ... 0.157371 0.540129 0.509252 0.506351 0.509252 0.165157 0.530220 0.499394 0.495195 0.499394
edit_dist_agg_n1_median_min -0.107007 -0.226004 -0.207691 -0.185404 -0.151847 -0.159139 -0.120787 0.252780 0.290779 0.118290 ... 0.019854 0.520436 0.543325 0.534716 0.543325 0.029095 0.501992 0.524563 0.513999 0.524563
edit_dist_agg_n1_median_median -0.125736 -0.251792 -0.231135 -0.214320 -0.186350 -0.164731 -0.130255 0.273259 0.312440 0.095760 ... 0.024916 0.526749 0.548522 0.541708 0.548522 0.033248 0.511211 0.533186 0.524364 0.533186
edit_dist_agg_n1_median_mean -0.107007 -0.226004 -0.207691 -0.185404 -0.151847 -0.159139 -0.120787 0.252780 0.290779 0.118290 ... 0.019854 0.520436 0.543325 0.534716 0.543325 0.029095 0.501992 0.524563 0.513999 0.524563
edit_dist_agg_n1_mean_std -0.045657 -0.016185 0.057889 -0.152152 -0.123443 -0.203102 -0.191826 -0.049595 0.010332 0.526719 ... 0.275909 0.000967 -0.092538 -0.080212 -0.092538 0.285547 0.032896 -0.065185 -0.052819 -0.065185
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
edit_dist_agg_n2_mean_std 0.024081 0.096042 0.145589 0.054270 0.116370 -0.121108 -0.096850 -0.107411 -0.084610 0.496545 ... 0.523977 0.014938 -0.170563 -0.154717 -0.170563 0.544774 0.048635 -0.148666 -0.132267 -0.148666
edit_dist_agg_n2_mean_max -0.254817 -0.531742 -0.490878 -0.479905 -0.427233 -0.433820 -0.380075 0.566823 0.592105 0.292015 ... 0.277257 0.844671 0.784440 0.779385 0.784440 0.299188 0.861807 0.801210 0.792760 0.801210
edit_dist_agg_n2_mean_min -0.289411 -0.627076 -0.607269 -0.556904 -0.532116 -0.431408 -0.392753 0.662901 0.688019 0.071665 ... 0.001933 0.864196 0.908302 0.898523 0.908302 0.014748 0.867813 0.919533 0.907735 0.919533
edit_dist_agg_n2_mean_median -0.294781 -0.638954 -0.616018 -0.573321 -0.547856 -0.453825 -0.422648 0.676455 0.696727 0.079809 ... 0.005578 0.856581 0.900737 0.896512 0.900737 0.018674 0.862553 0.915340 0.910506 0.915340
edit_dist_agg_n2_mean_mean -0.289411 -0.627076 -0.607269 -0.556904 -0.532116 -0.431408 -0.392753 0.662901 0.688019 0.071665 ... 0.001933 0.864196 0.908302 0.898523 0.908302 0.014748 0.867813 0.919533 0.907735 0.919533
edit_dist_agg_n3_std_std 0.006050 0.117891 0.178510 0.099592 0.179120 0.058816 0.137622 -0.089524 -0.082458 0.195636 ... 0.258905 0.337665 0.249139 0.245976 0.249139 0.320635 0.312122 0.189983 0.181980 0.189983
edit_dist_agg_n3_std_max 0.074637 0.277510 0.329926 0.285870 0.366694 0.223779 0.310974 -0.223755 -0.274563 0.034146 ... 0.078685 0.245694 0.217043 0.206243 0.217043 0.095364 0.159112 0.112326 0.099133 0.112326
edit_dist_agg_n3_std_min 0.102158 0.341868 0.377346 0.359970 0.425016 0.296215 0.372397 -0.299038 -0.359755 -0.054751 ... -0.028987 0.126303 0.135548 0.120625 0.135548 -0.044453 0.020771 0.021939 0.002817 0.021939
edit_dist_agg_n3_std_median 0.100426 0.347171 0.377735 0.368834 0.427798 0.312141 0.384810 -0.309438 -0.372284 -0.064572 ... -0.039213 0.109467 0.121129 0.104001 0.121129 -0.060126 0.003864 0.007503 -0.016925 0.007503
edit_dist_agg_n3_std_mean 0.102158 0.341868 0.377346 0.359970 0.425016 0.296215 0.372397 -0.299038 -0.359755 -0.054751 ... -0.028987 0.126303 0.135548 0.120625 0.135548 -0.044453 0.020771 0.021939 0.002817 0.021939
edit_dist_agg_n3_max_std 0.002480 0.048790 0.079244 0.035792 0.074372 -0.005369 0.032958 -0.016003 -0.059682 0.312597 ... 0.881284 0.242667 -0.057288 -0.033034 -0.057288 0.892026 0.269762 -0.038974 -0.012773 -0.038974
edit_dist_agg_n3_max_max -0.247606 -0.466387 -0.423600 -0.414090 -0.356330 -0.344860 -0.279774 0.548891 0.546899 0.070554 ... 0.218143 0.923450 0.885436 0.875642 0.885436 0.222809 0.907425 0.874470 0.861714 0.874470
edit_dist_agg_n3_max_min -0.251671 -0.479970 -0.445285 -0.425879 -0.377769 -0.345882 -0.290039 0.550274 0.568941 -0.016113 ... -0.036263 0.862206 0.916206 0.902732 0.916206 -0.035642 0.836573 0.898857 0.882282 0.898857
edit_dist_agg_n3_max_median -0.249460 -0.474931 -0.439774 -0.422594 -0.373890 -0.343675 -0.287203 0.548901 0.567843 -0.007740 ... -0.011778 0.864274 0.913625 0.909233 0.913625 -0.011388 0.838960 0.896878 0.890304 0.896878
edit_dist_agg_n3_max_mean -0.251671 -0.479970 -0.445285 -0.425879 -0.377769 -0.345882 -0.290039 0.550274 0.568941 -0.016113 ... -0.036263 0.862206 0.916206 0.902732 0.916206 -0.035642 0.836573 0.898857 0.882282 0.898857
edit_dist_agg_n3_min_std -0.015715 0.076215 0.121130 0.078950 0.136000 0.024909 0.081255 -0.051870 -0.076123 0.374805 ... 0.959496 0.303076 -0.025521 -0.002873 -0.025521 1.000000 0.335983 -0.015380 0.008109 -0.015380
edit_dist_agg_n3_min_max -0.304435 -0.600236 -0.564974 -0.550262 -0.507818 -0.456936 -0.409410 0.660838 0.703880 0.148420 ... 0.309413 0.979279 0.911144 0.905090 0.911144 0.335983 1.000000 0.930965 0.921830 0.930965
edit_dist_agg_n3_min_min -0.322020 -0.672360 -0.651599 -0.622252 -0.598043 -0.505975 -0.476869 0.724608 0.789244 0.024160 ... -0.029368 0.921150 0.980506 0.972920 0.980506 -0.015380 0.930965 1.000000 0.990825 1.000000
edit_dist_agg_n3_min_median -0.316129 -0.667208 -0.645253 -0.616733 -0.591705 -0.503958 -0.475113 0.725285 0.788511 0.034807 ... -0.007946 0.910880 0.969421 0.976711 0.969421 0.008109 0.921830 0.990825 1.000000 0.990825
edit_dist_agg_n3_min_mean -0.322020 -0.672360 -0.651599 -0.622252 -0.598043 -0.505975 -0.476869 0.724608 0.789244 0.024160 ... -0.029368 0.921150 0.980506 0.972920 0.980506 -0.015380 0.930965 1.000000 0.990825 1.000000
edit_dist_agg_n3_median_std -0.016777 0.085874 0.126440 0.073059 0.123693 0.022048 0.071490 -0.062998 -0.097859 0.345354 ... 1.000000 0.300112 -0.044429 -0.022088 -0.044429 0.959496 0.309413 -0.029368 -0.007946 -0.029368
edit_dist_agg_n3_median_max -0.283176 -0.536519 -0.500932 -0.488975 -0.442300 -0.402485 -0.348502 0.602621 0.635964 0.127540 ... 0.300112 1.000000 0.934423 0.925439 0.934423 0.303076 0.979279 0.921150 0.910880 0.921150
edit_dist_agg_n3_median_min -0.293514 -0.593418 -0.569743 -0.541644 -0.510096 -0.435692 -0.396816 0.650153 0.708222 0.018259 ... -0.044429 0.934423 1.000000 0.991240 1.000000 -0.025521 0.911144 0.980506 0.969421 0.980506
edit_dist_agg_n3_median_median -0.289252 -0.588957 -0.563632 -0.537867 -0.505781 -0.434516 -0.396033 0.648976 0.706628 0.030867 ... -0.022088 0.925439 0.991240 1.000000 0.991240 -0.002873 0.905090 0.972920 0.976711 0.972920
edit_dist_agg_n3_median_mean -0.293514 -0.593418 -0.569743 -0.541644 -0.510096 -0.435692 -0.396816 0.650153 0.708222 0.018259 ... -0.044429 0.934423 1.000000 0.991240 1.000000 -0.025521 0.911144 0.980506 0.969421 0.980506
edit_dist_agg_n3_mean_std -0.015715 0.076215 0.121130 0.078950 0.136000 0.024909 0.081255 -0.051870 -0.076123 0.374805 ... 0.959496 0.303076 -0.025521 -0.002873 -0.025521 1.000000 0.335983 -0.015380 0.008109 -0.015380
edit_dist_agg_n3_mean_max -0.304435 -0.600236 -0.564974 -0.550262 -0.507818 -0.456936 -0.409410 0.660838 0.703880 0.148420 ... 0.309413 0.979279 0.911144 0.905090 0.911144 0.335983 1.000000 0.930965 0.921830 0.930965
edit_dist_agg_n3_mean_min -0.322020 -0.672360 -0.651599 -0.622252 -0.598043 -0.505975 -0.476869 0.724608 0.789244 0.024160 ... -0.029368 0.921150 0.980506 0.972920 0.980506 -0.015380 0.930965 1.000000 0.990825 1.000000
edit_dist_agg_n3_mean_median -0.316129 -0.667208 -0.645253 -0.616733 -0.591705 -0.503958 -0.475113 0.725285 0.788511 0.034807 ... -0.007946 0.910880 0.969421 0.976711 0.969421 0.008109 0.921830 0.990825 1.000000 0.990825
edit_dist_agg_n3_mean_mean -0.322020 -0.672360 -0.651599 -0.622252 -0.598043 -0.505975 -0.476869 0.724608 0.789244 0.024160 ... -0.029368 0.921150 0.980506 0.972920 0.980506 -0.015380 0.930965 1.000000 0.990825 1.000000

84 rows × 84 columns


In [6]:
def get_position_list(obs, target, ngram=1, token_pattern=" "):
    """
        Get the list of positions of obs in target
    """
    obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
    target_tokens = nlp_utils._tokenize(str(target), token_pattern)
    obs = ngram_utils._ngrams(obs_tokens, ngram)
    target = ngram_utils._ngrams(target_tokens, ngram)
    
    pos_of_obs_in_target = [0]
    if len(obs) != 0:
        pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
        if len(pos_of_obs_in_target) == 0:
            pos_of_obs_in_target = [0]
    return pos_of_obs_in_target, len(obs)

def count_close_ngram(obs, target, idx=-1, ratio='count', ngram=123, aggr="", token_pattern=" ", threshold=config.STR_MATCH_THRESHOLD):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs = ngram_utils._ngrams(obs_tokens, ngram)
    target = ngram_utils._ngrams(target_tokens, ngram)
    cnt = 0
    if (len(obs) != 0) and (len(target) != 0):
        if idx == -1:
            for obs_word in obs:
                for word in target:
                    if dist_utils._is_str_match(word, obs_word, threshold):
                        cnt += 1
        else:
            for word in target:
                if dist_utils._is_str_match(word, obs[idx], threshold):
                    cnt += 1
    if ratio == 'count': 
        return cnt
    else: return np_utils._try_divide(cnt, (len(obs)+len(target))/2.0)

def cooccurrence_ngram(obs, target, ngram=1, threshold=0.8, ratio='ratio', token_pattern=" "):
    """
        Get the count cooccurrence_ngram in obs and target
    """
    obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
    target_tokens = nlp_utils._tokenize(str(target), token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)

    s = 0.
    for w1 in obs_ngrams:
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
    if ratio == 'count': 
        return s
    else: return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams))
    
def LongestMatchSize(obs_corpus, target_corpus):
    return dist_utils._longest_match_size(obs_corpus, target_corpus)

def LongestMatchRatio(obs_corpus, target_corpus):
    return dist_utils._longest_match_ratio(obs_corpus, target_corpus)

from collections import defaultdict

def _get_df_dict(target_corpus, ngram=1, token_pattern=" "):
    d = defaultdict(lambda : 1)
    for target in target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
        for w in set(target_ngrams):
            d[w] += 1
    return d

def _get_idf(word, idf_dict, N):
    return np.log((N - idf_dict[word] + 0.5)/(idf_dict[word] + 0.5))

def cooc_tfidf_ngram(obs, target, idf_dict=None, ngram=1, threshold=0.8, ratio="ratio", token_pattern=" ", AGG_FUNC=None):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
        if ratio == "count":
            val_list.append(s * _get_idf(w1, idf_dict, doc_num))
        elif ratio == "ratio":
            val_list.append(np_utils._try_divide(s, len(target_ngrams)) * _get_idf(w1, idf_dict, doc_num))
            
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return AGG_FUNC(val_list)


def _get_avg_ngram_doc_len(target_corpus, ngram=1, token_pattern=" "):
    lst = []
    for target in target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
        lst.append(len(target_ngrams))
    return np.mean(lst)

def bm25(obs, target, ngram=1, threshold=0.8, ratio="ratio", token_pattern=" ", b=None, k1=None, doc_len=None, idf_dict=None, AGG_FUNC=None):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    K = k1 * (1 - b + b * np_utils._try_divide(len(target_ngrams), doc_len))
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
        bm25 = s * _get_idf(w1, idf_dict, doc_num) * np_utils._try_divide(1 + k1, s + K)
        val_list.append(bm25)
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return AGG_FUNC(val_list)

In [7]:
np_dict = { 'mean':np.mean, 'min':np.mean, 'max':np.max, 'median':np.median, 'std':np.std }

doc_num = train.shape[0]
df_all=train[['question1','question2']].copy()
BM25_K1=config.BM25_K1
BM25_B=config.BM25_B

def multiply_columns(train):
    for ngram in [1,2]:
        for target_name in ['question1','question2']:
            for obs_name in ['question1','question2']:
                if target_name != obs_name:
                    position = train[['question1','question2']].apply(lambda x: get_position_list(obs=x[obs_name],target=x[target_name],ngram=ngram), axis=1)
                    pos = [i[0] for i in position]
                    obs_len = [i[1] for i in position]
                    ## stats feat on pos
                    train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np.min, pos)
                    train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np.mean, pos)
                    train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np.median, pos)
                    train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np.max, pos)
                    train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np.std, pos)
                    # stats feat on normalized_pos
                    train["norm_pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)], obs_len)
                    train["norm_pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)], obs_len)
                    train["norm_pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)], obs_len)
                    train["norm_pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)], obs_len)
                    train["norm_pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] ,obs_len)
    
    NGRAMS=[1,2,3]
    RATIO=['count','ratio']
    for ngram in NGRAMS:
        for ratio in RATIO:
            train['intersect_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x: 
                        count_close_ngram(x[0],x[1],threshold=0.8,ngram=ngram,ratio=ratio), axis=1) 
            
    NGRAMS=[1,2,3]
    RATIO=['count','ratio']
    for ngram in NGRAMS:
        for ratio in RATIO:
            train['cooccurrence_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x: 
                        cooccurrence_ngram(x[0],x[1],threshold=0.8,ngram=ngram,ratio=ratio), axis=1)
            
    train['LongestMatchSize'] = train[['question1','question2']].apply(lambda x: LongestMatchSize(x[0],x[1]), axis=1)
    train['LongestMatchRatio'] = train[['question1','question2']].apply(lambda x: LongestMatchRatio(x[0],x[1]), axis=1)

                    
    return train

def multiply_cooc(train):
    for agg in np_dict.keys():
        AGG_FUNC = np_dict[agg] 
        train["cooc_tfidf_%s_n%s_%s_%s" % (obs_name, ngram, ratio,agg)] = train[['question1','question2']].apply(lambda x: 
                            cooc_tfidf_ngram( obs=x[obs_name],target=x[target_name], threshold=0.8, 
                            idf_dict=idf_dict,  ngram=ngram, ratio=ratio, AGG_FUNC=AGG_FUNC), axis=1)
    return train

def multiply_bm25(train):
    for agg in np_dict.keys():
        AGG_FUNC = np_dict[agg] 
        train["bm25_tfidf_%s_n%s_%s_%s" % (obs_name, ngram, ratio,agg)] = train[['question1','question2']].apply(lambda x: 
                            bm25(obs=x[obs_name],target=x[target_name], ngram=ngram,threshold=0.8,b=BM25_B, 
                            k1=BM25_K1, idf_dict=idf_dict, doc_len=avg_target_len, AGG_FUNC=AGG_FUNC), axis=1)
#     train["bm25_%s_n%s_min" % (obs_name, ngram)] = map(np.min, pos)
#     train["bm25_%s_n%s_mean" % (obs_name, ngram)] = map(np.mean, pos)
#     train["bm25_%s_n%s_median" % (obs_name, ngram)] = map(np.median, pos)
#     train["bm25_%s_n%s_max" % (obs_name, ngram)] = map(np.max, pos)
#     train["bm25_%s_n%s_std" % (obs_name, ngram)] = map(np.std, pos)
    return train



for ngram in [1,2,3]:
    idf_dict = _get_df_dict(np.concatenate((df_all['question1'].values , df_all['question2'].values)), ngram=ngram)
    for ratio in ['count','ratio']:
        for target_name in ['question1','question2']:
            for obs_name in ['question1','question2']:
                if target_name != obs_name:              
                    train = parallelize_dataframe(train, multiply_cooc)
                    print "cooc_tfidf_%s_n%s_%s" % (obs_name, ngram, ratio)    
print 'cooc: {}'.format(train.shape)     


for ngram in [1,2,3]:
    idf_dict = _get_df_dict(np.concatenate((df_all['question1'].values , df_all['question2'].values)), ngram=ngram)
#     for ratio in ['count','ratio']:
    for target_name in ['question1','question2']:
        avg_target_len = _get_avg_ngram_doc_len(df_all[target_name].values, ngram=ngram)
        for obs_name in ['question1','question2']:
            if target_name != obs_name:
                train = parallelize_dataframe(train, multiply_bm25)
                print "bm25_tfidf_%s_n%s_%s" % (obs_name, ngram, ratio)  
print 'bm25: {}'.format(train.shape)     


train = parallelize_dataframe(train, multiply_columns)
print 'postion, insection close, longestmatch'                 
print train.shape
print datetime.datetime.now()

del df_all, idf_dict
gc.collect()


cooc_tfidf_question2_n1_count
cooc_tfidf_question1_n1_count
cooc_tfidf_question2_n1_ratio
cooc_tfidf_question1_n1_ratio
cooc_tfidf_question2_n2_count
cooc_tfidf_question1_n2_count
cooc_tfidf_question2_n2_ratio
cooc_tfidf_question1_n2_ratio
cooc_tfidf_question2_n3_count
cooc_tfidf_question1_n3_count
cooc_tfidf_question2_n3_ratio
cooc_tfidf_question1_n3_ratio
cooc: (2750086, 131)
bm25_tfidf_question2_n1_ratio
bm25_tfidf_question1_n1_ratio
bm25_tfidf_question2_n2_ratio
bm25_tfidf_question1_n2_ratio
bm25_tfidf_question2_n3_ratio
bm25_tfidf_question1_n3_ratio
bm25: (2750086, 161)
postion, insection close, longestmatch
(2750086, 215)
2017-06-02 18:26:02.116673
Out[7]:
3755

In [17]:
train[train['is_duplicate']>=0].corr()


Out[17]:
is_duplicate jaccard_n1 dicedistence_n1 jaccard_n2 dicedistence_n2 jaccard_n3 dicedistence_n3 compression_dist edit_dist edit_dist_agg_n1_std_std ... intersect_close_count_n3 intersect_close_ratio_n3 cooccurrence_close_count_n1 cooccurrence_close_ratio_n1 cooccurrence_close_count_n2 cooccurrence_close_ratio_n2 cooccurrence_close_count_n3 cooccurrence_close_ratio_n3 LongestMatchSize LongestMatchRatio
is_duplicate 1.000000 0.377921 0.401909 0.249890 0.262632 0.176085 0.170442 -0.371918 -0.315380 0.031145 ... 0.038224 0.178993 0.105424 0.306094 0.094097 0.247348 0.043303 0.191612 0.164952 0.243107
jaccard_n1 0.377921 1.000000 0.978751 0.826183 0.834713 0.688344 0.697550 -0.898338 -0.794967 -0.169688 ... 0.476687 0.656737 0.478993 0.706852 0.556472 0.626399 0.485536 0.551224 0.469957 0.439766
dicedistence_n1 0.401909 0.978751 1.000000 0.754835 0.786491 0.592007 0.612920 -0.894976 -0.782811 -0.047973 ... 0.452074 0.597575 0.513906 0.692766 0.545909 0.564890 0.455632 0.471371 0.534685 0.474301
jaccard_n2 0.249890 0.826183 0.754835 1.000000 0.980004 0.904096 0.907215 -0.759159 -0.765429 -0.285129 ... 0.582986 0.812851 0.399864 0.561232 0.644674 0.762431 0.595273 0.706642 0.328600 0.277589
dicedistence_n2 0.262632 0.834713 0.786491 0.980004 1.000000 0.819878 0.846184 -0.766617 -0.767694 -0.229019 ... 0.596678 0.782621 0.443227 0.537554 0.690133 0.730919 0.603926 0.628773 0.369407 0.273766
jaccard_n3 0.176085 0.688344 0.592007 0.904096 0.819878 1.000000 0.984633 -0.633291 -0.660300 -0.325083 ... 0.599217 0.818210 0.367395 0.440421 0.580984 0.625810 0.620359 0.748907 0.237448 0.147856
dicedistence_n3 0.170442 0.697550 0.612920 0.907215 0.846184 0.984633 1.000000 -0.638237 -0.669317 -0.292773 ... 0.661484 0.836541 0.424021 0.402823 0.651291 0.584081 0.682111 0.707134 0.268599 0.118255
compression_dist -0.371918 -0.898338 -0.894976 -0.759159 -0.766617 -0.633291 -0.638237 1.000000 0.860596 0.094389 ... -0.460893 -0.675791 -0.382781 -0.716231 -0.514085 -0.634455 -0.465567 -0.568947 -0.494407 -0.492390
edit_dist -0.315380 -0.794967 -0.782811 -0.765429 -0.767694 -0.660300 -0.669317 0.860596 1.000000 0.135423 ... -0.531934 -0.744574 -0.373698 -0.602602 -0.554656 -0.625064 -0.521965 -0.589474 -0.483815 -0.417968
edit_dist_agg_n1_std_std 0.031145 -0.169688 -0.047973 -0.285129 -0.229019 -0.325083 -0.292773 0.094389 0.135423 1.000000 ... -0.167886 -0.290046 -0.003456 -0.149251 -0.133696 -0.268509 -0.168929 -0.313394 0.107507 0.017032
edit_dist_agg_n1_std_max 0.318444 0.524938 0.602631 0.330731 0.357108 0.205221 0.206864 -0.526465 -0.415513 0.439703 ... 0.059641 0.216263 0.216579 0.425360 0.139449 0.322574 0.063379 0.231115 0.395022 0.429802
edit_dist_agg_n1_std_min 0.393622 0.815605 0.831212 0.628510 0.646097 0.467749 0.467307 -0.779828 -0.668705 -0.135906 ... 0.242410 0.491672 0.297631 0.660129 0.327343 0.581959 0.246159 0.472372 0.399637 0.516130
edit_dist_agg_n1_std_median 0.399154 0.773153 0.802466 0.568297 0.608810 0.383047 0.398421 -0.745677 -0.646978 -0.000168 ... 0.220385 0.435240 0.289121 0.623227 0.319237 0.525427 0.221189 0.390855 0.371889 0.478111
edit_dist_agg_n1_std_mean 0.393622 0.815605 0.831212 0.628510 0.646097 0.467749 0.467307 -0.779828 -0.668705 -0.135906 ... 0.242410 0.491672 0.297631 0.660129 0.327343 0.581959 0.246159 0.472372 0.399637 0.516130
edit_dist_agg_n1_max_std -0.029435 -0.039871 -0.026457 -0.100331 -0.102781 -0.101722 -0.108886 0.015842 0.008833 0.047654 ... -0.130695 -0.086811 -0.154827 0.099337 -0.149819 -0.074454 -0.145184 -0.050002 -0.016690 0.207853
edit_dist_agg_n1_max_max -0.031449 -0.156995 -0.126438 -0.155690 -0.124137 -0.147400 -0.114926 0.231346 0.216293 0.204936 ... 0.019071 -0.174057 0.114854 -0.530240 0.052887 -0.388144 0.026804 -0.279425 -0.017074 -0.411861
edit_dist_agg_n1_max_min -0.021035 -0.111217 -0.095609 -0.072415 -0.044745 -0.059704 -0.027041 0.191418 0.186346 0.144100 ... 0.086077 -0.099330 0.184200 -0.511475 0.130032 -0.294666 0.101796 -0.200704 -0.020638 -0.484009
edit_dist_agg_n1_max_median -0.018306 -0.101725 -0.090326 -0.057022 -0.032292 -0.037940 -0.007586 0.177144 0.177394 0.134813 ... 0.077748 -0.085958 0.166915 -0.486625 0.122706 -0.281541 0.092042 -0.166807 -0.030226 -0.466561
edit_dist_agg_n1_max_mean -0.021035 -0.111217 -0.095609 -0.072415 -0.044745 -0.059704 -0.027041 0.191418 0.186346 0.144100 ... 0.086077 -0.099330 0.184200 -0.511475 0.130032 -0.294666 0.101796 -0.200704 -0.020638 -0.484009
edit_dist_agg_n1_min_std -0.005106 -0.116899 -0.037925 -0.223682 -0.193777 -0.262601 -0.247422 0.046729 0.093377 0.595503 ... -0.145648 -0.223870 -0.081852 0.043586 -0.151656 -0.179117 -0.148217 -0.238234 0.018263 0.225427
edit_dist_agg_n1_min_max -0.198053 -0.495317 -0.442148 -0.447485 -0.400766 -0.393551 -0.340036 0.510806 0.480273 0.523404 ... -0.029582 -0.395245 0.062531 -0.715190 -0.021014 -0.637291 -0.019861 -0.546386 -0.153620 -0.548561
edit_dist_agg_n1_min_min -0.246524 -0.566457 -0.556101 -0.440606 -0.416883 -0.338782 -0.299338 0.612085 0.563910 0.210252 ... -0.026992 -0.376807 0.023350 -0.858719 -0.021108 -0.650155 -0.010317 -0.495962 -0.245396 -0.733233
edit_dist_agg_n1_min_median -0.252876 -0.577202 -0.575686 -0.432711 -0.425389 -0.299350 -0.270927 0.614704 0.572562 0.114117 ... -0.030471 -0.349586 0.004742 -0.833905 -0.039486 -0.631673 -0.014215 -0.439951 -0.253190 -0.701902
edit_dist_agg_n1_min_mean -0.246524 -0.566457 -0.556101 -0.440606 -0.416883 -0.338782 -0.299338 0.612085 0.563910 0.210252 ... -0.026992 -0.376807 0.023350 -0.858719 -0.021108 -0.650155 -0.010317 -0.495962 -0.245396 -0.733233
edit_dist_agg_n1_median_std -0.030345 -0.078827 -0.040486 -0.141685 -0.127418 -0.175599 -0.163689 0.058138 0.073331 0.261524 ... -0.070021 -0.116070 -0.052429 0.053984 -0.090548 -0.121992 -0.071182 -0.141984 -0.042084 0.174278
edit_dist_agg_n1_median_max -0.103780 -0.264027 -0.224689 -0.257441 -0.206254 -0.256633 -0.200285 0.320390 0.317852 0.299945 ... 0.062120 -0.230577 0.159018 -0.612305 0.109816 -0.541816 0.076971 -0.403382 -0.070451 -0.482925
edit_dist_agg_n1_median_min -0.074472 -0.222151 -0.201711 -0.196230 -0.155890 -0.174119 -0.128445 0.279548 0.289527 0.153616 ... 0.063115 -0.184683 0.143896 -0.614214 0.124518 -0.473611 0.081311 -0.321913 -0.080474 -0.531308
edit_dist_agg_n1_median_median -0.068783 -0.245263 -0.223948 -0.219572 -0.186505 -0.171700 -0.130449 0.291637 0.305197 0.129072 ... 0.048367 -0.190147 0.126663 -0.600908 0.098748 -0.477967 0.065835 -0.308468 -0.093682 -0.504671
edit_dist_agg_n1_median_mean -0.074472 -0.222151 -0.201711 -0.196230 -0.155890 -0.174119 -0.128445 0.279548 0.289527 0.153616 ... 0.063115 -0.184683 0.143896 -0.614214 0.124518 -0.473611 0.081311 -0.321913 -0.080474 -0.531308
edit_dist_agg_n1_mean_std -0.005106 -0.116899 -0.037925 -0.223682 -0.193777 -0.262601 -0.247422 0.046729 0.093377 0.595503 ... -0.145648 -0.223870 -0.081852 0.043586 -0.151656 -0.179117 -0.148217 -0.238234 0.018263 0.225427
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
pos_of_question2_n2_in_question1_std 0.032397 0.395918 0.383312 0.440551 0.475940 0.400310 0.464624 -0.322748 -0.351170 -0.150304 ... 0.688937 0.408547 0.691813 -0.024205 0.772805 0.046429 0.685738 0.098433 0.297262 -0.182974
norm_pos_of_question2_n2_in_question1_min 0.209585 0.415548 0.444838 0.438920 0.517819 0.193183 0.187072 -0.394721 -0.350548 0.060606 ... 0.053513 0.241754 0.119410 0.381073 0.194611 0.518730 0.042735 0.278559 0.228851 0.282616
norm_pos_of_question2_n2_in_question1_mean 0.233358 0.584129 0.598333 0.634751 0.714663 0.391079 0.407886 -0.537464 -0.503329 -0.001557 ... 0.286161 0.427799 0.321702 0.392167 0.449191 0.550044 0.279136 0.361391 0.324086 0.232728
norm_pos_of_question2_n2_in_question1_median 0.234463 0.583642 0.597221 0.633182 0.712286 0.390579 0.406958 -0.537378 -0.501446 -0.003622 ... 0.283159 0.427053 0.318688 0.393066 0.445634 0.550239 0.276906 0.362155 0.321612 0.233869
norm_pos_of_question2_n2_in_question1_max 0.229853 0.672036 0.673116 0.741330 0.815886 0.519233 0.554069 -0.608076 -0.587220 -0.050927 ... 0.456315 0.543399 0.463360 0.365492 0.623727 0.526885 0.452127 0.396032 0.376372 0.169660
norm_pos_of_question2_n2_in_question1_std 0.115176 0.570138 0.533912 0.651196 0.672752 0.600722 0.667705 -0.486424 -0.503566 -0.163656 ... 0.615693 0.574691 0.567516 0.118552 0.727923 0.206128 0.621801 0.327106 0.317777 -0.074254
pos_of_question1_n2_in_question2_min 0.049286 0.112627 0.166808 0.127195 0.205654 0.014480 0.036411 -0.085261 -0.078170 0.117680 ... 0.146236 0.060588 0.325421 -0.055783 0.292520 0.036625 0.121536 -0.017397 0.242925 -0.107895
pos_of_question1_n2_in_question2_mean 0.056138 0.269003 0.307703 0.304791 0.383612 0.198664 0.243719 -0.218435 -0.227973 0.045708 ... 0.449368 0.239370 0.579149 -0.055199 0.593502 0.051827 0.430442 0.025783 0.333441 -0.168026
pos_of_question1_n2_in_question2_median 0.053615 0.265478 0.303898 0.301805 0.379681 0.197221 0.241467 -0.215034 -0.224532 0.046081 ... 0.438838 0.235892 0.565693 -0.053764 0.583148 0.052463 0.421025 0.026306 0.329214 -0.163557
pos_of_question1_n2_in_question2_max 0.055445 0.356577 0.380493 0.403401 0.475433 0.310400 0.368332 -0.294756 -0.315952 -0.010590 ... 0.628618 0.345061 0.711133 -0.050729 0.757254 0.056802 0.614853 0.052867 0.367888 -0.198002
pos_of_question1_n2_in_question2_std 0.049051 0.437062 0.419937 0.488433 0.522825 0.452583 0.522438 -0.363409 -0.398618 -0.140214 ... 0.763201 0.463722 0.733316 -0.011472 0.829867 0.059920 0.765695 0.119040 0.314813 -0.184927
norm_pos_of_question1_n2_in_question2_min 0.209608 0.389588 0.423706 0.424102 0.505772 0.169841 0.162003 -0.375925 -0.339552 -0.021531 ... 0.043589 0.219255 0.114530 0.371578 0.192534 0.516104 0.026203 0.250405 0.236381 0.285637
norm_pos_of_question1_n2_in_question2_mean 0.235657 0.555659 0.574607 0.618872 0.700858 0.368544 0.382976 -0.516012 -0.491758 -0.094001 ... 0.274726 0.405810 0.312623 0.384043 0.442207 0.547983 0.263260 0.334197 0.328839 0.236856
norm_pos_of_question1_n2_in_question2_median 0.233662 0.553041 0.571975 0.616569 0.698042 0.367600 0.381633 -0.513742 -0.489800 -0.093570 ... 0.270417 0.404093 0.306947 0.383917 0.437489 0.547421 0.259428 0.334016 0.326111 0.238144
norm_pos_of_question1_n2_in_question2_max 0.237834 0.650500 0.655157 0.731811 0.807812 0.503621 0.536009 -0.592351 -0.580378 -0.147109 ... 0.449235 0.528670 0.457003 0.362366 0.618934 0.529310 0.443176 0.375481 0.381173 0.175001
norm_pos_of_question1_n2_in_question2_std 0.131628 0.582018 0.542915 0.671154 0.690112 0.625810 0.693381 -0.495184 -0.518824 -0.227410 ... 0.642583 0.600121 0.585271 0.124093 0.747242 0.211986 0.655093 0.339427 0.321772 -0.077886
intersect_close_count_n1 0.101552 0.468811 0.504074 0.391113 0.434103 0.360105 0.416005 -0.375693 -0.368924 -0.005579 ... 0.741017 0.384290 0.995235 0.021403 0.813741 0.011975 0.731088 0.047487 0.499928 -0.176848
intersect_close_ratio_n1 0.362244 0.898014 0.922882 0.703821 0.733318 0.564348 0.587411 -0.851915 -0.743941 -0.033988 ... 0.510784 0.600948 0.652727 0.685777 0.604316 0.529653 0.509083 0.445590 0.552123 0.424755
intersect_close_count_n2 0.090241 0.544790 0.535936 0.631178 0.675909 0.570605 0.640119 -0.502940 -0.549042 -0.134208 ... 0.917581 0.621427 0.827515 0.079257 0.993768 0.203507 0.915277 0.217054 0.398139 -0.114822
intersect_close_ratio_n2 0.262136 0.787750 0.748053 0.921850 0.941206 0.773150 0.800086 -0.779679 -0.796593 -0.226644 ... 0.619608 0.813835 0.455254 0.546032 0.725524 0.756553 0.622404 0.654775 0.381285 0.295546
intersect_close_count_n3 0.038224 0.476687 0.452074 0.582986 0.596678 0.599217 0.661484 -0.460893 -0.531934 -0.167886 ... 1.000000 0.705029 0.745287 0.063664 0.919601 0.161920 0.982313 0.250921 0.351662 -0.116223
intersect_close_ratio_n3 0.178993 0.656737 0.597575 0.812851 0.782621 0.818210 0.836541 -0.675791 -0.744574 -0.290046 ... 0.705029 1.000000 0.387515 0.441533 0.627433 0.603866 0.678871 0.775592 0.335573 0.198758
cooccurrence_close_count_n1 0.105424 0.478993 0.513906 0.399864 0.443227 0.367395 0.424021 -0.382781 -0.373698 -0.003456 ... 0.745287 0.387515 1.000000 0.029442 0.819652 0.015944 0.736445 0.047972 0.496984 -0.173040
cooccurrence_close_ratio_n1 0.306094 0.706852 0.692766 0.561232 0.537554 0.440421 0.402823 -0.716231 -0.602602 -0.149251 ... 0.063664 0.441533 0.029442 1.000000 0.089371 0.767282 0.065476 0.575358 0.226687 0.796519
cooccurrence_close_count_n2 0.094097 0.556472 0.545909 0.644674 0.690133 0.580984 0.651291 -0.514085 -0.554656 -0.133696 ... 0.919601 0.627433 0.819652 0.089371 1.000000 0.213081 0.921096 0.221735 0.393379 -0.111691
cooccurrence_close_ratio_n2 0.247348 0.626399 0.564890 0.762431 0.730919 0.625810 0.584081 -0.634455 -0.625064 -0.268509 ... 0.161920 0.603866 0.015944 0.767282 0.213081 1.000000 0.163565 0.752216 0.176285 0.533128
cooccurrence_close_count_n3 0.043303 0.485536 0.455632 0.595273 0.603926 0.620359 0.682111 -0.465567 -0.521965 -0.168929 ... 0.982313 0.678871 0.736445 0.065476 0.921096 0.163565 1.000000 0.268216 0.329265 -0.124851
cooccurrence_close_ratio_n3 0.191612 0.551224 0.471371 0.706642 0.628773 0.748907 0.707134 -0.568947 -0.589474 -0.313394 ... 0.250921 0.775592 0.047972 0.575358 0.221735 0.752216 0.268216 1.000000 0.168849 0.331927
LongestMatchSize 0.164952 0.469957 0.534685 0.328600 0.369407 0.237448 0.268599 -0.494407 -0.483815 0.107507 ... 0.351662 0.335573 0.496984 0.226687 0.393379 0.176285 0.329265 0.168849 1.000000 0.292501
LongestMatchRatio 0.243107 0.439766 0.474301 0.277589 0.273766 0.147856 0.118255 -0.492390 -0.417968 0.017032 ... -0.116223 0.198758 -0.173040 0.796519 -0.111691 0.533128 -0.124851 0.331927 0.292501 1.000000

228 rows × 228 columns


In [8]:
train.to_csv(config.FEAT_PATH+'feature_base_lemmer.csv',index=False)

In [ ]:


In [ ]:


In [ ]:


In [9]:
train = train[['is_duplicate','question1', 'question2']]
gc.collect()


Out[9]:
36

In [2]:
train =  pd.read_csv(config.FEAT_PATH+'feature_vect_lemmer.csv')

In [3]:
# ------------------------ Vector Space Features -------------------------------

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils, pkl_utils
from utils import logging_utils, time_utils

class VectorSpace:
    ## word based
    def _init_word_bow(self, ngram, vocabulary=None):
        bow = CountVectorizer(min_df=config.MIN_DF,
                                max_df=config.MAX_DF,
                                max_features=None,
                                # norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                vocabulary=vocabulary)
        return bow

    ## word based
    def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=config.MIN_DF,
                                max_df=config.MAX_DF,                                
                                max_features=None,
                                norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1,
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf

    ## char based
    def _init_char_tfidf(self, include_digit=False):
        chars = list(string.ascii_lowercase)
        if include_digit:
            chars += list(string.digits)        
        vocabulary = dict(zip(chars, range(len(chars))))
        tfidf = TfidfVectorizer(strip_accents="unicode",
                                analyzer="char",
                                norm=None,
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, 1), 
                                use_idf=0,
                                vocabulary=vocabulary)
        return tfidf

    ## char based ngram
    def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=config.MIN_DF,
                                max_df=config.MAX_DF, 
                                max_features=None, 
                                norm="l2",
                                strip_accents="unicode", 
                                analyzer="char",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram), 
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1, 
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf

# ------------------------ LSA -------------------------------
class LSA_Ngram(VectorSpace):
    def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.ngram = ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter
        self.corpus = corpus
        self.target_corpus = target_corpus
        
    def word_transform(self):
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(self.corpus)
        X = tfidf.transform(self.obs_corpus)
#         word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)
    
    def char_transform(self):
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        tfidf.fit(self.corpus)
        X = tfidf.transform(self.obs_corpus)
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)

    def pair_transform(self):
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        tfidf.fit(self.corpus)
        X_obs = tfidf.transform(self.obs_corpus)
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        tfidf.fit(self.corpus)
        X_target = tfidf.transform(self.target_corpus)
        X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        X_svd = svd.fit_transform(X_tfidf)
        return X_svd
    
all_corpus = []
feats_corpus = ['question1','question2']
for f in feats_corpus:
    train[f] = train[f].astype(str)
    all_corpus += train[f].values.tolist()
print len(all_corpus)


5500172

In [ ]:
w_ngram = 3
c_ngram = 10
svd_dim = config.SVD_DIM

lsa_word = LSA_Ngram(all_corpus,train['question1'], ngram=w_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_word_q1 = lsa_word.word_transform()
print 'lsa_word_q1'
lsa_word = LSA_Ngram(all_corpus,train['question2'], ngram=w_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_word_q2 = lsa_word.word_transform()
print 'lsa_word_q2'
lsa_pair = LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=w_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_pair = lsa_pair.pair_transform()
print 'lsa_pair'
lsa_char = LSA_Ngram(all_corpus,train['question1'], ngram=c_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_char_q1 = lsa_word.word_transform()
print 'lsa_char_q1'
lsa_char = LSA_Ngram(all_corpus,train['question2'], ngram=c_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_char_q2 = lsa_word.word_transform()
print 'lsa_char_q2'

lsa_w_q1_df = pd.DataFrame(lsa_word_q1,columns=['lsa_wn%s_q1_%s'%(str(w_ngram),i) for i in range(svd_dim)])
lsa_w_q2_df = pd.DataFrame(lsa_word_q2,columns=['lsa_wn%s_q2_%s'%(str(w_ngram),i) for i in range(svd_dim)])
lsa_c_q1_df = pd.DataFrame(lsa_char_q1,columns=['lsa_cn%s_q1_%s'%(str(c_ngram),i) for i in range(svd_dim)])
lsa_c_q2_df = pd.DataFrame(lsa_char_q2,columns=['lsa_cn%s_q2_%s'%(str(c_ngram),i) for i in range(svd_dim)])
lsa_pair_df = pd.DataFrame(lsa_pair,columns=['lsa_pn%s_%s'%(str(c_ngram),i) for i in range(svd_dim)])
train = pd.concat([train, lsa_w_q1_df, lsa_w_q2_df, lsa_c_q1_df, lsa_c_q2_df, lsa_pair_df], axis=1)
print 'lsa shape: {}'.format(train.shape)
del lsa_w_q1_df,lsa_w_q2_df,lsa_c_q1_df,lsa_c_q2_df,lsa_pair_df
gc.collect()

In [19]:
train[train['is_duplicate']!=-1].corr()


Out[19]:
is_duplicate nmf_w_question1_n2_0 nmf_w_question1_n2_1 nmf_w_question2_n2_0 nmf_w_question2_n2_1 nmf_w_question1_n3_0 nmf_w_question1_n3_1 nmf_w_question2_n3_0 nmf_w_question2_n3_1 nmf_c_question1_n5_0 ... lsa_pn10_0 lsa_pn10_1 lsa_pn10_2 lsa_pn10_3 lsa_pn10_4 lsa_pn10_5 lsa_pn10_6 lsa_pn10_7 lsa_pn10_8 lsa_pn10_9
is_duplicate 1.000000 -0.058080 0.119426 0.053889 -0.032951 0.008327 0.028104 -0.058362 0.077801 0.190158 ... 0.133392 0.010142 -0.014930 0.120775 0.048751 0.004378 0.043981 -0.000681 0.003712 0.044517
nmf_w_question1_n2_0 -0.058080 1.000000 -0.847950 -0.289491 0.376751 0.412114 -0.077468 0.078538 -0.004384 0.049978 ... -0.644813 0.619593 0.037699 0.001731 0.110802 0.081337 -0.206075 0.219168 0.192366 0.009580
nmf_w_question1_n2_1 0.119426 -0.847950 1.000000 0.249053 -0.331806 -0.403109 0.303338 -0.176393 0.101475 -0.004041 ... 0.815746 -0.500481 0.025050 0.221024 -0.088543 -0.020016 0.150933 0.072284 -0.082271 0.008744
nmf_w_question2_n2_0 0.053889 -0.289491 0.249053 1.000000 -0.894158 -0.156686 0.057523 0.430402 -0.527214 0.079745 ... 0.409492 -0.533911 -0.108115 0.107207 -0.012903 0.218550 -0.030829 -0.150484 -0.202274 -0.124783
nmf_w_question2_n2_1 -0.032951 0.376751 -0.331806 -0.894158 1.000000 0.195430 -0.059060 -0.160798 0.423257 -0.048486 ... -0.457665 0.738446 0.002167 -0.102232 0.166780 -0.170274 0.081416 0.004967 0.199238 0.086714
nmf_w_question1_n3_0 0.008327 0.412114 -0.403109 -0.156686 0.195430 1.000000 -0.877356 0.115417 -0.094365 0.106528 ... -0.356666 0.499983 -0.103048 0.206312 -0.542668 -0.014617 -0.304955 -0.179373 0.004722 0.021046
nmf_w_question1_n3_1 0.028104 -0.077468 0.303338 0.057523 -0.059060 -0.877356 1.000000 -0.146963 0.154252 -0.073281 ... 0.311819 -0.254491 0.144221 -0.031413 0.615541 0.013404 0.185986 0.406767 0.090908 -0.010602
nmf_w_question2_n3_0 -0.058362 0.078538 -0.176393 0.430402 -0.160798 0.115417 -0.146963 1.000000 -0.873306 -0.060488 ... -0.260762 -0.001740 -0.073954 -0.028328 -0.012455 0.017337 -0.124595 -0.375886 0.178040 -0.370603
nmf_w_question2_n3_1 0.077801 -0.004384 0.101475 -0.527214 0.423257 -0.094365 0.154252 -0.873306 1.000000 0.077307 ... 0.222915 0.195202 -0.192921 -0.013081 0.175609 -0.060497 0.220421 0.248554 -0.021242 0.388637
nmf_c_question1_n5_0 0.190158 0.049978 -0.004041 0.079745 -0.048486 0.106528 -0.073281 -0.060488 0.077307 1.000000 ... -0.021562 0.040121 -0.192731 0.260354 -0.055649 0.276118 0.015915 0.097859 -0.074003 0.039410
nmf_c_question1_n5_1 -0.084058 0.027826 0.012115 -0.057412 0.078459 -0.092495 0.132959 0.040830 -0.025333 -0.748120 ... 0.017946 0.047146 0.113775 -0.102970 0.180938 -0.022952 -0.105591 -0.014987 0.075195 -0.085053
nmf_c_question2_n5_0 -0.056386 0.019698 0.048240 0.136513 -0.071644 -0.080025 0.130102 0.070134 -0.049091 -0.356966 ... 0.156434 0.004448 0.039782 -0.015287 0.152844 0.127628 -0.047670 0.010135 0.013926 -0.088354
nmf_c_question2_n5_1 0.052822 0.100135 -0.090696 -0.346082 0.271524 0.275692 -0.247077 -0.177124 0.167189 0.370423 ... -0.207789 0.242665 -0.026858 0.184525 -0.314311 -0.070512 0.071357 0.062813 0.011345 0.011064
nmf_c_question1_n10_0 -0.213885 0.067446 -0.018897 -0.038417 0.078130 0.056684 -0.015294 0.016005 -0.012932 -0.540306 ... -0.038201 0.152275 0.064060 -0.057557 0.029505 0.238601 -0.057526 -0.019068 -0.091714 0.005354
nmf_c_question1_n10_1 0.303387 -0.179079 0.273084 0.054205 -0.043893 -0.128398 0.156254 -0.035274 0.090519 0.475879 ... 0.257008 -0.106537 -0.128133 0.187826 0.062869 -0.084841 0.042311 0.058280 0.213102 0.011335
nmf_c_question2_n10_0 -0.233471 0.068308 -0.060567 0.019551 0.005600 0.023922 -0.014576 0.061503 -0.052520 -0.340837 ... 0.017911 0.051129 0.002879 -0.185613 -0.053212 0.183163 -0.088853 -0.006643 0.026437 -0.029511
nmf_c_question2_n10_1 0.231363 0.060704 -0.038656 -0.101668 0.137485 -0.016022 0.063253 -0.024860 0.064302 0.224315 ... -0.127270 0.109006 0.066106 0.219819 0.202554 -0.140879 0.077015 -0.022473 0.052249 0.156289
nmf_p_n1_0 -0.136864 0.703238 -0.790293 -0.489777 0.588296 0.228163 -0.120173 0.226455 -0.130188 -0.047816 ... -0.903524 0.588855 0.081490 -0.288358 0.171120 0.077581 -0.076919 0.058227 0.163037 -0.141970
nmf_p_n1_1 0.155769 -0.675476 0.836750 0.452750 -0.526089 -0.392583 0.329401 -0.230745 0.147768 -0.006453 ... 0.973350 -0.534818 0.035897 0.178705 -0.044062 -0.003182 0.197729 0.085098 -0.076183 0.018371
nmf_p_n2_0 -0.075997 0.758085 -0.744044 -0.529364 0.636930 0.324605 -0.127256 0.127702 -0.043468 0.062853 ... -0.872755 0.732920 0.096163 0.025731 0.147053 0.055300 0.003683 0.157904 0.069797 -0.123454
nmf_p_n2_1 0.124155 -0.712525 0.843019 0.490658 -0.578740 -0.435066 0.343384 -0.227853 0.137173 -0.015881 ... 0.975827 -0.618769 0.008681 0.147122 -0.024986 -0.018409 0.178850 0.075676 -0.077691 0.040349
nmf_p_n3_0 -0.059816 0.756271 -0.753153 -0.548821 0.673486 0.343967 -0.155692 0.119235 -0.003290 0.048907 ... -0.858468 0.751868 0.150851 0.037071 0.171966 0.083210 0.000975 0.041388 0.183909 0.065504
nmf_p_n3_1 0.122319 -0.694970 0.831068 0.462938 -0.548492 -0.439306 0.354405 -0.252318 0.179144 -0.017312 ... 0.982708 -0.600286 -0.007333 0.139569 -0.003100 -0.016721 0.188165 0.068026 -0.047215 0.085943
svd_cooc_on1_tn1_0 0.131651 -0.555091 0.657257 0.267352 -0.364999 -0.447077 0.372745 -0.306814 0.277740 0.008708 ... 0.715357 -0.512797 -0.124833 0.094969 0.102467 0.010771 0.164794 0.126094 -0.011111 0.080380
svd_cooc_on1_tn1_1 0.038832 0.129923 -0.106261 -0.066342 0.199959 -0.470988 0.537292 0.000781 0.090710 -0.077970 ... -0.026021 0.118694 0.263773 -0.190786 0.704409 0.001077 0.100750 -0.022921 -0.032865 -0.009181
svd_cooc_on1_tn1_2 0.148057 0.247946 -0.024874 -0.090264 0.243964 0.265721 -0.025638 -0.067248 0.216338 0.291958 ... -0.012043 0.476972 -0.339268 0.644098 0.140611 0.024623 0.089417 0.117001 -0.116875 0.045685
svd_cooc_on1_tn1_3 0.025690 0.099593 -0.058879 0.110237 -0.067668 0.049806 -0.040324 -0.028564 0.006268 0.190588 ... -0.050459 0.085572 -0.064023 -0.190191 -0.071691 0.746376 -0.132020 0.147157 -0.000132 0.118883
svd_cooc_on1_tn1_4 0.009097 0.222462 -0.025086 -0.108737 0.070719 0.010896 0.178636 -0.030817 -0.008102 -0.046769 ... 0.001030 0.149537 -0.029431 -0.224763 -0.147824 -0.219967 -0.144276 0.629280 0.037505 0.002594
svd_cooc_on1_tn1_5 0.031120 0.103412 -0.072281 -0.167169 0.276353 0.133343 -0.085523 0.178594 0.059074 -0.038980 ... -0.023596 0.170465 -0.120799 -0.029295 -0.025406 -0.071280 0.030646 -0.221658 0.616849 0.050114
svd_cooc_on1_tn1_6 0.077903 0.007568 0.099284 -0.188547 0.104234 0.168997 -0.111443 -0.185769 0.000285 -0.024157 ... 0.104061 0.226433 0.695037 0.187815 -0.306114 0.007015 0.186668 -0.005562 -0.018300 0.022104
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
lsa_cn10_q1_0 0.109538 -0.400786 0.500068 0.569333 -0.596009 -0.285141 0.246564 -0.204686 0.169412 0.015990 ... 0.881618 -0.487719 -0.057207 0.095237 0.019389 -0.008343 0.140284 0.038275 -0.038553 0.101874
lsa_cn10_q1_1 -0.003234 0.301252 -0.249542 -0.625126 0.811679 0.205391 -0.084979 -0.109504 0.304597 -0.054459 ... -0.257028 0.810229 0.152508 -0.191448 0.020840 -0.098603 0.216621 -0.025837 -0.107784 0.073238
lsa_cn10_q1_2 -0.007182 -0.109982 -0.028642 0.041419 0.001414 0.120248 -0.247848 -0.003354 0.192281 0.053818 ... 0.011656 0.015964 -0.690691 -0.320149 -0.365890 -0.071653 0.265883 -0.079071 -0.164900 0.024871
lsa_cn10_q1_3 0.003527 -0.056882 -0.046532 0.029773 0.091593 -0.522264 0.440057 0.025701 0.126679 -0.131104 ... -0.019905 -0.149437 -0.001641 -0.438223 0.731220 0.015722 0.285492 -0.016207 -0.059797 -0.112310
lsa_cn10_q1_4 0.026248 0.128168 -0.095150 -0.023438 0.146787 0.096739 -0.002612 0.023112 0.141100 0.071495 ... -0.083842 0.114358 -0.417350 0.102057 0.338642 -0.161522 -0.587398 -0.024376 -0.123735 0.196654
lsa_cn10_q1_5 0.091251 0.153032 0.016878 0.063231 0.089390 0.052942 0.104218 0.065186 0.107715 0.248084 ... -0.005865 0.241453 -0.295461 0.642479 0.234898 0.183411 0.391393 0.068711 0.119722 -0.179156
lsa_cn10_q1_6 -0.026047 0.098229 -0.075862 0.090049 -0.051275 0.028308 -0.026484 -0.000507 0.010002 0.144936 ... -0.087368 0.035446 -0.123369 -0.223351 -0.044146 0.771709 -0.169608 0.126815 0.129589 0.092285
lsa_cn10_q1_7 -0.042580 -0.000044 0.011899 -0.471223 0.368675 -0.049233 0.052157 -0.003208 0.191530 -0.106010 ... -0.033646 -0.107839 -0.197280 -0.033501 0.049501 -0.219851 -0.036854 -0.013218 0.677865 -0.071892
lsa_cn10_q1_8 -0.042248 0.024935 -0.112954 0.499539 -0.216141 0.087562 -0.125767 0.983964 -0.858493 -0.072403 ... -0.136206 -0.026568 -0.038408 -0.042661 -0.032018 -0.000624 -0.078345 -0.390568 0.154411 -0.352251
lsa_cn10_q1_9 0.046044 -0.003270 -0.043698 -0.001542 0.141362 0.063167 -0.087693 -0.019549 0.227470 0.009594 ... 0.051020 0.113792 0.081326 -0.007781 0.058297 -0.034769 0.122291 -0.353028 0.330960 0.378764
lsa_cn10_q2_0 0.109538 -0.400786 0.500068 0.569333 -0.596009 -0.285141 0.246564 -0.204686 0.169412 0.015990 ... 0.881618 -0.487719 -0.057207 0.095237 0.019389 -0.008343 0.140284 0.038275 -0.038553 0.101874
lsa_cn10_q2_1 -0.003234 0.301252 -0.249542 -0.625126 0.811679 0.205391 -0.084979 -0.109504 0.304597 -0.054459 ... -0.257028 0.810229 0.152508 -0.191448 0.020840 -0.098603 0.216621 -0.025837 -0.107784 0.073238
lsa_cn10_q2_2 -0.007182 -0.109982 -0.028642 0.041419 0.001414 0.120248 -0.247848 -0.003354 0.192281 0.053818 ... 0.011656 0.015964 -0.690691 -0.320149 -0.365890 -0.071653 0.265883 -0.079071 -0.164900 0.024871
lsa_cn10_q2_3 0.003527 -0.056882 -0.046532 0.029773 0.091593 -0.522264 0.440057 0.025701 0.126679 -0.131104 ... -0.019905 -0.149437 -0.001641 -0.438223 0.731220 0.015722 0.285492 -0.016207 -0.059797 -0.112310
lsa_cn10_q2_4 0.026248 0.128168 -0.095150 -0.023438 0.146787 0.096739 -0.002612 0.023112 0.141100 0.071495 ... -0.083842 0.114358 -0.417350 0.102057 0.338642 -0.161522 -0.587398 -0.024376 -0.123735 0.196654
lsa_cn10_q2_5 0.091251 0.153032 0.016878 0.063231 0.089390 0.052942 0.104218 0.065186 0.107715 0.248084 ... -0.005865 0.241453 -0.295461 0.642479 0.234898 0.183411 0.391393 0.068711 0.119722 -0.179156
lsa_cn10_q2_6 -0.026047 0.098229 -0.075862 0.090049 -0.051275 0.028308 -0.026484 -0.000507 0.010002 0.144936 ... -0.087368 0.035446 -0.123369 -0.223351 -0.044146 0.771709 -0.169608 0.126815 0.129589 0.092285
lsa_cn10_q2_7 -0.042580 -0.000044 0.011899 -0.471223 0.368675 -0.049233 0.052157 -0.003208 0.191530 -0.106010 ... -0.033646 -0.107839 -0.197280 -0.033501 0.049501 -0.219851 -0.036854 -0.013218 0.677865 -0.071892
lsa_cn10_q2_8 -0.042248 0.024935 -0.112954 0.499539 -0.216141 0.087562 -0.125767 0.983964 -0.858493 -0.072403 ... -0.136206 -0.026568 -0.038408 -0.042661 -0.032018 -0.000624 -0.078345 -0.390568 0.154411 -0.352251
lsa_cn10_q2_9 0.046044 -0.003270 -0.043698 -0.001542 0.141362 0.063167 -0.087693 -0.019549 0.227470 0.009594 ... 0.051020 0.113792 0.081326 -0.007781 0.058297 -0.034769 0.122291 -0.353028 0.330960 0.378764
lsa_pn10_0 0.133392 -0.644813 0.815746 0.409492 -0.457665 -0.356666 0.311819 -0.260762 0.222915 -0.021562 ... 1.000000 -0.456090 -0.027271 0.130244 -0.029419 -0.025479 0.179323 0.067297 -0.069755 0.093889
lsa_pn10_1 0.010142 0.619593 -0.500481 -0.533911 0.738446 0.499983 -0.254491 -0.001740 0.195202 0.040121 ... -0.456090 1.000000 0.004767 0.026890 -0.026471 0.004638 0.093951 0.039588 -0.026690 0.100233
lsa_pn10_2 -0.014930 0.037699 0.025050 -0.108115 0.002167 -0.103048 0.144221 -0.073954 -0.192921 -0.192731 ... -0.027271 0.004767 1.000000 0.030073 0.022368 -0.008152 -0.003089 -0.006995 0.002638 0.003323
lsa_pn10_3 0.120775 0.001731 0.221024 0.107207 -0.102232 0.206312 -0.031413 -0.028328 -0.013081 0.260354 ... 0.130244 0.026890 0.030073 1.000000 0.008186 -0.031486 -0.014287 -0.020341 0.043929 -0.055640
lsa_pn10_4 0.048751 0.110802 -0.088543 -0.012903 0.166780 -0.542668 0.615541 -0.012455 0.175609 -0.055649 ... -0.029419 -0.026471 0.022368 0.008186 1.000000 -0.012202 0.007866 -0.005920 0.037461 -0.011341
lsa_pn10_5 0.004378 0.081337 -0.020016 0.218550 -0.170274 -0.014617 0.013404 0.017337 -0.060497 0.276118 ... -0.025479 0.004638 -0.008152 -0.031486 -0.012202 1.000000 -0.005504 0.052061 -0.000850 0.028690
lsa_pn10_6 0.043981 -0.206075 0.150933 -0.030829 0.081416 -0.304955 0.185986 -0.124595 0.220421 0.015915 ... 0.179323 0.093951 -0.003089 -0.014287 0.007866 -0.005504 1.000000 -0.036986 -0.004858 -0.047068
lsa_pn10_7 -0.000681 0.219168 0.072284 -0.150484 0.004967 -0.179373 0.406767 -0.375886 0.248554 0.097859 ... 0.067297 0.039588 -0.006995 -0.020341 -0.005920 0.052061 -0.036986 1.000000 0.008285 0.074918
lsa_pn10_8 0.003712 0.192366 -0.082271 -0.202274 0.199238 0.004722 0.090908 0.178040 -0.021242 -0.074003 ... -0.069755 -0.026690 0.002638 0.043929 0.037461 -0.000850 -0.004858 0.008285 1.000000 -0.028552
lsa_pn10_9 0.044517 0.009580 0.008744 -0.124783 0.086714 0.021046 -0.010602 -0.370603 0.388637 0.039410 ... 0.093889 0.100233 0.003323 -0.055640 -0.011341 0.028690 -0.047068 0.074918 -0.028552 1.000000

282 rows × 282 columns


In [27]:
class TSNE_LSA_Ngram(LSA_Ngram):
    def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
        LSA_Ngram.__init__(self, corpus, obs_corpus, target_corpus, ngram, svd_dim, svd_n_iter)

    def tsne_word_transform(self):
        X_svd = self.word_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE(init='pca').fit_transform(X_scaled)
        return X_tsne
    
    def tsne_char_transform(self):
        X_svd = self.chsvd_dim
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne
    
    def tsne_pair_transform(self):
        X_svd = self.pair_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne

    def nmf_word_transform(self):
        X_svd = self.word_transform()
        X_scaled = MinMaxScaler().fit_transform(X_svd)
        X_nmf = NMF(n_components=2, init='random', random_state=np.random.randint(1, 10000)).fit_transform(X_scaled)
        return X_nmf
    
    def nmf_char_transform(self):
        X_svd = self.char_transform()
        X_scaled = MinMaxScaler().fit_transform(X_svd)
        X_nmf = NMF(n_components=2, init='random', random_state=np.random.randint(1, 10000)).fit_transform(X_scaled)
        return X_nmf
    
    def nmf_pair_transform(self):
        X_svd = self.pair_transform()
        X_scaled = MinMaxScaler().fit_transform(X_svd)
        X_nmf = NMF(n_components=2, init='random', random_state=np.random.randint(1, 10000)).fit_transform(X_scaled)
        return X_nmf

# svd_dim = config.SVD_DIM
# svd_dim = 10


############# tsne ###################    
# for NGRAM in [2,3]:
#     for COL in ['question1','question2']:
#         model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
#         lsa_word = model.tsne_word_transform()
#         lsa_word = pd.DataFrame(lsa_word,columns=['tsne_w_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
#         train = pd.concat([train, lsa_word], axis=1)
# print 'tsne_word_transform: {}'.format(train.shape)

# for NGRAM in [5,10]:
#     for COL in ['question1','question2']:
#         model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim, 
#                                   svd_n_iter=config.SVD_N_ITER)
#         lsa_word = model.tsne_char_transform()
#         lsa_word = pd.DataFrame(lsa_word,columns=['tsne_c_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
#         train = pd.concat([train, lsa_word], axis=1)
# print 'tsne_char_transform: {}'.format(train.shape)

# for NGRAM in [1,2]:
#     model = TSNE_LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=NGRAM, 
#                                svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
#     tsne_pair = model.tsne_pair_transform()
#     tsne_pair = pd.DataFrame(tsne_pair,columns=['tsne_p_n%s_%s'%(str(NGRAM),i) for i in range(2)])
#     train = pd.concat([train, tsne_pair], axis=1)
# print 'tsne_pair_transform: {}'.format(train.shape)

############# nmf ###################    
for NGRAM in [2,3]:
    for COL in ['question1','question2']:
        model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
        lsa_word = model.nmf_word_transform()
        lsa_word = pd.DataFrame(lsa_word,columns=['nmf_w_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
        train = pd.concat([train, lsa_word], axis=1)
print 'nmf_word_transform: {}'.format(train.shape)

for NGRAM in [5,10]:
    for COL in ['question1','question2']:
        model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim, 
                                  svd_n_iter=config.SVD_N_ITER)
        lsa_word = model.nmf_char_transform()
        lsa_word = pd.DataFrame(lsa_word,columns=['nmf_c_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
        train = pd.concat([train, lsa_word], axis=1)
print 'nmf_char_transform: {}'.format(train.shape)

for NGRAM in [1,2,3]:
    model = TSNE_LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=NGRAM, 
                               svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
    tsne_pair = model.nmf_pair_transform()
    tsne_pair = pd.DataFrame(tsne_pair,columns=['nmf_p_n%s_%s'%(str(NGRAM),i) for i in range(2)])
    train = pd.concat([train, tsne_pair], axis=1)
print 'nmf_pair_transform: {}'.format(train.shape)

del lsa_word, tsne_pair
gc.collect()


nmf_word_transform: (2750086, 61)
nmf_char_transform: (2750086, 69)
nmf_pair_transform: (2750086, 75)
Out[27]:
92

In [7]:
train[train['is_duplicate']!=-1].corr()


Out[7]:
is_duplicate lsa_wn3_q1_0 lsa_wn3_q1_1 lsa_wn3_q1_2 lsa_wn3_q1_3 lsa_wn3_q1_4 lsa_wn3_q1_5 lsa_wn3_q1_6 lsa_wn3_q1_7 lsa_wn3_q1_8 ... svd_cooc_on2_tn2_3 svd_cooc_on2_tn2_4 svd_cooc_on2_tn2_5 svd_cooc_on2_tn2_6 svd_cooc_on2_tn2_7 svd_cooc_on2_tn2_8 svd_cooc_on2_tn2_9 svd_cosine_w_n2_10 svd_cosine_w_n3_10 svd_cosine_c_n5_10
is_duplicate 1.000000 0.149820 0.014555 0.026671 0.056715 -0.049383 0.004689 0.031299 0.047388 -0.025191 ... -0.024450 0.016145 0.043374 0.013766 0.031068 -0.036887 -0.009420 0.188513 0.180911 0.222302
lsa_wn3_q1_0 0.149820 1.000000 -0.108788 0.007928 -0.163416 -0.230964 0.040128 -0.113475 0.011017 -0.176637 ... -0.009490 0.016581 0.135017 0.025705 0.001945 -0.018439 0.025034 -0.045139 -0.046287 -0.018910
lsa_wn3_q1_1 0.014555 -0.108788 1.000000 -0.165131 0.017630 0.021461 -0.000716 0.014486 -0.000030 0.018098 ... -0.005530 0.020691 -0.033469 -0.046963 0.001166 -0.013803 -0.002231 0.015786 0.013346 0.015597
lsa_wn3_q1_2 0.026671 0.007928 -0.165131 1.000000 0.008654 0.006449 -0.000063 0.011678 0.002074 0.007082 ... 0.004254 -0.074210 -0.064871 0.099541 -0.001805 0.033724 0.011517 0.035399 0.034342 0.035107
lsa_wn3_q1_3 0.056715 -0.163416 0.017630 0.008654 1.000000 -0.081576 0.033906 -0.055381 0.003350 -0.059820 ... -0.004422 -0.011350 -0.071556 -0.021209 0.027925 0.018838 -0.014323 0.070929 0.070132 0.052950
lsa_wn3_q1_4 -0.049383 -0.230964 0.021461 0.006449 -0.081576 1.000000 0.108260 -0.217296 0.002171 -0.078603 ... 0.005066 -0.018954 -0.127575 -0.018958 0.117457 0.021715 -0.028912 0.019382 0.012625 0.018984
lsa_wn3_q1_5 0.004689 0.040128 -0.000716 -0.000063 0.033906 0.108260 1.000000 -0.050229 -0.012692 0.032243 ... -0.012284 0.121040 0.496112 -0.039306 -0.006954 0.007704 0.002205 -0.008748 0.001378 0.014799
lsa_wn3_q1_6 0.031299 -0.113475 0.014486 0.011678 -0.055381 -0.217296 -0.050229 1.000000 0.134473 0.194451 ... 0.001741 0.064017 0.251591 -0.037554 0.384134 0.012058 -0.013296 -0.024241 -0.020748 -0.026015
lsa_wn3_q1_7 0.047388 0.011017 -0.000030 0.002074 0.003350 0.002171 -0.012692 0.134473 1.000000 -0.014828 ... -0.002459 0.094413 0.234282 -0.024720 -0.153890 -0.018820 -0.011751 0.026097 0.010432 -0.001146
lsa_wn3_q1_8 -0.025191 -0.176637 0.018098 0.007082 -0.059820 -0.078603 0.032243 0.194451 -0.014828 1.000000 ... -0.001214 0.071817 0.201427 -0.054332 -0.287545 0.047919 -0.022639 0.014240 0.036719 -0.029828
lsa_wn3_q1_9 0.056474 0.029392 -0.012754 -0.005112 0.009023 0.005554 0.055233 -0.030812 0.036289 -0.047997 ... -0.005612 0.047295 0.221090 0.024729 0.037723 -0.037134 0.006541 0.056755 0.025628 0.038336
lsa_wn3_q2_0 0.114369 0.569742 -0.097516 0.117163 -0.095464 -0.121947 0.006018 -0.074691 0.007755 -0.097858 ... -0.010381 0.018349 0.141364 0.018318 -0.001418 -0.017006 0.024539 -0.077624 -0.076342 -0.054870
lsa_wn3_q2_1 0.008334 0.103844 -0.575711 0.657305 -0.012849 -0.022890 0.000865 -0.010003 -0.002786 -0.025001 ... 0.005603 -0.065048 -0.040271 0.049757 -0.001560 0.035636 0.008435 0.023334 0.023175 0.025523
lsa_wn3_q2_2 0.046130 0.077817 0.481105 0.498460 -0.013459 -0.022910 0.002800 -0.006579 -0.002626 -0.020817 ... -0.001247 -0.043588 -0.082925 0.023223 0.001205 0.017448 0.009713 0.045839 0.042646 0.047272
lsa_wn3_q2_3 0.048507 -0.084403 0.005796 -0.039043 0.813994 -0.042698 0.041126 -0.093931 -0.002978 -0.103546 ... -0.003901 -0.016668 -0.095847 -0.012575 0.039862 0.020785 -0.017951 0.076002 0.073627 0.054044
lsa_wn3_q2_4 -0.063876 -0.108754 0.012123 -0.057812 -0.196678 0.783254 0.071936 -0.256004 -0.021414 -0.135390 ... 0.005375 -0.031582 -0.174508 -0.008502 0.113007 0.015960 -0.026392 0.012036 0.003923 0.015017
lsa_wn3_q2_5 0.006272 -0.035754 0.002727 0.014784 0.038026 0.173295 0.719770 0.044398 0.163463 0.064610 ... -0.009750 0.148801 0.564331 -0.044715 0.008119 0.017620 -0.007790 0.007853 0.013402 0.008022
lsa_wn3_q2_6 -0.045395 0.056034 -0.008962 0.006282 0.064297 0.243674 0.326753 -0.626924 -0.542938 -0.155565 ... -0.001294 -0.064024 -0.173935 0.013321 -0.131850 -0.002036 0.017876 -0.006698 -0.000647 0.024252
lsa_wn3_q2_7 -0.013450 -0.063268 0.009760 -0.011249 -0.070982 -0.163654 0.041571 0.433826 -0.517427 0.156335 ... 0.003886 -0.009181 0.059061 -0.014733 0.375673 0.018326 -0.002597 -0.027324 -0.009951 -0.005059
lsa_wn3_q2_8 -0.018749 -0.093514 0.006926 -0.024192 -0.091511 -0.118040 -0.012484 0.057054 -0.049721 0.638594 ... -0.002798 0.055835 0.161950 -0.000283 -0.341947 0.050086 -0.020829 0.029413 0.050312 -0.025886
lsa_wn3_q2_9 0.057383 0.064139 -0.013217 -0.005749 0.026115 0.042420 0.043811 -0.044534 0.021453 -0.106786 ... -0.006943 0.040230 0.167567 0.006761 0.061502 -0.026272 0.011140 0.059770 0.016553 0.037322
lsa_cn10_q1_0 0.114369 0.569742 -0.097516 0.117163 -0.095464 -0.121947 0.006018 -0.074691 0.007755 -0.097858 ... -0.010381 0.018349 0.141364 0.018318 -0.001418 -0.017006 0.024539 -0.077624 -0.076342 -0.054870
lsa_cn10_q1_1 0.008334 0.103844 -0.575711 0.657305 -0.012849 -0.022890 0.000865 -0.010003 -0.002786 -0.025001 ... 0.005603 -0.065048 -0.040271 0.049757 -0.001560 0.035636 0.008435 0.023334 0.023175 0.025523
lsa_cn10_q1_2 0.046130 0.077817 0.481105 0.498460 -0.013459 -0.022910 0.002800 -0.006579 -0.002626 -0.020817 ... -0.001247 -0.043588 -0.082925 0.023223 0.001205 0.017448 0.009713 0.045839 0.042646 0.047272
lsa_cn10_q1_3 0.048507 -0.084403 0.005796 -0.039043 0.813994 -0.042698 0.041126 -0.093931 -0.002978 -0.103546 ... -0.003901 -0.016668 -0.095847 -0.012575 0.039862 0.020785 -0.017951 0.076002 0.073627 0.054044
lsa_cn10_q1_4 -0.063876 -0.108754 0.012123 -0.057812 -0.196678 0.783254 0.071936 -0.256004 -0.021414 -0.135390 ... 0.005375 -0.031582 -0.174508 -0.008502 0.113007 0.015960 -0.026392 0.012036 0.003923 0.015017
lsa_cn10_q1_5 0.006272 -0.035754 0.002727 0.014784 0.038026 0.173295 0.719770 0.044398 0.163463 0.064610 ... -0.009750 0.148801 0.564331 -0.044715 0.008119 0.017620 -0.007790 0.007853 0.013402 0.008022
lsa_cn10_q1_6 -0.045395 0.056034 -0.008962 0.006282 0.064297 0.243674 0.326753 -0.626924 -0.542938 -0.155565 ... -0.001294 -0.064024 -0.173935 0.013321 -0.131850 -0.002036 0.017876 -0.006698 -0.000647 0.024252
lsa_cn10_q1_7 -0.013450 -0.063268 0.009760 -0.011249 -0.070982 -0.163654 0.041571 0.433826 -0.517427 0.156335 ... 0.003886 -0.009181 0.059061 -0.014733 0.375673 0.018326 -0.002597 -0.027324 -0.009951 -0.005059
lsa_cn10_q1_8 -0.018749 -0.093514 0.006926 -0.024192 -0.091511 -0.118040 -0.012484 0.057054 -0.049721 0.638594 ... -0.002798 0.055835 0.161950 -0.000283 -0.341947 0.050086 -0.020829 0.029413 0.050312 -0.025886
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
svd_cooc_on1_tn2_3 0.037603 0.009341 -0.005055 -0.004686 0.019521 0.121224 0.006839 0.397958 -0.142202 -0.275442 ... 0.008562 -0.044670 0.014261 -0.016381 0.912852 -0.006919 0.004108 0.021936 0.020644 0.017523
svd_cooc_on1_tn2_4 0.052092 0.152320 -0.021981 -0.086951 -0.079565 -0.140367 0.509134 0.249479 0.245866 0.205505 ... -0.020238 0.235532 0.901113 -0.063090 -0.027643 0.014111 -0.002820 0.035136 0.034767 0.023688
svd_cooc_on1_tn2_5 0.014640 0.017134 -0.024884 0.049786 -0.014080 -0.019282 0.000065 -0.009349 -0.011636 -0.032206 ... -0.004619 -0.094771 0.061828 0.865215 0.025292 -0.020561 -0.011886 0.003308 0.003061 0.002216
svd_cooc_on1_tn2_6 -0.026988 -0.004655 0.000953 -0.000781 -0.014022 0.005280 -0.004873 0.003123 0.004842 0.002747 ... 0.997132 -0.445545 0.025286 -0.007023 -0.021036 0.026715 -0.003833 0.022136 0.020948 0.026818
svd_cooc_on1_tn2_7 0.054587 0.040885 -0.023912 0.013894 0.535453 -0.193177 0.033407 -0.037721 0.007054 -0.081436 ... 0.019259 -0.001386 -0.006018 -0.000868 0.011818 0.066920 0.021538 0.045238 0.043216 0.049539
svd_cooc_on1_tn2_8 -0.006333 -0.009536 -0.006909 -0.027727 0.026128 0.030959 -0.014296 0.028337 0.043808 0.010271 ... -0.094607 0.762141 -0.017323 0.015659 0.013663 0.221065 0.099166 0.002096 0.001173 -0.000499
svd_cooc_on1_tn2_9 -0.021278 -0.029666 -0.000419 0.009439 -0.018862 0.036852 -0.009708 0.015489 -0.005953 0.051323 ... -0.022159 0.140842 -0.015016 0.016781 0.018623 0.931752 -0.060764 0.014112 0.013183 0.014557
svd_cooc_on2_tn1_0 -0.032895 0.026947 -0.006795 -0.018272 0.076602 0.611643 0.504274 -0.581891 -0.225127 -0.261840 ... -0.001231 -0.000634 -0.002345 -0.000288 -0.001069 -0.000277 -0.000530 0.031980 0.030217 0.035500
svd_cooc_on2_tn1_1 0.066209 0.381160 -0.422548 0.737396 -0.139728 -0.089371 -0.009308 -0.059509 -0.001893 -0.088501 ... 0.000634 -0.031987 0.004900 0.040115 -0.003700 0.012960 0.012785 0.040877 0.039735 0.043739
svd_cooc_on2_tn1_2 0.050103 0.143760 0.654098 0.176621 -0.018569 -0.029459 -0.003589 -0.011601 -0.012174 -0.026914 ... -0.006711 -0.002994 -0.005605 -0.003116 0.001907 -0.002704 0.002680 0.026585 0.025210 0.029953
svd_cooc_on2_tn1_3 0.036918 0.012584 -0.006675 -0.007863 0.020949 0.140565 0.009024 0.460081 -0.163398 -0.315764 ... 0.008152 -0.038960 0.013214 -0.016399 0.889424 0.001909 0.010582 0.022907 0.021523 0.018373
svd_cooc_on2_tn1_4 0.059330 0.188683 -0.026858 -0.107255 -0.093060 -0.169268 0.608015 0.294933 0.285016 0.245547 ... -0.034230 0.233148 0.899970 -0.049892 -0.028125 -0.003593 -0.010640 0.035085 0.034563 0.026194
svd_cooc_on2_tn1_5 0.007119 -0.017194 -0.007218 0.032620 -0.007028 -0.017543 -0.010003 -0.011093 -0.024745 -0.036311 ... 0.026862 -0.117013 0.056473 0.898806 0.031049 0.000580 0.003600 0.002258 0.002014 0.001701
svd_cooc_on2_tn1_6 -0.034116 -0.018537 0.009709 -0.005982 -0.148255 0.052328 -0.008005 0.013016 0.002159 0.026531 ... 0.970124 -0.441454 0.034879 -0.023718 -0.024619 -0.026421 -0.015253 0.010393 0.009637 0.014164
svd_cooc_on2_tn1_7 0.050264 0.042751 -0.019619 0.019177 0.585457 -0.212389 0.038451 -0.039386 0.007863 -0.090121 ... 0.336887 -0.158934 0.008751 -0.003815 0.005215 -0.000618 -0.004556 0.047586 0.045375 0.052396
svd_cooc_on2_tn1_8 -0.035782 -0.018556 -0.000966 0.006273 0.029034 0.008154 -0.009812 0.027748 -0.013448 0.011295 ... -0.159111 0.633886 -0.050787 0.053487 0.031099 0.635048 0.019615 0.004510 0.004668 0.007491
svd_cooc_on2_tn1_9 -0.019872 -0.009769 0.005503 -0.004875 0.005839 0.013946 0.013900 0.008930 -0.048086 0.041847 ... 0.074032 0.059928 -0.002307 0.002793 0.006036 0.909479 -0.182098 0.014714 0.013138 0.020120
svd_cooc_on2_tn2_0 -0.032836 0.026828 -0.006790 -0.018234 0.076317 0.609192 0.502308 -0.579634 -0.224296 -0.260817 ... -0.001208 -0.000609 -0.002338 -0.000258 -0.001055 -0.000346 -0.000546 0.031849 0.030099 0.035358
svd_cooc_on2_tn2_1 0.057560 0.336183 -0.379361 0.665776 -0.125842 -0.075093 -0.012204 -0.053335 -0.001100 -0.076404 ... -0.000172 -0.016342 -0.005817 -0.083371 -0.006398 0.016551 0.010388 0.037644 0.036653 0.041116
svd_cooc_on2_tn2_2 0.036080 0.103439 0.481469 0.134973 -0.014219 -0.027009 0.001107 -0.009476 -0.014362 -0.016561 ... 0.014103 -0.011418 0.001288 0.001892 0.001171 -0.006236 -0.000016 0.020168 0.019289 0.023327
svd_cooc_on2_tn2_3 -0.024450 -0.009490 -0.005530 0.004254 -0.004422 0.005066 -0.012284 0.001741 -0.002459 -0.001214 ... 1.000000 -0.456629 0.011427 -0.008208 -0.014758 0.011104 -0.001070 0.020967 0.019772 0.025104
svd_cooc_on2_tn2_4 0.016145 0.016581 0.020691 -0.074210 -0.011350 -0.018954 0.121040 0.064017 0.094413 0.071817 ... -0.456629 1.000000 0.211220 -0.099845 -0.030625 0.049872 0.017657 0.003660 0.004121 0.000556
svd_cooc_on2_tn2_5 0.043374 0.135017 -0.033469 -0.064871 -0.071556 -0.127575 0.496112 0.251591 0.234282 0.201427 ... 0.011427 0.211220 1.000000 -0.000552 -0.014841 0.008734 -0.002434 0.033094 0.032369 0.025461
svd_cooc_on2_tn2_6 0.013766 0.025705 -0.046963 0.099541 -0.021209 -0.018958 -0.039306 -0.037554 -0.024720 -0.054332 ... -0.008208 -0.099845 -0.000552 1.000000 0.019128 -0.021405 -0.011365 0.002559 0.002523 0.003234
svd_cooc_on2_tn2_7 0.031068 0.001945 0.001166 -0.001805 0.027925 0.117457 -0.006954 0.384134 -0.153890 -0.287545 ... -0.014758 -0.030625 -0.014841 0.019128 1.000000 -0.002505 -0.003600 0.017760 0.016800 0.014000
svd_cooc_on2_tn2_8 -0.036887 -0.018439 -0.013803 0.033724 0.018838 0.021715 0.007704 0.012058 -0.018820 0.047919 ... 0.011104 0.049872 0.008734 -0.021405 -0.002505 1.000000 0.005192 0.011831 0.010546 0.013387
svd_cooc_on2_tn2_9 -0.009420 0.025034 -0.002231 0.011517 -0.014323 -0.028912 0.002205 -0.013296 -0.011751 -0.022639 ... -0.001070 0.017657 -0.002434 -0.011365 -0.003600 0.005192 1.000000 0.010585 0.010182 0.014615
svd_cosine_w_n2_10 0.188513 -0.045139 0.015786 0.035399 0.070929 0.019382 -0.008748 -0.024241 0.026097 0.014240 ... 0.020967 0.003660 0.033094 0.002559 0.017760 0.011831 0.010585 1.000000 0.945268 0.529895
svd_cosine_w_n3_10 0.180911 -0.046287 0.013346 0.034342 0.070132 0.012625 0.001378 -0.020748 0.010432 0.036719 ... 0.019772 0.004121 0.032369 0.002523 0.016800 0.010546 0.010182 0.945268 1.000000 0.527833
svd_cosine_c_n5_10 0.222302 -0.018910 0.015597 0.035107 0.052950 0.018984 0.014799 -0.026015 -0.001146 -0.029828 ... 0.025104 0.000556 0.025461 0.003234 0.014000 0.013387 0.014615 0.529895 0.527833 1.000000

116 rows × 116 columns


In [28]:
class LSA_Ngram_Cooc(VectorSpace):
    def __init__(self, obs_corpus, target_corpus, 
            obs_ngram=1, target_ngram=1, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus
        self.obs_ngram = obs_ngram
        self.target_ngram = target_ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter
        self.obs_ngram_str = ngram_utils._ngram_str_map[self.obs_ngram]
        self.target_ngram_str = ngram_utils._ngram_str_map[self.target_ngram]

    def _get_cooc_terms(self, lst1, lst2, join_str):
        out = [""] * len(lst1) * len(lst2)
        cnt =  0
        for item1 in lst1:
            for item2 in lst2:
                out[cnt] = item1 + join_str + item2
                cnt += 1
        res = " ".join(out)
        return res

    def transform(self):
        obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
        target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
        cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))

        tfidf = self._init_word_ngram_tfidf(ngram=1)
        X = tfidf.fit_transform(cooc_terms)
        svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)


NGRAMS=[1,2]
for ngram1 in NGRAMS:
    for ngram2 in NGRAMS:
        lsa_word = LSA_Ngram_Cooc(train['question1'],train['question2'], svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER,
                                 obs_ngram=ngram1, target_ngram=ngram2)
        lsa_cooc = lsa_word.transform()
        lsa_cooc = pd.DataFrame(lsa_cooc,columns=['svd_cooc_on%s_tn%s_%s'%(str(ngram1),str(ngram2),i) for i in range(svd_dim)])
        train = pd.concat([train, lsa_cooc], axis=1)
        print 'svd_word_cooc_transform: {}'.format(train.shape)
del lsa_cooc, lsa_word
gc.collect()


svd_word_cooc_transform: (2750086, 85)
svd_word_cooc_transform: (2750086, 95)
svd_word_cooc_transform: (2750086, 105)
svd_word_cooc_transform: (2750086, 115)
Out[28]:
7

In [ ]:
# ------------------------ LSA Cosine Similarity -------------------------------
class LSA_Ngram_CosineSim(VectorSpace):
    def __init__(self, obs_corpus, target_corpus, ngram=3, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus
        self.ngram = ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter

    def word_transform(self):
        ## get common vocabulary
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## targetument tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim
    
    def char_transform(self):
        ## get common vocabulary
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## targetument tfidf
        tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim

svd_dim = config.SVD_DIM
    
# for NGRAM in [2,3]:
#     cosinesim_word = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
#     cosinesim_word = cosinesim_word.word_transform()
#     train['svd_cosine_w_n%s_%s'%(str(NGRAM),config.SVD_DIM)] = cosinesim_word
# print 'cosinesim_word_svd: {}'.format(train.shape)

for NGRAM in [10]:
    cosinesim_word = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
    cosinesim_word = cosinesim_word.char_transform()
    train['svd_cosine_c_n%s_%s'%(str(NGRAM),config.SVD_DIM)] = cosinesim_word
print 'cosinesim_char_svd: {}'.format(train.shape)

In [4]:
# ------------------- Char distribution -------------------
class CharDistribution(VectorSpace):
    def __init__(self, obs_corpus, target_corpus):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus

    def normalize(self, text):
        # pat = re.compile("[a-z0-9]")
        pat = re.compile("[a-z]")
        group = pat.findall(text.lower())
        if group is None:
            res = " "
        else:
            res = "".join(group)
            res += " "
        return res

    def preprocess(self, corpus):
        return [self.normalize(text) for text in corpus]

    def get_distribution(self):
        ## obs tfidf
        tfidf = self._init_char_tfidf()
        X_obs = tfidf.fit_transform(self.preprocess(self.obs_corpus)).todense()
        X_obs = np.asarray(X_obs)
        # apply laplacian smoothing
        s = 1.
        X_obs = (X_obs + s) / (np.sum(X_obs, axis=1)[:,None] + X_obs.shape[1]*s)
        ## targetument tfidf
        tfidf = self._init_char_tfidf()
        X_target = tfidf.fit_transform(self.preprocess(self.target_corpus)).todense()
        X_target = np.asarray(X_target)
        X_target = (X_target + s) / (np.sum(X_target, axis=1)[:,None] + X_target.shape[1]*s)
        return X_obs, X_target

class CharDistribution_transform(CharDistribution):
    def __init__(self, obs_corpus, target_corpus, const_A=1., const_B=1.):
        CharDistribution.__init__(self, obs_corpus, target_corpus)
        self.const_A = const_A
        self.const_B = const_B
        self.X_obs, self.X_target = self.get_distribution()

    def ratio_transform(self):
#         X_obs, X_target = self.get_distribution()
        ratio = (self.X_obs + self.const_A) / (self.X_target + self.const_B)
        return ratio

    def cosine_transform(self):
#         X_obs, X_target = self.get_distribution()
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, self.X_obs, self.X_target))
        sim = np.asarray(sim).squeeze()
        return sim

    def kl_transform(self):
#         X_obs, X_target = self.get_distribution()
        kl = dist_utils._KL(self.X_obs, self.X_target)
        return kl
    
# cosinesim_word = CharDistribution(train['question1'],train['question2'])
# print cosinesim_word.get_distribution()
cosinesim_word = CharDistribution_transform(train['question1'],train['question2'])
cosinesim_word_ratio = cosinesim_word.ratio_transform()
cosinesim_word_ratio_shape = cosinesim_word_ratio.shape[1]
cosinesim_word_ratio = pd.DataFrame(cosinesim_word_ratio, columns=['cosinesim_word_ratio_%s'%(i) for i in range(cosinesim_word_ratio_shape)])
# train = pd.concat([train, cosinesim_word_df], axis=1)
train['char_distribution_ratio_mean'] = cosinesim_word_ratio.mean(axis=1)
train['char_distribution_ratio_min'] = cosinesim_word_ratio.min(axis=1)
train['char_distribution_ratio_max'] = cosinesim_word_ratio.max(axis=1)
train['char_distribution_ratio_std'] = cosinesim_word_ratio.std(axis=1)
train['char_distribution_cosine'] = cosinesim_word.cosine_transform()
train['char_distribution_kl'] = cosinesim_word.kl_transform()
print 'char_distribution: {}'.format(train.shape)


char_distribution: (2750086, 124)

In [ ]:
from nltk.corpus import wordnet as wn
from utils import dist_utils, ngram_utils, nlp_utils, pkl_utils
from utils import logging_utils, time_utils

# tune the token pattern to get a better correlation with y_train
token_pattern = r"(?u)\b\w\w+\b"
# token_pattern = r"\w{1,}"
# token_pattern = r"\w+"
# token_pattern = r"[\w']+"
token_pattern = " " 

class WordNet_Similarity:
    """Double aggregation features"""
    def __init__(self, metric="path"):
#         super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
        self.metric = metric
        if self.metric == "path":
            self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
        elif self.metric == "lch":
            self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
        elif self.metric == "wup":
            self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
        else:
            raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric))
            
    def _maximum_similarity_for_two_synset_list(self, syn_list1, syn_list2):
        s = 0.
        if syn_list1 and syn_list2:
            for syn1 in syn_list1:
                for syn2 in syn_list2:
                    try:
                        _s = self.metric_func(syn1, syn2)
                    except:
                        _s = config.MISSING_VALUE_NUMERIC
                    if _s and _s > s:
                        s = _s
        return s

    def transform_one(self, obs, target):
        obs_tokens = nlp_utils._tokenize(obs, token_pattern)
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        obs_synset_list = [wn.synsets((obs_token).decode('utf-8')) for obs_token in obs_tokens]
        target_synset_list = [wn.synsets((target_token).decode('utf-8')) for target_token in target_tokens]
        val_list = []
        for obs_synset in obs_synset_list:
            _val_list = []
            for target_synset in target_synset_list:
                _s = self._maximum_similarity_for_two_synset_list(obs_synset, target_synset)
                _val_list.append(_s)
            if len(_val_list) == 0:
                _val_list = [config.MISSING_VALUE_NUMERIC]
            val_list.append( _val_list )
        if len(val_list) == 0:
            val_list = [[config.MISSING_VALUE_NUMERIC]]
        return val_list
    
wn_list = ["lch", "path", "wup"]
# wn_list = ["wup"]

np_dict = { 'mean':np.mean, 'min':np.mean, 'max':np.max, 'median':np.median, 'std':np.std }
agg1 = ["mean", "max", "min", "median"]
agg2 = ["mean", "std", "max", "min", "median"]
def wn_sim(train):
    wn_sim = WordNet_Similarity(metric=wn_method)
    wn_matrix = train.apply(lambda x: wn_sim.transform_one(x['question1'],x['question2']), axis=1)
    for AGG1 in agg1:
        for AGG2 in agg2:
            train['WordNet_%s_%s_%s'%(wn_method,AGG1,AGG2)] = [np_dict[AGG2](np_dict[AGG1](wn_row,axis=1)) for wn_row in wn_matrix]
    return train

for wn_method in wn_list:   
    train = parallelize_dataframe(train, wn_sim)
    print '{} ==> WordNet_Similarity {}: {}'.format(datetime.datetime.now(), wn_method, train.shape)
    train.to_csv(config.FEAT_PATH+'feature_vect_lemmer.csv',index=False)
    gc.collect()

In [5]:
train[train['is_duplicate']!=-1].corr()


Out[5]:
is_duplicate lsa_wn3_q1_0 lsa_wn3_q1_1 lsa_wn3_q1_2 lsa_wn3_q1_3 lsa_wn3_q1_4 lsa_wn3_q1_5 lsa_wn3_q1_6 lsa_wn3_q1_7 lsa_wn3_q1_8 ... svd_cooc_on2_tn2_9 svd_cosine_w_n2_10 svd_cosine_w_n3_10 svd_cosine_c_n5_10 char_distribution_ratio_mean char_distribution_ratio_min char_distribution_ratio_max char_distribution_ratio_std char_distribution_cosine char_distribution_kl
is_duplicate 1.000000 0.149820 0.014555 0.026671 0.056715 -0.049383 0.004689 0.031299 0.047388 -0.025191 ... -0.009420 0.188513 0.180911 0.222302 -0.249714 0.306985 -0.316919 -0.369171 0.350929 -0.367569
lsa_wn3_q1_0 0.149820 1.000000 -0.108788 0.007928 -0.163416 -0.230964 0.040128 -0.113475 0.011017 -0.176637 ... 0.025034 -0.045139 -0.046287 -0.018910 -0.062499 0.079855 -0.086704 -0.088144 0.090050 -0.095905
lsa_wn3_q1_1 0.014555 -0.108788 1.000000 -0.165131 0.017630 0.021461 -0.000716 0.014486 -0.000030 0.018098 ... -0.002231 0.015786 0.013346 0.015597 0.002787 -0.002380 -0.000301 0.004992 -0.007835 0.003691
lsa_wn3_q1_2 0.026671 0.007928 -0.165131 1.000000 0.008654 0.006449 -0.000063 0.011678 0.002074 0.007082 ... 0.011517 0.035399 0.034342 0.035107 0.008896 -0.017686 0.020686 0.021005 -0.013354 0.015961
lsa_wn3_q1_3 0.056715 -0.163416 0.017630 0.008654 1.000000 -0.081576 0.033906 -0.055381 0.003350 -0.059820 ... -0.014323 0.070929 0.070132 0.052950 -0.016600 0.018521 -0.019268 -0.022653 0.023531 -0.021964
lsa_wn3_q1_4 -0.049383 -0.230964 0.021461 0.006449 -0.081576 1.000000 0.108260 -0.217296 0.002171 -0.078603 ... -0.028912 0.019382 0.012625 0.018984 -0.008869 0.007741 0.006789 -0.005133 0.003537 0.001567
lsa_wn3_q1_5 0.004689 0.040128 -0.000716 -0.000063 0.033906 0.108260 1.000000 -0.050229 -0.012692 0.032243 ... 0.002205 -0.008748 0.001378 0.014799 0.005866 -0.006400 0.004910 0.013093 -0.007349 0.004434
lsa_wn3_q1_6 0.031299 -0.113475 0.014486 0.011678 -0.055381 -0.217296 -0.050229 1.000000 0.134473 0.194451 ... -0.013296 -0.024241 -0.020748 -0.026015 0.033165 -0.042783 0.049302 0.056754 -0.050440 0.052332
lsa_wn3_q1_7 0.047388 0.011017 -0.000030 0.002074 0.003350 0.002171 -0.012692 0.134473 1.000000 -0.014828 ... -0.011751 0.026097 0.010432 -0.001146 -0.003934 0.018486 -0.023852 -0.019712 0.010328 -0.017039
lsa_wn3_q1_8 -0.025191 -0.176637 0.018098 0.007082 -0.059820 -0.078603 0.032243 0.194451 -0.014828 1.000000 ... -0.022639 0.014240 0.036719 -0.029828 0.027663 -0.038347 0.054263 0.051732 -0.046150 0.048701
lsa_wn3_q1_9 0.056474 0.029392 -0.012754 -0.005112 0.009023 0.005554 0.055233 -0.030812 0.036289 -0.047997 ... 0.006541 0.056755 0.025628 0.038336 -0.013392 0.009065 -0.018438 -0.020961 0.024031 -0.025407
lsa_wn3_q2_0 0.114369 0.569742 -0.097516 0.117163 -0.095464 -0.121947 0.006018 -0.074691 0.007755 -0.097858 ... 0.024539 -0.077624 -0.076342 -0.054870 -0.061904 0.066922 -0.060487 -0.064961 0.069539 -0.075081
lsa_wn3_q2_1 0.008334 0.103844 -0.575711 0.657305 -0.012849 -0.022890 0.000865 -0.010003 -0.002786 -0.025001 ... 0.008435 0.023334 0.023175 0.025523 0.013397 -0.017793 0.010462 0.011944 -0.005515 0.010300
lsa_wn3_q2_2 0.046130 0.077817 0.481105 0.498460 -0.013459 -0.022910 0.002800 -0.006579 -0.002626 -0.020817 ... 0.009713 0.045839 0.042646 0.047272 0.002005 -0.006325 0.008225 0.010772 -0.007422 0.006360
lsa_wn3_q2_3 0.048507 -0.084403 0.005796 -0.039043 0.813994 -0.042698 0.041126 -0.093931 -0.002978 -0.103546 ... -0.017951 0.076002 0.073627 0.054044 -0.003812 0.008484 -0.020197 -0.018620 0.019549 -0.016132
lsa_wn3_q2_4 -0.063876 -0.108754 0.012123 -0.057812 -0.196678 0.783254 0.071936 -0.256004 -0.021414 -0.135390 ... -0.026392 0.012036 0.003923 0.015017 0.025266 -0.018653 -0.001814 0.003172 -0.004897 0.011309
lsa_wn3_q2_5 0.006272 -0.035754 0.002727 0.014784 0.038026 0.173295 0.719770 0.044398 0.163463 0.064610 ... -0.007790 0.007853 0.013402 0.008022 0.007524 -0.012040 0.013332 0.022126 -0.018613 0.015011
lsa_wn3_q2_6 -0.045395 0.056034 -0.008962 0.006282 0.064297 0.243674 0.326753 -0.626924 -0.542938 -0.155565 ... 0.017876 -0.006698 -0.000647 0.024252 -0.031589 0.030810 -0.017905 -0.030602 0.031993 -0.031001
lsa_wn3_q2_7 -0.013450 -0.063268 0.009760 -0.011249 -0.070982 -0.163654 0.041571 0.433826 -0.517427 0.156335 ... -0.002597 -0.027324 -0.009951 -0.005059 0.049989 -0.061664 0.041287 0.057294 -0.043841 0.050621
lsa_wn3_q2_8 -0.018749 -0.093514 0.006926 -0.024192 -0.091511 -0.118040 -0.012484 0.057054 -0.049721 0.638594 ... -0.020829 0.029413 0.050312 -0.025886 0.051570 -0.061719 0.040350 0.054091 -0.046401 0.050838
lsa_wn3_q2_9 0.057383 0.064139 -0.013217 -0.005749 0.026115 0.042420 0.043811 -0.044534 0.021453 -0.106786 ... 0.011140 0.059770 0.016553 0.037322 -0.027853 0.028120 -0.016861 -0.031218 0.031444 -0.033154
lsa_cn10_q1_0 0.114369 0.569742 -0.097516 0.117163 -0.095464 -0.121947 0.006018 -0.074691 0.007755 -0.097858 ... 0.024539 -0.077624 -0.076342 -0.054870 -0.061904 0.066922 -0.060487 -0.064961 0.069539 -0.075081
lsa_cn10_q1_1 0.008334 0.103844 -0.575711 0.657305 -0.012849 -0.022890 0.000865 -0.010003 -0.002786 -0.025001 ... 0.008435 0.023334 0.023175 0.025523 0.013397 -0.017793 0.010462 0.011944 -0.005515 0.010300
lsa_cn10_q1_2 0.046130 0.077817 0.481105 0.498460 -0.013459 -0.022910 0.002800 -0.006579 -0.002626 -0.020817 ... 0.009713 0.045839 0.042646 0.047272 0.002005 -0.006325 0.008225 0.010772 -0.007422 0.006360
lsa_cn10_q1_3 0.048507 -0.084403 0.005796 -0.039043 0.813994 -0.042698 0.041126 -0.093931 -0.002978 -0.103546 ... -0.017951 0.076002 0.073627 0.054044 -0.003812 0.008484 -0.020197 -0.018620 0.019549 -0.016132
lsa_cn10_q1_4 -0.063876 -0.108754 0.012123 -0.057812 -0.196678 0.783254 0.071936 -0.256004 -0.021414 -0.135390 ... -0.026392 0.012036 0.003923 0.015017 0.025266 -0.018653 -0.001814 0.003172 -0.004897 0.011309
lsa_cn10_q1_5 0.006272 -0.035754 0.002727 0.014784 0.038026 0.173295 0.719770 0.044398 0.163463 0.064610 ... -0.007790 0.007853 0.013402 0.008022 0.007524 -0.012040 0.013332 0.022126 -0.018613 0.015011
lsa_cn10_q1_6 -0.045395 0.056034 -0.008962 0.006282 0.064297 0.243674 0.326753 -0.626924 -0.542938 -0.155565 ... 0.017876 -0.006698 -0.000647 0.024252 -0.031589 0.030810 -0.017905 -0.030602 0.031993 -0.031001
lsa_cn10_q1_7 -0.013450 -0.063268 0.009760 -0.011249 -0.070982 -0.163654 0.041571 0.433826 -0.517427 0.156335 ... -0.002597 -0.027324 -0.009951 -0.005059 0.049989 -0.061664 0.041287 0.057294 -0.043841 0.050621
lsa_cn10_q1_8 -0.018749 -0.093514 0.006926 -0.024192 -0.091511 -0.118040 -0.012484 0.057054 -0.049721 0.638594 ... -0.020829 0.029413 0.050312 -0.025886 0.051570 -0.061719 0.040350 0.054091 -0.046401 0.050838
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
svd_cooc_on1_tn2_9 -0.021278 -0.029666 -0.000419 0.009439 -0.018862 0.036852 -0.009708 0.015489 -0.005953 0.051323 ... -0.060764 0.014112 0.013183 0.014557 -0.011125 0.012687 -0.013038 -0.017419 0.017725 -0.016904
svd_cooc_on2_tn1_0 -0.032895 0.026947 -0.006795 -0.018272 0.076602 0.611643 0.504274 -0.581891 -0.225127 -0.261840 ... -0.000530 0.031980 0.030217 0.035500 -0.028498 0.030273 -0.027552 -0.035549 0.036840 -0.037009
svd_cooc_on2_tn1_1 0.066209 0.381160 -0.422548 0.737396 -0.139728 -0.089371 -0.009308 -0.059509 -0.001893 -0.088501 ... 0.012785 0.040877 0.039735 0.043739 -0.023764 0.024452 -0.026527 -0.030414 0.035552 -0.036647
svd_cooc_on2_tn1_2 0.050103 0.143760 0.654098 0.176621 -0.018569 -0.029459 -0.003589 -0.011601 -0.012174 -0.026914 ... 0.002680 0.026585 0.025210 0.029953 -0.009067 0.009424 -0.012532 -0.008613 0.010077 -0.013995
svd_cooc_on2_tn1_3 0.036918 0.012584 -0.006675 -0.007863 0.020949 0.140565 0.009024 0.460081 -0.163398 -0.315764 ... 0.010582 0.022907 0.021523 0.018373 -0.005261 0.005127 -0.011046 -0.009040 0.011628 -0.012689
svd_cooc_on2_tn1_4 0.059330 0.188683 -0.026858 -0.107255 -0.093060 -0.169268 0.608015 0.294933 0.285016 0.245547 ... -0.010640 0.035085 0.034563 0.026194 -0.008684 0.011046 -0.012413 -0.008919 0.010437 -0.016005
svd_cooc_on2_tn1_5 0.007119 -0.017194 -0.007218 0.032620 -0.007028 -0.017543 -0.010003 -0.011093 -0.024745 -0.036311 ... 0.003600 0.002258 0.002014 0.001701 -0.004383 -0.000677 0.003257 -0.000008 0.002132 -0.001431
svd_cooc_on2_tn1_6 -0.034116 -0.018537 0.009709 -0.005982 -0.148255 0.052328 -0.008005 0.013016 0.002159 0.026531 ... -0.015253 0.010393 0.009637 0.014164 -0.019103 0.024070 -0.023522 -0.030577 0.025376 -0.024709
svd_cooc_on2_tn1_7 0.050264 0.042751 -0.019619 0.019177 0.585457 -0.212389 0.038451 -0.039386 0.007863 -0.090121 ... -0.004556 0.047586 0.045375 0.052396 -0.026415 0.029599 -0.032604 -0.036835 0.037420 -0.036374
svd_cooc_on2_tn1_8 -0.035782 -0.018556 -0.000966 0.006273 0.029034 0.008154 -0.009812 0.027748 -0.013448 0.011295 ... 0.019615 0.004510 0.004668 0.007491 -0.003564 0.001288 -0.001745 -0.003459 0.007457 -0.005568
svd_cooc_on2_tn1_9 -0.019872 -0.009769 0.005503 -0.004875 0.005839 0.013946 0.013900 0.008930 -0.048086 0.041847 ... -0.182098 0.014714 0.013138 0.020120 -0.017655 0.022605 -0.021522 -0.028066 0.024673 -0.023803
svd_cooc_on2_tn2_0 -0.032836 0.026828 -0.006790 -0.018234 0.076317 0.609192 0.502308 -0.579634 -0.224296 -0.260817 ... -0.000546 0.031849 0.030099 0.035358 -0.028388 0.030150 -0.027492 -0.035439 0.036734 -0.036897
svd_cooc_on2_tn2_1 0.057560 0.336183 -0.379361 0.665776 -0.125842 -0.075093 -0.012204 -0.053335 -0.001100 -0.076404 ... 0.010388 0.037644 0.036653 0.041116 -0.023079 0.023570 -0.024384 -0.028701 0.032850 -0.033898
svd_cooc_on2_tn2_2 0.036080 0.103439 0.481469 0.134973 -0.014219 -0.027009 0.001107 -0.009476 -0.014362 -0.016561 ... -0.000016 0.020168 0.019289 0.023327 -0.010877 0.009671 -0.010431 -0.008355 0.009536 -0.012407
svd_cooc_on2_tn2_3 -0.024450 -0.009490 -0.005530 0.004254 -0.004422 0.005066 -0.012284 0.001741 -0.002459 -0.001214 ... -0.001070 0.020967 0.019772 0.025104 -0.023692 0.028467 -0.028868 -0.036578 0.032483 -0.031171
svd_cooc_on2_tn2_4 0.016145 0.016581 0.020691 -0.074210 -0.011350 -0.018954 0.121040 0.064017 0.094413 0.071817 ... 0.017657 0.003660 0.004121 0.000556 0.007265 -0.009436 0.009430 0.012818 -0.008389 0.006562
svd_cooc_on2_tn2_5 0.043374 0.135017 -0.033469 -0.064871 -0.071556 -0.127575 0.496112 0.251591 0.234282 0.201427 ... -0.002434 0.033094 0.032369 0.025461 -0.010969 0.014111 -0.013070 -0.012279 0.012801 -0.016865
svd_cooc_on2_tn2_6 0.013766 0.025705 -0.046963 0.099541 -0.021209 -0.018958 -0.039306 -0.037554 -0.024720 -0.054332 ... -0.011365 0.002559 0.002523 0.003234 -0.002005 -0.001835 0.001898 -0.000329 0.003179 -0.002224
svd_cooc_on2_tn2_7 0.031068 0.001945 0.001166 -0.001805 0.027925 0.117457 -0.006954 0.384134 -0.153890 -0.287545 ... -0.003600 0.017760 0.016800 0.014000 -0.004505 0.002764 -0.007824 -0.006120 0.008459 -0.009191
svd_cooc_on2_tn2_8 -0.036887 -0.018439 -0.013803 0.033724 0.018838 0.021715 0.007704 0.012058 -0.018820 0.047919 ... 0.005192 0.011831 0.010546 0.013387 -0.013895 0.017892 -0.017401 -0.022469 0.020164 -0.018393
svd_cooc_on2_tn2_9 -0.009420 0.025034 -0.002231 0.011517 -0.014323 -0.028912 0.002205 -0.013296 -0.011751 -0.022639 ... 1.000000 0.010585 0.010182 0.014615 -0.013048 0.017332 -0.016722 -0.020445 0.019150 -0.016971
svd_cosine_w_n2_10 0.188513 -0.045139 0.015786 0.035399 0.070929 0.019382 -0.008748 -0.024241 0.026097 0.014240 ... 0.010585 1.000000 0.945268 0.529895 -0.258029 0.312220 -0.309098 -0.373235 0.356538 -0.366777
svd_cosine_w_n3_10 0.180911 -0.046287 0.013346 0.034342 0.070132 0.012625 0.001378 -0.020748 0.010432 0.036719 ... 0.010182 0.945268 1.000000 0.527833 -0.242380 0.294614 -0.292300 -0.352326 0.336052 -0.345801
svd_cosine_c_n5_10 0.222302 -0.018910 0.015597 0.035107 0.052950 0.018984 0.014799 -0.026015 -0.001146 -0.029828 ... 0.014615 0.529895 0.527833 1.000000 -0.362550 0.423254 -0.425251 -0.514557 0.518944 -0.511100
char_distribution_ratio_mean -0.249714 -0.062499 0.002787 0.008896 -0.016600 -0.008869 0.005866 0.033165 -0.003934 0.027663 ... -0.013048 -0.258029 -0.242380 -0.362550 1.000000 -0.761884 0.439683 0.682147 -0.725736 0.723613
char_distribution_ratio_min 0.306985 0.079855 -0.002380 -0.017686 0.018521 0.007741 -0.006400 -0.042783 0.018486 -0.038347 ... 0.017332 0.312220 0.294614 0.423254 -0.761884 1.000000 -0.599147 -0.853939 0.817731 -0.778946
char_distribution_ratio_max -0.316919 -0.086704 -0.000301 0.020686 -0.019268 0.006789 0.004910 0.049302 -0.023852 0.054263 ... -0.016722 -0.309098 -0.292300 -0.425251 0.439683 -0.599147 1.000000 0.866309 -0.821484 0.814729
char_distribution_ratio_std -0.369171 -0.088144 0.004992 0.021005 -0.022653 -0.005133 0.013093 0.056754 -0.019712 0.051732 ... -0.020445 -0.373235 -0.352326 -0.514557 0.682147 -0.853939 0.866309 1.000000 -0.943815 0.935703
char_distribution_cosine 0.350929 0.090050 -0.007835 -0.013354 0.023531 0.003537 -0.007349 -0.050440 0.010328 -0.046150 ... 0.019150 0.356538 0.336052 0.518944 -0.725736 0.817731 -0.821484 -0.943815 1.000000 -0.972486
char_distribution_kl -0.367569 -0.095905 0.003691 0.015961 -0.021964 0.001567 0.004434 0.052332 -0.017039 0.048701 ... -0.016971 -0.366777 -0.345801 -0.511100 0.723613 -0.778946 0.814729 0.935703 -0.972486 1.000000

122 rows × 122 columns


In [3]:
train.to_csv(config.FEAT_PATH+'feature_vect_lemmer.csv',index=False)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-3adc7067eabd> in <module>()
----> 1 train.to_csv(config.FEAT_PATH+'feature_vect_lemmer.csv',index=False)

NameError: name 'train' is not defined

In [12]:
train.head()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-058d1d044437> in <module>()
----> 1 train.head()

NameError: name 'train' is not defined

In [11]:
print '{} ==> WordNet_Similarity {}: {}'.format(datetime.datetime.now(), 1, 2)


2017-06-04 11:12:36.823693 ==> WordNet_Similarity 1: 2