In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from __future__ import division
import re, time, os, gc
import sys
import string

import numpy as np
import pandas as pd
import scipy
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils
from utils import logging_utils, time_utils, pkl_utils

df =  pd.read_csv(config.RAW_PATH+'train.csv')
df['question1'] = df['question1'].astype(str)
df['question2'] = df['question2'].astype(str)
train = df.sample(n=100)[['question1', 'question2']]

In [3]:
train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)

train = pd.concat([train_orig[['question1', 'question2']], \
        test_orig[['question1', 'question2']]], axis=0).reset_index(drop='index')
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)

In [2]:
def jaccard_ngram(obs, target, ngram=1, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    return dist_utils._jaccard_coef(obs_ngrams, target_ngrams)

def dicedistence_ngram(obs, target, ngram=1, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    return dist_utils._dice_dist(obs_ngrams, target_ngrams)

def compression_dist(obs, target):
    return dist_utils._compression_dist(obs, target)

def edit_dist(obs, target):
    return dist_utils._edit_dist(obs, target)

def compression_dist_ngram(obs, target, ngram=2, token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []
#     for w1 in obs_ngrams:
#         _val_list = []
#         for w2 in target_ngrams:
#             s = dist_utils._compression_dist(w1, w2)
#             _val_list.append(s)
#         if len(_val_list) == 0:
#             _val_list = [ -1 ]
#         val_list.append( _val_list )
#     if len(val_list) == 0:
#         val_list = [ [-1] ]
    for w1 in obs_ngrams:
        _val_list = []
        for w2 in target_ngrams:
            s = dist_utils._compression_dist(w1, w2)
            _val_list.append(s)
        if len(_val_list) == 0:
            _val_list = [-1]
        val_list.append( max(_val_list) )
    if len(val_list) == 0:
        val_list = [-1]
    return min(val_list)

def edit_dist_ngram(obs, target, ngram=2, token_pattern=" ", agg=[np.min, np.max]):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []

    for w1 in obs_ngrams:
        _val_list = []
        for w2 in target_ngrams:
            s = dist_utils._edit_dist(w1, w2)
            _val_list.append(s)
        if len(_val_list) == 0:
            _val_list = [-1]
        val_list.append( agg[0](_val_list) )
    if len(val_list) == 0:
        val_list = [-1]
    return float(agg[1](val_list))

In [3]:
for NGRAMS in [1,2,3]:
    train['jaccard_n%s'%NGRAMS] = train.apply(lambda x: jaccard_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)
    train['dicedistence_n%s'%NGRAMS] = train.apply(lambda x: dicedistence_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)

train['compression_dist'] = train.apply(lambda x: compression_dist(x['question1'],x['question2']), axis=1)
train['edit_dist'] = train.apply(lambda x: edit_dist(x['question1'],x['question2']), axis=1)

np_dict = { 'mean':np.mean, 'min':np.mean, 'max':np.max, 'median':np.median, 'std':np.std }

for AGG_NGRAMS in [2,3]:
    for agg1 in np_dict.keys():
        for agg2 in np_dict.keys():
            AGG_BY = agg1 + '_' + agg2
            AGG_FUNC = [np_dict[agg1],np_dict[agg2]]
            # train['compression_dist_agg_n%s'%AGG_NGRAMS] = train.apply(lambda x: compression_dist_ngram(x['question1'],x['question2'],ngram=AGG_NGRAMS), axis=1)
            train['edit_dist_agg_n%s_%s'%(AGG_NGRAMS,AGG_BY)] = train.apply(lambda x: 
                edit_dist_ngram(x['question1'],x['question2'], ngram=AGG_NGRAMS, agg=AGG_FUNC), axis=1)

train.corr()


Out[3]:
jaccard_n1 dicedistence_n1 jaccard_n2 dicedistence_n2 jaccard_n3 dicedistence_n3 compression_dist edit_dist edit_dist_agg_n2_std_std edit_dist_agg_n2_std_max ... edit_dist_agg_n3_median_std edit_dist_agg_n3_median_max edit_dist_agg_n3_median_min edit_dist_agg_n3_median_median edit_dist_agg_n3_median_mean edit_dist_agg_n3_mean_std edit_dist_agg_n3_mean_max edit_dist_agg_n3_mean_min edit_dist_agg_n3_mean_median edit_dist_agg_n3_mean_mean
jaccard_n1 1.000000 0.982943 0.934118 0.942921 0.868539 0.886974 -0.877246 -0.890037 0.138353 0.556003 ... 0.088286 -0.246377 -0.197416 -0.190264 -0.197416 0.114764 -0.561728 -0.627769 -0.648024 -0.627769
dicedistence_n1 0.982943 1.000000 0.883537 0.909730 0.804859 0.830277 -0.878147 -0.875469 0.227130 0.634191 ... 0.125850 -0.258933 -0.240955 -0.227452 -0.240955 0.155383 -0.563821 -0.655982 -0.666951 -0.655982
jaccard_n2 0.934118 0.883537 1.000000 0.990633 0.972191 0.977776 -0.813558 -0.851613 0.021721 0.429524 ... 0.028798 -0.207780 -0.119122 -0.118729 -0.119122 0.096264 -0.503711 -0.568843 -0.587434 -0.568843
dicedistence_n2 0.942921 0.909730 0.990633 1.000000 0.942598 0.960532 -0.829129 -0.863995 0.107304 0.501708 ... 0.051561 -0.210232 -0.136617 -0.126057 -0.136617 0.136016 -0.501605 -0.592744 -0.601548 -0.592744
jaccard_n3 0.868539 0.804859 0.972191 0.942598 1.000000 0.993172 -0.744493 -0.784451 -0.102441 0.306800 ... -0.024181 -0.208826 -0.078909 -0.089606 -0.078909 0.027464 -0.488843 -0.514491 -0.551257 -0.514491
dicedistence_n3 0.886974 0.830277 0.977776 0.960532 0.993172 1.000000 -0.759210 -0.800785 -0.061266 0.343137 ... -0.020811 -0.202706 -0.075709 -0.084389 -0.075709 0.042157 -0.485523 -0.526487 -0.564213 -0.526487
compression_dist -0.877246 -0.878147 -0.813558 -0.829129 -0.744493 -0.759210 1.000000 0.909040 -0.315299 -0.667567 ... -0.248710 0.396686 0.408671 0.382662 0.408671 -0.278608 0.649737 0.792047 0.786380 0.792047
edit_dist -0.890037 -0.875469 -0.851613 -0.863995 -0.784451 -0.800785 0.909040 1.000000 -0.266988 -0.632327 ... -0.202496 0.348619 0.342492 0.347726 0.342492 -0.217338 0.625375 0.745811 0.741253 0.745811
edit_dist_agg_n2_std_std 0.138353 0.227130 0.021721 0.107304 -0.102441 -0.061266 -0.315299 -0.266988 1.000000 0.781036 ... 0.555425 -0.106513 -0.483549 -0.351782 -0.483549 0.720441 0.008608 -0.474480 -0.357057 -0.474480
edit_dist_agg_n2_std_max 0.556003 0.634191 0.429524 0.501708 0.306800 0.343137 -0.667567 -0.632327 0.781036 1.000000 ... 0.475221 -0.251333 -0.516681 -0.448040 -0.516681 0.532920 -0.417932 -0.741256 -0.666954 -0.741256
edit_dist_agg_n2_std_min 0.851095 0.871559 0.787678 0.821856 0.713768 0.745780 -0.904368 -0.879628 0.347696 0.756577 ... 0.276326 -0.402295 -0.427186 -0.412173 -0.427186 0.280806 -0.712559 -0.859758 -0.857794 -0.859758
edit_dist_agg_n2_std_median 0.829180 0.842296 0.764045 0.794163 0.695818 0.729524 -0.871313 -0.833437 0.291834 0.655888 ... 0.211872 -0.399988 -0.381115 -0.383029 -0.381115 0.223399 -0.676569 -0.810936 -0.841486 -0.810936
edit_dist_agg_n2_std_mean 0.851095 0.871559 0.787678 0.821856 0.713768 0.745780 -0.904368 -0.879628 0.347696 0.756577 ... 0.276326 -0.402295 -0.427186 -0.412173 -0.427186 0.280806 -0.712559 -0.859758 -0.857794 -0.859758
edit_dist_agg_n2_max_std 0.010454 0.016230 -0.066117 -0.063150 -0.087500 -0.086634 -0.192694 -0.096633 0.169235 0.115380 ... 0.145523 -0.155330 -0.297613 -0.410785 -0.297613 0.131246 -0.118600 -0.226801 -0.269334 -0.226801
edit_dist_agg_n2_max_max -0.160835 -0.148975 -0.172274 -0.159412 -0.167163 -0.134106 0.192889 0.208851 -0.301776 -0.256738 ... -0.405893 0.174703 0.377508 0.222189 0.377508 -0.491801 0.088920 0.360744 0.244825 0.360744
edit_dist_agg_n2_max_min 0.035422 0.038300 0.037323 0.051476 0.027659 0.060555 0.160104 0.113877 -0.315762 -0.226423 ... -0.447492 0.277208 0.566274 0.531946 0.566274 -0.446318 0.177322 0.426427 0.359025 0.426427
edit_dist_agg_n2_max_median 0.129493 0.125554 0.106930 0.114613 0.100546 0.129177 0.056511 0.020058 -0.251644 -0.133157 ... -0.391895 0.335152 0.530257 0.476662 0.530257 -0.374832 0.181187 0.368299 0.300138 0.368299
edit_dist_agg_n2_max_mean 0.035422 0.038300 0.037323 0.051476 0.027659 0.060555 0.160104 0.113877 -0.315762 -0.226423 ... -0.447492 0.277208 0.566274 0.531946 0.566274 -0.446318 0.177322 0.426427 0.359025 0.426427
edit_dist_agg_n2_min_std -0.014947 0.038638 -0.075161 -0.031109 -0.146630 -0.128776 -0.156427 -0.059137 0.692777 0.451432 ... 0.773903 0.090170 -0.577464 -0.421909 -0.577464 0.892789 0.231117 -0.389648 -0.266805 -0.389648
edit_dist_agg_n2_min_max -0.491705 -0.487911 -0.461371 -0.451262 -0.459216 -0.445738 0.577263 0.582751 0.029921 -0.393605 ... -0.074074 0.651361 0.402426 0.419721 0.402426 0.103166 0.884805 0.648109 0.623791 0.648109
edit_dist_agg_n2_min_min -0.617661 -0.646925 -0.550569 -0.572689 -0.496923 -0.504740 0.785681 0.729538 -0.456667 -0.733514 ... -0.566262 0.545719 0.772967 0.726884 0.772967 -0.508398 0.696590 0.964702 0.913646 0.964702
edit_dist_agg_n2_min_median -0.657343 -0.675227 -0.587556 -0.604566 -0.549332 -0.561770 0.791280 0.731323 -0.389883 -0.662481 ... -0.458633 0.504874 0.678971 0.685379 0.678971 -0.429264 0.649240 0.915035 0.925209 0.915035
edit_dist_agg_n2_min_mean -0.617661 -0.646925 -0.550569 -0.572689 -0.496923 -0.504740 0.785681 0.729538 -0.456667 -0.733514 ... -0.566262 0.545719 0.772967 0.726884 0.772967 -0.508398 0.696590 0.964702 0.913646 0.964702
edit_dist_agg_n2_median_std -0.108235 -0.077153 -0.108837 -0.083364 -0.165583 -0.166629 -0.010691 0.018010 0.470717 0.254537 ... 0.697806 0.073879 -0.502093 -0.351949 -0.502093 0.770701 0.226274 -0.262351 -0.128054 -0.262351
edit_dist_agg_n2_median_max -0.346413 -0.338734 -0.349710 -0.333362 -0.385087 -0.369743 0.433587 0.390597 0.081115 -0.237288 ... -0.138844 0.566589 0.415600 0.355884 0.415600 0.016170 0.678424 0.533248 0.483071 0.533248
edit_dist_agg_n2_median_min -0.113907 -0.132541 -0.111626 -0.111498 -0.098781 -0.074973 0.272870 0.236477 -0.294306 -0.317865 ... -0.621648 0.416998 0.741864 0.616850 0.741864 -0.542874 0.318355 0.582533 0.460465 0.582533
edit_dist_agg_n2_median_median -0.092187 -0.110295 -0.075173 -0.067344 -0.067004 -0.037603 0.229700 0.191058 -0.209619 -0.240017 ... -0.498560 0.401533 0.650840 0.581260 0.650840 -0.400046 0.332671 0.517737 0.439759 0.517737
edit_dist_agg_n2_median_mean -0.113907 -0.132541 -0.111626 -0.111498 -0.098781 -0.074973 0.272870 0.236477 -0.294306 -0.317865 ... -0.621648 0.416998 0.741864 0.616850 0.741864 -0.542874 0.318355 0.582533 0.460465 0.582533
edit_dist_agg_n2_mean_std -0.014947 0.038638 -0.075161 -0.031109 -0.146630 -0.128776 -0.156427 -0.059137 0.692777 0.451432 ... 0.773903 0.090170 -0.577464 -0.421909 -0.577464 0.892789 0.231117 -0.389648 -0.266805 -0.389648
edit_dist_agg_n2_mean_max -0.491705 -0.487911 -0.461371 -0.451262 -0.459216 -0.445738 0.577263 0.582751 0.029921 -0.393605 ... -0.074074 0.651361 0.402426 0.419721 0.402426 0.103166 0.884805 0.648109 0.623791 0.648109
edit_dist_agg_n2_mean_min -0.617661 -0.646925 -0.550569 -0.572689 -0.496923 -0.504740 0.785681 0.729538 -0.456667 -0.733514 ... -0.566262 0.545719 0.772967 0.726884 0.772967 -0.508398 0.696590 0.964702 0.913646 0.964702
edit_dist_agg_n2_mean_median -0.657343 -0.675227 -0.587556 -0.604566 -0.549332 -0.561770 0.791280 0.731323 -0.389883 -0.662481 ... -0.458633 0.504874 0.678971 0.685379 0.678971 -0.429264 0.649240 0.915035 0.925209 0.915035
edit_dist_agg_n2_mean_mean -0.617661 -0.646925 -0.550569 -0.572689 -0.496923 -0.504740 0.785681 0.729538 -0.456667 -0.733514 ... -0.566262 0.545719 0.772967 0.726884 0.772967 -0.508398 0.696590 0.964702 0.913646 0.964702
edit_dist_agg_n3_std_std 0.232869 0.290246 0.153703 0.222360 0.078719 0.126468 -0.330967 -0.329875 0.699781 0.579657 ... 0.101109 -0.013119 -0.111181 -0.121014 -0.111181 0.297907 0.061116 -0.259276 -0.285587 -0.259276
edit_dist_agg_n3_std_max 0.653597 0.694704 0.577022 0.631161 0.504242 0.548873 -0.689165 -0.702987 0.477269 0.745748 ... 0.028669 -0.202065 -0.146182 -0.191351 -0.146182 0.114552 -0.447787 -0.566398 -0.605436 -0.566398
edit_dist_agg_n3_std_min 0.824385 0.826922 0.794074 0.812937 0.751881 0.777890 -0.839538 -0.839686 0.162336 0.610878 ... 0.030578 -0.376074 -0.227561 -0.291665 -0.227561 0.030810 -0.729903 -0.739328 -0.799889 -0.739328
edit_dist_agg_n3_std_median 0.807114 0.801209 0.789296 0.799398 0.757237 0.777063 -0.810100 -0.802879 0.124622 0.556205 ... 0.069679 -0.412648 -0.277760 -0.334739 -0.277760 0.069954 -0.710542 -0.753077 -0.813109 -0.753077
edit_dist_agg_n3_std_mean 0.824385 0.826922 0.794074 0.812937 0.751881 0.777890 -0.839538 -0.839686 0.162336 0.610878 ... 0.030578 -0.376074 -0.227561 -0.291665 -0.227561 0.030810 -0.729903 -0.739328 -0.799889 -0.739328
edit_dist_agg_n3_max_std -0.024934 0.002885 -0.047272 -0.022063 -0.108195 -0.103605 -0.071870 -0.031551 0.443110 0.281031 ... 0.711130 0.049121 -0.526648 -0.317456 -0.526648 0.778628 0.119868 -0.318411 -0.123093 -0.318411
edit_dist_agg_n3_max_max -0.205253 -0.200019 -0.205397 -0.194804 -0.222608 -0.205963 0.394244 0.329641 -0.275644 -0.331246 ... -0.415451 0.420766 0.556069 0.502596 0.556069 -0.320941 0.406439 0.559622 0.524683 0.559622
edit_dist_agg_n3_max_min -0.090108 -0.098373 -0.073264 -0.080557 -0.040176 -0.026663 0.292093 0.241289 -0.515081 -0.407239 ... -0.768603 0.243762 0.765209 0.624330 0.765209 -0.764135 0.162036 0.592363 0.443093 0.592363
edit_dist_agg_n3_max_median -0.079149 -0.081990 -0.056280 -0.064417 -0.022640 -0.009417 0.264872 0.242410 -0.495864 -0.405828 ... -0.661407 0.233187 0.698093 0.641834 0.698093 -0.643262 0.177940 0.553877 0.449258 0.553877
edit_dist_agg_n3_max_mean -0.090108 -0.098373 -0.073264 -0.080557 -0.040176 -0.026663 0.292093 0.241289 -0.515081 -0.407239 ... -0.768603 0.243762 0.765209 0.624330 0.765209 -0.764135 0.162036 0.592363 0.443093 0.592363
edit_dist_agg_n3_min_std 0.114764 0.155383 0.096264 0.136016 0.027464 0.042157 -0.278608 -0.217338 0.720441 0.532920 ... 0.852495 0.045251 -0.655640 -0.460291 -0.655640 1.000000 0.142937 -0.537708 -0.382294 -0.537708
edit_dist_agg_n3_min_max -0.561728 -0.563821 -0.503711 -0.501605 -0.488843 -0.485523 0.649737 0.625375 0.008608 -0.417932 ... -0.056184 0.719553 0.418558 0.438128 0.418558 0.142937 1.000000 0.704132 0.682101 0.704132
edit_dist_agg_n3_min_min -0.627769 -0.655982 -0.568843 -0.592744 -0.514491 -0.526487 0.792047 0.745811 -0.474480 -0.741256 ... -0.597825 0.529070 0.788455 0.741003 0.788455 -0.537708 0.704132 1.000000 0.946076 1.000000
edit_dist_agg_n3_min_median -0.648024 -0.666951 -0.587434 -0.601548 -0.551257 -0.564213 0.786380 0.741253 -0.357057 -0.666954 ... -0.459888 0.478250 0.679944 0.732094 0.679944 -0.382294 0.682101 0.946076 1.000000 0.946076
edit_dist_agg_n3_min_mean -0.627769 -0.655982 -0.568843 -0.592744 -0.514491 -0.526487 0.792047 0.745811 -0.474480 -0.741256 ... -0.597825 0.529070 0.788455 0.741003 0.788455 -0.537708 0.704132 1.000000 0.946076 1.000000
edit_dist_agg_n3_median_std 0.088286 0.125850 0.028798 0.051561 -0.024181 -0.020811 -0.248710 -0.202496 0.555425 0.475221 ... 1.000000 -0.015066 -0.837682 -0.667226 -0.837682 0.852495 -0.056184 -0.597825 -0.459888 -0.597825
edit_dist_agg_n3_median_max -0.246377 -0.258933 -0.207780 -0.210232 -0.208826 -0.202706 0.396686 0.348619 -0.106513 -0.251333 ... -0.015066 1.000000 0.483725 0.440354 0.483725 0.045251 0.719553 0.529070 0.478250 0.529070
edit_dist_agg_n3_median_min -0.197416 -0.240955 -0.119122 -0.136617 -0.078909 -0.075709 0.408671 0.342492 -0.483549 -0.516681 ... -0.837682 0.483725 1.000000 0.906894 1.000000 -0.655640 0.418558 0.788455 0.679944 0.788455
edit_dist_agg_n3_median_median -0.190264 -0.227452 -0.118729 -0.126057 -0.089606 -0.084389 0.382662 0.347726 -0.351782 -0.448040 ... -0.667226 0.440354 0.906894 1.000000 0.906894 -0.460291 0.438128 0.741003 0.732094 0.741003
edit_dist_agg_n3_median_mean -0.197416 -0.240955 -0.119122 -0.136617 -0.078909 -0.075709 0.408671 0.342492 -0.483549 -0.516681 ... -0.837682 0.483725 1.000000 0.906894 1.000000 -0.655640 0.418558 0.788455 0.679944 0.788455
edit_dist_agg_n3_mean_std 0.114764 0.155383 0.096264 0.136016 0.027464 0.042157 -0.278608 -0.217338 0.720441 0.532920 ... 0.852495 0.045251 -0.655640 -0.460291 -0.655640 1.000000 0.142937 -0.537708 -0.382294 -0.537708
edit_dist_agg_n3_mean_max -0.561728 -0.563821 -0.503711 -0.501605 -0.488843 -0.485523 0.649737 0.625375 0.008608 -0.417932 ... -0.056184 0.719553 0.418558 0.438128 0.418558 0.142937 1.000000 0.704132 0.682101 0.704132
edit_dist_agg_n3_mean_min -0.627769 -0.655982 -0.568843 -0.592744 -0.514491 -0.526487 0.792047 0.745811 -0.474480 -0.741256 ... -0.597825 0.529070 0.788455 0.741003 0.788455 -0.537708 0.704132 1.000000 0.946076 1.000000
edit_dist_agg_n3_mean_median -0.648024 -0.666951 -0.587434 -0.601548 -0.551257 -0.564213 0.786380 0.741253 -0.357057 -0.666954 ... -0.459888 0.478250 0.679944 0.732094 0.679944 -0.382294 0.682101 0.946076 1.000000 0.946076
edit_dist_agg_n3_mean_mean -0.627769 -0.655982 -0.568843 -0.592744 -0.514491 -0.526487 0.792047 0.745811 -0.474480 -0.741256 ... -0.597825 0.529070 0.788455 0.741003 0.788455 -0.537708 0.704132 1.000000 0.946076 1.000000

58 rows × 58 columns


In [266]:
import datetime
print datetime.datetime.now()


2017-05-25 01:43:56.646323

In [267]:
train.to_csv(config.RAW_PATH+'train_1111111.csv',index=False)

In [269]:
train_orig =  pd.read_csv(config.RAW_PATH+'train.csv', header=0)
test_orig =  pd.read_csv(config.RAW_PATH+'test.csv', header=0)
test_orig['is_duplicate'] = -1

train1 = pd.concat([train_orig[['question1', 'question2','is_duplicate']], \
        test_orig[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop='index')
train['is_duplicate'] = train1['is_duplicate']

In [270]:
train[train['is_duplicate']>=0].corr()


Out[270]:
jaccard_n1 dicedistence_n1 jaccard_n2 dicedistence_n2 jaccard_n3 dicedistence_n3 compression_dist edit_dist edit_dist_agg_n2_std_std edit_dist_agg_n2_std_max ... edit_dist_agg_n3_median_max edit_dist_agg_n3_median_min edit_dist_agg_n3_median_median edit_dist_agg_n3_median_mean edit_dist_agg_n3_mean_std edit_dist_agg_n3_mean_max edit_dist_agg_n3_mean_min edit_dist_agg_n3_mean_median edit_dist_agg_n3_mean_mean is_duplicate
jaccard_n1 1.000000 0.983648 0.905742 0.923042 0.796528 0.823201 -0.873410 -0.877460 0.218352 0.557366 ... -0.218653 -0.278028 -0.270836 -0.278028 0.113985 -0.478705 -0.570892 -0.574023 -0.570892 0.322136
dicedistence_n1 0.983648 1.000000 0.851457 0.889893 0.724861 0.763040 -0.883301 -0.871926 0.300094 0.630752 ... -0.233918 -0.302649 -0.292346 -0.302649 0.155684 -0.487522 -0.599021 -0.599014 -0.599021 0.367482
jaccard_n2 0.905742 0.851457 1.000000 0.985303 0.952456 0.961399 -0.778677 -0.809950 0.124674 0.424180 ... -0.140668 -0.182755 -0.180052 -0.182755 0.092353 -0.377393 -0.468938 -0.478263 -0.468938 0.191811
dicedistence_n2 0.923042 0.889893 0.985303 1.000000 0.902431 0.933682 -0.809520 -0.833151 0.211759 0.505759 ... -0.163152 -0.215242 -0.209780 -0.215242 0.134406 -0.405831 -0.515965 -0.522163 -0.515965 0.240642
jaccard_n3 0.796528 0.724861 0.952456 0.902431 1.000000 0.986737 -0.664738 -0.705660 -0.006118 0.261021 ... -0.055785 -0.080148 -0.081498 -0.080148 0.043047 -0.265128 -0.340203 -0.355774 -0.340203 0.109883
dicedistence_n3 0.823201 0.763040 0.961399 0.933682 0.986737 1.000000 -0.694339 -0.733957 0.047581 0.309062 ... -0.064470 -0.096115 -0.095873 -0.096115 0.073591 -0.286078 -0.375930 -0.390380 -0.375930 0.145447
compression_dist -0.873410 -0.883301 -0.778677 -0.809520 -0.664738 -0.694339 1.000000 0.912464 -0.357816 -0.673521 ... 0.328422 0.415863 0.397073 0.415863 -0.240151 0.571655 0.707521 0.700699 0.707521 -0.375731
edit_dist -0.877460 -0.871926 -0.809950 -0.833151 -0.705660 -0.733957 0.912464 1.000000 -0.261785 -0.594266 ... 0.305680 0.378890 0.368472 0.378890 -0.169119 0.565274 0.673387 0.672552 0.673387 -0.337489
edit_dist_agg_n2_std_std 0.218352 0.300094 0.124674 0.211759 -0.006118 0.047581 -0.357816 -0.261785 1.000000 0.788365 ... -0.112065 -0.342641 -0.287802 -0.342641 0.680104 -0.072229 -0.407506 -0.366728 -0.407506 0.203267
edit_dist_agg_n2_std_max 0.557366 0.630752 0.424180 0.505759 0.261021 0.309062 -0.673521 -0.594266 0.788365 1.000000 ... -0.289265 -0.491662 -0.445111 -0.491662 0.522129 -0.432572 -0.683978 -0.641201 -0.683978 0.330128
edit_dist_agg_n2_std_min 0.819611 0.845216 0.724390 0.776963 0.583885 0.631624 -0.877966 -0.842250 0.361768 0.755346 ... -0.428653 -0.517532 -0.500054 -0.517532 0.221911 -0.704856 -0.818024 -0.811743 -0.818024 0.352788
edit_dist_agg_n2_std_median 0.801569 0.813535 0.730075 0.773432 0.608117 0.658537 -0.834874 -0.815153 0.282290 0.629261 ... -0.394172 -0.468643 -0.460587 -0.468643 0.175812 -0.661669 -0.768218 -0.782367 -0.768218 0.316072
edit_dist_agg_n2_std_mean 0.819611 0.845216 0.724390 0.776963 0.583885 0.631624 -0.877966 -0.842250 0.361768 0.755346 ... -0.428653 -0.517532 -0.500054 -0.517532 0.221911 -0.704856 -0.818024 -0.811743 -0.818024 0.352788
edit_dist_agg_n2_max_std 0.058510 0.069623 0.025368 0.038576 -0.000376 0.008106 -0.164933 -0.123612 0.259637 0.216282 ... -0.098499 -0.242881 -0.215570 -0.242881 0.317956 -0.083405 -0.243362 -0.218035 -0.243362 0.085038
edit_dist_agg_n2_max_max -0.077590 -0.080404 -0.044794 -0.054068 0.001757 -0.001051 0.164005 0.140567 -0.135714 -0.157240 ... 0.471501 0.503883 0.489616 0.503883 -0.122871 0.427249 0.447653 0.428583 0.447653 -0.028569
edit_dist_agg_n2_max_min -0.088768 -0.098269 -0.040280 -0.058249 0.017168 0.008524 0.252064 0.200361 -0.326730 -0.287613 ... 0.467189 0.601797 0.563190 0.601797 -0.359875 0.416483 0.561535 0.524431 0.561535 -0.076961
edit_dist_agg_n2_max_median -0.065331 -0.073237 -0.024027 -0.039830 0.026155 0.018587 0.210204 0.166447 -0.280963 -0.241626 ... 0.408411 0.516246 0.482165 0.516246 -0.302562 0.365759 0.485137 0.453626 0.485137 -0.059027
edit_dist_agg_n2_max_mean -0.088768 -0.098269 -0.040280 -0.058249 0.017168 0.008524 0.252064 0.200361 -0.326730 -0.287613 ... 0.467189 0.601797 0.563190 0.601797 -0.359875 0.416483 0.561535 0.524431 0.561535 -0.076961
edit_dist_agg_n2_min_std 0.036143 0.077771 -0.004552 0.032458 -0.065391 -0.047852 -0.171653 -0.097067 0.662641 0.474167 ... -0.075366 -0.411035 -0.350162 -0.411035 0.796143 0.042490 -0.353570 -0.309203 -0.353570 0.064207
edit_dist_agg_n2_min_max -0.476532 -0.483349 -0.380350 -0.407500 -0.279369 -0.302152 0.553566 0.536624 -0.015322 -0.379139 ... 0.685893 0.605277 0.598731 0.605277 0.132952 0.861834 0.731905 0.711874 0.731905 -0.225322
edit_dist_agg_n2_min_min -0.586775 -0.616611 -0.471537 -0.519564 -0.336400 -0.371866 0.716303 0.665331 -0.402376 -0.672394 ... 0.693899 0.832939 0.801643 0.832939 -0.338936 0.810832 0.957576 0.932535 0.957576 -0.277161
edit_dist_agg_n2_min_median -0.589610 -0.617273 -0.481943 -0.528881 -0.352195 -0.389742 0.706236 0.661217 -0.375616 -0.630017 ... 0.659586 0.791546 0.773808 0.791546 -0.307338 0.774584 0.926728 0.924593 0.926728 -0.266156
edit_dist_agg_n2_min_mean -0.586775 -0.616611 -0.471537 -0.519564 -0.336400 -0.371866 0.716303 0.665331 -0.402376 -0.672394 ... 0.693899 0.832939 0.801643 0.832939 -0.338936 0.810832 0.957576 0.932535 0.957576 -0.277161
edit_dist_agg_n2_median_std 0.039656 0.054153 -0.012291 0.002490 -0.057575 -0.054843 -0.120359 -0.078673 0.305547 0.271644 ... -0.176854 -0.438021 -0.402739 -0.438021 0.459837 -0.114629 -0.333873 -0.302588 -0.333873 0.038746
edit_dist_agg_n2_median_max -0.172837 -0.184758 -0.118440 -0.134702 -0.063316 -0.071653 0.266145 0.237718 -0.053064 -0.157790 ... 0.606823 0.517554 0.504178 0.517554 0.038616 0.579255 0.510735 0.494859 0.510735 -0.094427
edit_dist_agg_n2_median_min -0.214119 -0.232352 -0.136307 -0.157257 -0.060986 -0.068267 0.326871 0.287549 -0.216892 -0.305933 ... 0.673023 0.770605 0.744430 0.770605 -0.249527 0.599271 0.692219 0.662225 0.692219 -0.104888
edit_dist_agg_n2_median_median -0.211996 -0.227607 -0.143905 -0.162541 -0.074115 -0.080713 0.311804 0.280529 -0.180509 -0.274616 ... 0.642865 0.718914 0.704535 0.718914 -0.202218 0.575698 0.652856 0.631748 0.652856 -0.094554
edit_dist_agg_n2_median_mean -0.214119 -0.232352 -0.136307 -0.157257 -0.060986 -0.068267 0.326871 0.287549 -0.216892 -0.305933 ... 0.673023 0.770605 0.744430 0.770605 -0.249527 0.599271 0.692219 0.662225 0.692219 -0.104888
edit_dist_agg_n2_mean_std 0.036143 0.077771 -0.004552 0.032458 -0.065391 -0.047852 -0.171653 -0.097067 0.662641 0.474167 ... -0.075366 -0.411035 -0.350162 -0.411035 0.796143 0.042490 -0.353570 -0.309203 -0.353570 0.064207
edit_dist_agg_n2_mean_max -0.476532 -0.483349 -0.380350 -0.407500 -0.279369 -0.302152 0.553566 0.536624 -0.015322 -0.379139 ... 0.685893 0.605277 0.598731 0.605277 0.132952 0.861834 0.731905 0.711874 0.731905 -0.225322
edit_dist_agg_n2_mean_min -0.586775 -0.616611 -0.471537 -0.519564 -0.336400 -0.371866 0.716303 0.665331 -0.402376 -0.672394 ... 0.693899 0.832939 0.801643 0.832939 -0.338936 0.810832 0.957576 0.932535 0.957576 -0.277161
edit_dist_agg_n2_mean_median -0.589610 -0.617273 -0.481943 -0.528881 -0.352195 -0.389742 0.706236 0.661217 -0.375616 -0.630017 ... 0.659586 0.791546 0.773808 0.791546 -0.307338 0.774584 0.926728 0.924593 0.926728 -0.266156
edit_dist_agg_n2_mean_mean -0.586775 -0.616611 -0.471537 -0.519564 -0.336400 -0.371866 0.716303 0.665331 -0.402376 -0.672394 ... 0.693899 0.832939 0.801643 0.832939 -0.338936 0.810832 0.957576 0.932535 0.957576 -0.277161
edit_dist_agg_n3_std_std 0.267800 0.338883 0.213005 0.295010 0.128733 0.200282 -0.362231 -0.283037 0.778207 0.627390 ... 0.062344 -0.108395 -0.070112 -0.108395 0.563427 0.058225 -0.252170 -0.229447 -0.252170 0.232153
edit_dist_agg_n3_std_max 0.645287 0.701292 0.565771 0.640750 0.449987 0.517853 -0.709764 -0.665816 0.572468 0.783325 ... -0.076907 -0.205162 -0.180778 -0.205162 0.363962 -0.315347 -0.521116 -0.501328 -0.521116 0.351438
edit_dist_agg_n3_std_min 0.799006 0.815431 0.745034 0.786844 0.645992 0.692754 -0.842622 -0.834750 0.257980 0.638389 ... -0.230747 -0.286223 -0.279181 -0.286223 0.137147 -0.566978 -0.658287 -0.665274 -0.658287 0.338212
edit_dist_agg_n3_std_median 0.782820 0.789530 0.744973 0.776789 0.660156 0.701253 -0.814931 -0.815917 0.201353 0.563522 ... -0.229478 -0.277998 -0.275078 -0.277998 0.111550 -0.546322 -0.638601 -0.664158 -0.638601 0.306767
edit_dist_agg_n3_std_mean 0.799006 0.815431 0.745034 0.786844 0.645992 0.692754 -0.842622 -0.834750 0.257980 0.638389 ... -0.230747 -0.286223 -0.279181 -0.286223 0.137147 -0.566978 -0.658287 -0.665274 -0.658287 0.338212
edit_dist_agg_n3_max_std -0.003888 0.009558 -0.031674 -0.020711 -0.057162 -0.054213 -0.050063 -0.024098 0.274251 0.193292 ... 0.002133 -0.293861 -0.258371 -0.293861 0.563423 0.070252 -0.197738 -0.162451 -0.197738 0.023270
edit_dist_agg_n3_max_max -0.157989 -0.166978 -0.098090 -0.118063 -0.018853 -0.026183 0.299251 0.260304 -0.260639 -0.334085 ... 0.720611 0.734795 0.704048 0.734795 -0.167069 0.669241 0.692008 0.658212 0.692008 -0.077524
edit_dist_agg_n3_max_min -0.136211 -0.151245 -0.067723 -0.091853 0.016877 0.008419 0.286625 0.242152 -0.366838 -0.391777 ... 0.686405 0.823579 0.784410 0.823579 -0.388244 0.585784 0.730643 0.687724 0.730643 -0.078938
edit_dist_agg_n3_max_median -0.130367 -0.145668 -0.061744 -0.086014 0.021459 0.012672 0.275393 0.233569 -0.345685 -0.371430 ... 0.678997 0.798687 0.771310 0.798687 -0.346635 0.577481 0.709155 0.677580 0.709155 -0.075459
edit_dist_agg_n3_max_mean -0.136211 -0.151245 -0.067723 -0.091853 0.016877 0.008419 0.286625 0.242152 -0.366838 -0.391777 ... 0.686405 0.823579 0.784410 0.823579 -0.388244 0.585784 0.730643 0.687724 0.730643 -0.078938
edit_dist_agg_n3_min_std 0.113985 0.155684 0.092353 0.134406 0.043047 0.073591 -0.240151 -0.169119 0.680104 0.522129 ... 0.062232 -0.367999 -0.289450 -0.367999 1.000000 0.139260 -0.366058 -0.313218 -0.366058 0.107007
edit_dist_agg_n3_min_max -0.478705 -0.487522 -0.377393 -0.405831 -0.265128 -0.286078 0.571655 0.565274 -0.072229 -0.432572 ... 0.831166 0.737419 0.727431 0.737419 0.139260 1.000000 0.840906 0.809631 0.840906 -0.212323
edit_dist_agg_n3_min_min -0.570892 -0.599021 -0.468938 -0.515965 -0.340203 -0.375930 0.707521 0.673387 -0.407506 -0.683978 ... 0.728308 0.880807 0.849162 0.880807 -0.366058 0.840906 1.000000 0.975044 1.000000 -0.264395
edit_dist_agg_n3_min_median -0.574023 -0.599014 -0.478263 -0.522163 -0.355774 -0.390380 0.700699 0.672552 -0.366728 -0.641201 ... 0.701415 0.849204 0.845677 0.849204 -0.313218 0.809631 0.975044 1.000000 0.975044 -0.253957
edit_dist_agg_n3_min_mean -0.570892 -0.599021 -0.468938 -0.515965 -0.340203 -0.375930 0.707521 0.673387 -0.407506 -0.683978 ... 0.728308 0.880807 0.849162 0.880807 -0.366058 0.840906 1.000000 0.975044 1.000000 -0.264395
edit_dist_agg_n3_median_std 0.134945 0.157886 0.085565 0.111426 0.032753 0.049555 -0.242809 -0.193213 0.487596 0.454783 ... 0.034709 -0.486253 -0.395581 -0.486253 0.813055 -0.046060 -0.439843 -0.393561 -0.439843 0.100484
edit_dist_agg_n3_median_max -0.218653 -0.233918 -0.140668 -0.163152 -0.055785 -0.064470 0.328422 0.305680 -0.112065 -0.289265 ... 1.000000 0.819782 0.792875 0.819782 0.062232 0.831166 0.728308 0.701415 0.728308 -0.101663
edit_dist_agg_n3_median_min -0.278028 -0.302649 -0.182755 -0.215242 -0.080148 -0.096115 0.415863 0.378890 -0.342641 -0.491662 ... 0.819782 1.000000 0.966317 1.000000 -0.367999 0.737419 0.880807 0.849204 0.880807 -0.137940
edit_dist_agg_n3_median_median -0.270836 -0.292346 -0.180052 -0.209780 -0.081498 -0.095873 0.397073 0.368472 -0.287802 -0.445111 ... 0.792875 0.966317 1.000000 0.966317 -0.289450 0.727431 0.849162 0.845677 0.849162 -0.125726
edit_dist_agg_n3_median_mean -0.278028 -0.302649 -0.182755 -0.215242 -0.080148 -0.096115 0.415863 0.378890 -0.342641 -0.491662 ... 0.819782 1.000000 0.966317 1.000000 -0.367999 0.737419 0.880807 0.849204 0.880807 -0.137940
edit_dist_agg_n3_mean_std 0.113985 0.155684 0.092353 0.134406 0.043047 0.073591 -0.240151 -0.169119 0.680104 0.522129 ... 0.062232 -0.367999 -0.289450 -0.367999 1.000000 0.139260 -0.366058 -0.313218 -0.366058 0.107007
edit_dist_agg_n3_mean_max -0.478705 -0.487522 -0.377393 -0.405831 -0.265128 -0.286078 0.571655 0.565274 -0.072229 -0.432572 ... 0.831166 0.737419 0.727431 0.737419 0.139260 1.000000 0.840906 0.809631 0.840906 -0.212323
edit_dist_agg_n3_mean_min -0.570892 -0.599021 -0.468938 -0.515965 -0.340203 -0.375930 0.707521 0.673387 -0.407506 -0.683978 ... 0.728308 0.880807 0.849162 0.880807 -0.366058 0.840906 1.000000 0.975044 1.000000 -0.264395
edit_dist_agg_n3_mean_median -0.574023 -0.599014 -0.478263 -0.522163 -0.355774 -0.390380 0.700699 0.672552 -0.366728 -0.641201 ... 0.701415 0.849204 0.845677 0.849204 -0.313218 0.809631 0.975044 1.000000 0.975044 -0.253957
edit_dist_agg_n3_mean_mean -0.570892 -0.599021 -0.468938 -0.515965 -0.340203 -0.375930 0.707521 0.673387 -0.407506 -0.683978 ... 0.728308 0.880807 0.849162 0.880807 -0.366058 0.840906 1.000000 0.975044 1.000000 -0.264395
is_duplicate 0.322136 0.367482 0.191811 0.240642 0.109883 0.145447 -0.375731 -0.337489 0.203267 0.330128 ... -0.101663 -0.137940 -0.125726 -0.137940 0.107007 -0.212323 -0.264395 -0.253957 -0.264395 1.000000

59 rows × 59 columns


In [4]:
def get_position_list(obs, target, ngram=1, token_pattern=" "):
    """
        Get the list of positions of obs in target
    """
    obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
    target_tokens = nlp_utils._tokenize(str(target), token_pattern)
    obs = ngram_utils._ngrams(obs_tokens, ngram)
    target = ngram_utils._ngrams(target_tokens, ngram)
    
    pos_of_obs_in_target = [0]
    if len(obs) != 0:
        pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
        if len(pos_of_obs_in_target) == 0:
            pos_of_obs_in_target = [0]
    return pos_of_obs_in_target, len(obs)


for ngram in [1,2]:
    for target_name in ['question1','question2']:
        for obs_name in ['question1','question2']:
            if target_name != obs_name:
                position = train[['question1','question2']].apply(lambda x: get_position_list(obs=x[obs_name],target=x[target_name],ngram=ngram), axis=1)
                pos = [i[0] for i in position]
                obs_len = [i[1] for i in position]
                ## stats feat on pos
                train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np.min, pos)
                train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np.mean, pos)
                train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np.median, pos)
                train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np.max, pos)
                train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np.std, pos)
                # stats feat on normalized_pos
                train["norm_pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)], obs_len)
                train["norm_pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] ,obs_len)

train.corr()[train.corr().index.str.contains('pos_of')]


Out[4]:
id qid1 qid2 is_duplicate jaccard_n123 dicedistence_n123 compression_dist edit_dist edit_dist_agg_n2 pos_of_question2_n1_in_question1_min ... pos_of_question1_n2_in_question2_min pos_of_question1_n2_in_question2_mean pos_of_question1_n2_in_question2_median pos_of_question1_n2_in_question2_max pos_of_question1_n2_in_question2_std norm_pos_of_question1_n2_in_question2_min norm_pos_of_question1_n2_in_question2_mean norm_pos_of_question1_n2_in_question2_median norm_pos_of_question1_n2_in_question2_max norm_pos_of_question1_n2_in_question2_std
pos_of_question2_n1_in_question1_min 0.091188 0.091188 0.091188 -0.070505 -0.270079 -0.285221 0.299071 0.314354 0.095370 1.000000 ... 0.136549 -0.014180 -0.007304 -0.102428 -0.220998 0.126898 -0.015142 -0.010791 -0.117006 -0.270792
pos_of_question2_n1_in_question1_mean 0.081232 0.081232 0.081232 0.036182 0.073449 0.068550 0.066148 0.005480 0.223616 0.562078 ... 0.330662 0.436351 0.434052 0.409361 0.252429 0.209560 0.252194 0.251629 0.236988 0.093090
pos_of_question2_n1_in_question1_median 0.077059 0.077059 0.077059 0.028635 0.058116 0.053006 0.068766 0.006716 0.212374 0.525533 ... 0.371946 0.456286 0.460653 0.406857 0.223970 0.247964 0.283484 0.286801 0.253870 0.082694
pos_of_question2_n1_in_question1_max 0.056250 0.056250 0.056250 0.082929 0.216690 0.217676 -0.054016 -0.128350 0.225545 0.253399 ... 0.292477 0.491941 0.480088 0.516095 0.401354 0.151540 0.266443 0.260082 0.305770 0.228553
pos_of_question2_n1_in_question1_std 0.024478 0.024478 0.024478 0.090390 0.249456 0.261758 -0.115304 -0.199164 0.193695 -0.147556 ... 0.234796 0.443623 0.427193 0.484413 0.413629 0.107738 0.242712 0.233664 0.301975 0.275150
norm_pos_of_question2_n1_in_question1_min 0.083675 0.083675 0.083675 -0.022449 -0.232674 -0.232562 0.124182 0.201850 -0.150352 0.842050 ... 0.086237 -0.088551 -0.082982 -0.174595 -0.266676 0.142736 -0.005863 -0.003723 -0.111243 -0.266057
norm_pos_of_question2_n1_in_question1_mean 0.117718 0.117718 0.117718 0.232939 0.226316 0.273982 -0.320109 -0.305905 -0.121146 0.551932 ... 0.336494 0.355985 0.358512 0.294229 0.139657 0.382569 0.434812 0.429571 0.403077 0.188142
norm_pos_of_question2_n1_in_question1_median 0.112239 0.112239 0.112239 0.216539 0.202683 0.245646 -0.292469 -0.289202 -0.097456 0.526367 ... 0.406495 0.412683 0.423171 0.324405 0.127978 0.441299 0.486974 0.487168 0.435851 0.176461
norm_pos_of_question2_n1_in_question1_max 0.101182 0.101182 0.101182 0.336523 0.470675 0.536000 -0.519161 -0.541313 -0.077635 0.189331 ... 0.312481 0.462734 0.455256 0.464052 0.359845 0.335077 0.507346 0.494531 0.556283 0.426362
norm_pos_of_question2_n1_in_question1_std 0.047338 0.047338 0.047338 0.320685 0.488246 0.557265 -0.508239 -0.573647 -0.003227 -0.322718 ... 0.242315 0.427838 0.415603 0.457938 0.407233 0.248703 0.449633 0.435070 0.529076 0.477085
pos_of_question1_n1_in_question2_min 0.048041 0.048041 0.048041 -0.041662 -0.270854 -0.277709 0.201826 0.253632 -0.075051 0.690550 ... 0.166150 0.015352 0.020983 -0.078921 -0.206003 0.141570 -0.005352 -0.003480 -0.111340 -0.271312
pos_of_question1_n1_in_question2_mean 0.031870 0.031870 0.031870 0.097225 0.113879 0.114228 -0.007903 -0.064612 -0.027036 0.337753 ... 0.347105 0.503235 0.481384 0.514724 0.375556 0.154097 0.222306 0.208786 0.244921 0.156011
pos_of_question1_n1_in_question2_median 0.020851 0.020851 0.020851 0.065481 0.093428 0.090899 0.009473 -0.057161 -0.016962 0.328675 ... 0.383212 0.506349 0.502470 0.483050 0.310223 0.200463 0.253145 0.251839 0.248441 0.116319
pos_of_question1_n1_in_question2_max 0.019842 0.019842 0.019842 0.132564 0.224631 0.228405 -0.079889 -0.151789 -0.010276 0.129886 ... 0.264906 0.511017 0.479089 0.580738 0.509689 0.059344 0.193050 0.174065 0.272049 0.274810
pos_of_question1_n1_in_question2_std 0.023942 0.023942 0.023942 0.135500 0.230860 0.242316 -0.082273 -0.175010 -0.001219 -0.065597 ... 0.204861 0.453891 0.414759 0.537644 0.514957 0.016319 0.164335 0.141360 0.259986 0.312461
norm_pos_of_question1_n1_in_question2_min 0.072126 0.072126 0.072126 -0.031879 -0.223887 -0.219974 0.066487 0.156178 -0.114076 0.647565 ... 0.091735 -0.086459 -0.077340 -0.178541 -0.275435 0.189259 0.036452 0.039895 -0.080838 -0.268061
norm_pos_of_question1_n1_in_question2_mean 0.142549 0.142549 0.142549 0.235419 0.238150 0.286515 -0.345741 -0.328967 -0.071357 0.464775 ... 0.360738 0.366475 0.370272 0.295439 0.117613 0.442750 0.491408 0.486719 0.447978 0.185505
norm_pos_of_question1_n1_in_question2_median 0.133428 0.133428 0.133428 0.204796 0.209328 0.250635 -0.305061 -0.305119 -0.058535 0.452200 ... 0.416440 0.401629 0.421177 0.299519 0.079217 0.495524 0.530134 0.537860 0.459306 0.146749
norm_pos_of_question1_n1_in_question2_max 0.133814 0.133814 0.133814 0.339833 0.474708 0.539406 -0.515086 -0.536212 -0.030594 0.183550 ... 0.325544 0.472485 0.464993 0.470038 0.354625 0.368647 0.543963 0.531180 0.588647 0.433673
norm_pos_of_question1_n1_in_question2_std 0.103528 0.103528 0.103528 0.343918 0.480212 0.546342 -0.462842 -0.536711 0.013861 -0.180206 ... 0.250048 0.439799 0.421882 0.473457 0.424218 0.253075 0.462803 0.445235 0.548333 0.497691
pos_of_question2_n2_in_question1_min 0.112933 0.112933 0.112933 0.086098 -0.054563 -0.017795 0.040305 0.026311 0.184187 0.158557 ... 0.695424 0.557182 0.574308 0.355958 -0.022894 0.651668 0.564387 0.567701 0.404689 -0.053071
pos_of_question2_n2_in_question1_mean 0.094554 0.094554 0.094554 0.142627 0.287418 0.308433 -0.215077 -0.244118 0.187821 0.044508 ... 0.630198 0.769977 0.775899 0.688216 0.395937 0.545062 0.663055 0.662129 0.631380 0.309004
pos_of_question2_n2_in_question1_median 0.092296 0.092296 0.092296 0.146853 0.286346 0.306405 -0.212481 -0.243056 0.181198 0.048001 ... 0.631512 0.758888 0.782520 0.661613 0.356264 0.546606 0.661876 0.672360 0.619952 0.288452
pos_of_question2_n2_in_question1_max 0.060870 0.060870 0.060870 0.161251 0.489671 0.497004 -0.367295 -0.401878 0.165471 -0.045333 ... 0.495886 0.815616 0.800102 0.845161 0.660533 0.391091 0.635615 0.623132 0.708983 0.533150
pos_of_question2_n2_in_question1_std -0.001727 -0.001727 -0.001727 0.148493 0.666623 0.658158 -0.519971 -0.552408 0.047532 -0.221012 ... 0.026526 0.572376 0.524895 0.795447 0.924171 -0.074319 0.340733 0.314633 0.592071 0.808374
norm_pos_of_question2_n2_in_question1_min 0.111356 0.111356 0.111356 0.202956 0.052363 0.117784 -0.164564 -0.160651 0.057744 0.133814 ... 0.768747 0.590003 0.604973 0.361684 -0.059002 0.868930 0.772325 0.769540 0.577749 -0.011092
norm_pos_of_question2_n2_in_question1_mean 0.099455 0.099455 0.099455 0.287862 0.440170 0.501259 -0.471590 -0.487491 0.042374 -0.005784 ... 0.680905 0.755054 0.763987 0.638993 0.327979 0.754365 0.892576 0.886443 0.844365 0.426942
norm_pos_of_question2_n2_in_question1_median 0.095527 0.095527 0.095527 0.287235 0.437420 0.496481 -0.466425 -0.483948 0.035423 -0.002570 ... 0.677878 0.746196 0.771533 0.617548 0.297640 0.749198 0.885691 0.892440 0.826913 0.403925
norm_pos_of_question2_n2_in_question1_max 0.071973 0.071973 0.071973 0.306862 0.652744 0.703385 -0.624467 -0.651315 0.028242 -0.106652 ... 0.522951 0.764205 0.754530 0.758050 0.573699 0.566225 0.848872 0.830632 0.922680 0.691381
norm_pos_of_question2_n2_in_question1_std 0.014378 0.014378 0.014378 0.240325 0.756276 0.780171 -0.657072 -0.693398 -0.035474 -0.272580 ... -0.009045 0.447703 0.414177 0.639808 0.800895 -0.035452 0.419607 0.393184 0.691857 0.946908
pos_of_question1_n2_in_question2_min 0.065133 0.065133 0.065133 0.131219 -0.026900 0.023016 -0.026914 -0.045417 0.127732 0.136549 ... 1.000000 0.811530 0.815118 0.555355 0.004510 0.876420 0.762169 0.751423 0.568876 -0.048068
pos_of_question1_n2_in_question2_mean 0.036005 0.036005 0.036005 0.199153 0.384076 0.412280 -0.317691 -0.356340 0.120830 -0.014180 ... 0.811530 1.000000 0.984414 0.928139 0.564117 0.654809 0.811064 0.793421 0.801332 0.428496
pos_of_question1_n2_in_question2_median 0.032359 0.032359 0.032359 0.202044 0.390222 0.416516 -0.324880 -0.363967 0.126561 -0.007304 ... 0.815118 0.984414 1.000000 0.884282 0.496506 0.665064 0.817854 0.821268 0.788230 0.394315
pos_of_question1_n2_in_question2_max 0.005097 0.005097 0.005097 0.204543 0.537672 0.549756 -0.413662 -0.457101 0.087944 -0.102428 ... 0.555355 0.928139 0.884282 1.000000 0.818335 0.398050 0.675576 0.643358 0.788687 0.636131
pos_of_question1_n2_in_question2_std -0.020040 -0.020040 -0.020040 0.168386 0.603279 0.600828 -0.446220 -0.486528 0.011570 -0.220998 ... 0.004510 0.564117 0.496506 0.818335 1.000000 -0.100832 0.308417 0.269074 0.575570 0.832626
norm_pos_of_question1_n2_in_question2_min 0.100338 0.100338 0.100338 0.173219 0.043421 0.109226 -0.163327 -0.175674 0.074315 0.126898 ... 0.876420 0.654809 0.665064 0.398050 -0.100832 1.000000 0.870747 0.862024 0.644361 -0.052051
norm_pos_of_question1_n2_in_question2_mean 0.096335 0.096335 0.096335 0.280216 0.453674 0.515806 -0.481751 -0.514389 0.094278 -0.015142 ... 0.762169 0.811064 0.817854 0.675576 0.308417 0.870747 1.000000 0.991265 0.932171 0.429119
norm_pos_of_question1_n2_in_question2_median 0.091951 0.091951 0.091951 0.280077 0.451625 0.511538 -0.478463 -0.511172 0.096346 -0.010791 ... 0.751423 0.793421 0.821268 0.643358 0.269074 0.862024 0.991265 1.000000 0.910921 0.404098
norm_pos_of_question1_n2_in_question2_max 0.074614 0.074614 0.074614 0.312856 0.670713 0.722127 -0.630533 -0.672551 0.089769 -0.117006 ... 0.568876 0.801332 0.788230 0.788687 0.575570 0.644361 0.932171 0.910921 1.000000 0.717040
norm_pos_of_question1_n2_in_question2_std 0.022055 0.022055 0.022055 0.272436 0.762015 0.786488 -0.642131 -0.684095 0.030518 -0.270792 ... -0.048068 0.428496 0.394315 0.636131 0.832626 -0.052051 0.429119 0.404098 0.717040 1.000000

40 rows × 49 columns


In [5]:
# class Count_Ngram_BaseEstimator:
#     def __init__(self, idx=-1, ngram=1, aggr="", threshold=config.STR_MATCH_THRESHOLD):
#         self.idx = idx
#         self.ngram = ngram
#         self.threshold = threshold

#     def _get_match_count(self, obs, target):
#         cnt = 0
#         if (len(obs) != 0) and (len(target) != 0):
#             if self.idx == -1:
#                 for obs_word in obs:
#                     for word in target:
#                         if dist_utils._is_str_match(word, obs_word, self.threshold):
#                             cnt += 1
#             else:
#                 for word in target:
#                     if dist_utils._is_str_match(word, obs[self.idx], self.threshold):
#                         cnt += 1
#         return cnt

#     def count_close_ngram(self, obs, target, token_pattern=" "):
#         obs_tokens = nlp_utils._tokenize(obs, token_pattern)
#         target_tokens = nlp_utils._tokenize(target, token_pattern)
#         obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
#         target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
#         return self._get_match_count(obs_ngrams, target_ngrams)
    
    
def count_close_ngram(obs, target, idx=-1, ratio='count', ngram=123, aggr="", token_pattern=" ", threshold=config.STR_MATCH_THRESHOLD):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs = ngram_utils._ngrams(obs_tokens, ngram)
    target = ngram_utils._ngrams(target_tokens, ngram)
    cnt = 0
    if (len(obs) != 0) and (len(target) != 0):
        if idx == -1:
            for obs_word in obs:
                for word in target:
                    if dist_utils._is_str_match(word, obs_word, threshold):
                        cnt += 1
        else:
            for word in target:
                if dist_utils._is_str_match(word, obs[idx], threshold):
                    cnt += 1
    if ratio == 'count': 
        return cnt
    else: return np_utils._try_divide(cnt, (len(obs)+len(target))/2.0)
    
# count1 = Count_Ngram_BaseEstimator(threshold=0.65)
# train['intersect_count'] = train[['question1','question2']].apply(lambda x: 
#                     count1.count_close_ngram(x[0],x[1]), axis=1)
NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
    for ratio in RATIO:
        train['intersect_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x: 
                    count_close_ngram(x[0],x[1],threshold=0.7,ngram=ngram,ratio=ratio), axis=1)
train.corr()[train.corr().index.str.contains('intersect_close')]


Out[5]:
id qid1 qid2 is_duplicate jaccard_n123 dicedistence_n123 compression_dist edit_dist edit_dist_agg_n2 pos_of_question2_n1_in_question1_min ... norm_pos_of_question1_n2_in_question2_mean norm_pos_of_question1_n2_in_question2_median norm_pos_of_question1_n2_in_question2_max norm_pos_of_question1_n2_in_question2_std intersect_close_count_n1 intersect_close_ratio_n1 intersect_close_count_n2 intersect_close_ratio_n2 intersect_close_count_n3 intersect_close_ratio_n3
intersect_close_count_n1 -0.013122 -0.013122 -0.013122 0.131526 0.561268 0.534027 -0.388786 -0.427734 0.037220 -0.112805 ... 0.268151 0.261078 0.419248 0.465860 1.000000 0.756330 0.909778 0.530302 0.850685 0.457997
intersect_close_ratio_n1 0.027262 0.027262 0.027262 0.313315 0.840758 0.865684 -0.816310 -0.806912 -0.088754 -0.249321 ... 0.489601 0.486149 0.659656 0.681442 0.756330 1.000000 0.784211 0.859992 0.763991 0.780021
intersect_close_count_n2 -0.027767 -0.027767 -0.027767 0.179281 0.728372 0.695477 -0.583789 -0.598518 0.014591 -0.169004 ... 0.339158 0.331018 0.523922 0.588936 0.909778 0.784211 1.000000 0.745167 0.954822 0.654655
intersect_close_ratio_n2 0.003594 0.003594 0.003594 0.309140 0.891032 0.908138 -0.893406 -0.884361 -0.094620 -0.259414 ... 0.493090 0.488313 0.674455 0.718270 0.530302 0.859992 0.745167 1.000000 0.732753 0.908437
intersect_close_count_n3 -0.024523 -0.024523 -0.024523 0.172974 0.765048 0.730787 -0.609192 -0.634673 0.027904 -0.182026 ... 0.369754 0.363618 0.557435 0.611850 0.850685 0.763991 0.954822 0.732753 1.000000 0.754092
intersect_close_ratio_n3 -0.024820 -0.024820 -0.024820 0.269428 0.878670 0.891131 -0.842762 -0.867130 -0.049912 -0.252568 ... 0.512276 0.511296 0.685461 0.710932 0.457997 0.780021 0.654655 0.908437 0.754092 1.000000

6 rows × 55 columns


In [6]:
def cooccurrence_ngram(obs, target, ngram=1, threshold=0.8, ratio='ratio', token_pattern=" "):
    """
        Get the count cooccurrence_ngram in obs and target
    """
    obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
    target_tokens = nlp_utils._tokenize(str(target), token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)

    s = 0.
    for w1 in obs_ngrams:
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
    if ratio == 'count': 
        return s
    else: return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams))
    

NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
    for ratio in RATIO:
        train['cooccurrence_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x: 
                    cooccurrence_ngram(x[0],x[1],threshold=0.7,ngram=ngram,ratio=ratio), axis=1)
# train.corr().ix[-1*len(NGRAMS)*len(RATIO):]
train.corr()[train.corr().index.str.contains('cooccurrence')]


Out[6]:
id qid1 qid2 is_duplicate jaccard_n123 dicedistence_n123 compression_dist edit_dist edit_dist_agg_n2 pos_of_question2_n1_in_question1_min ... intersect_close_count_n2 intersect_close_ratio_n2 intersect_close_count_n3 intersect_close_ratio_n3 cooccurrence_close_count_n1 cooccurrence_close_ratio_n1 cooccurrence_close_count_n2 cooccurrence_close_ratio_n2 cooccurrence_close_count_n3 cooccurrence_close_ratio_n3
cooccurrence_close_count_n1 -0.013122 -0.013122 -0.013122 0.131526 0.561268 0.534027 -0.388786 -0.427734 0.037220 -0.112805 ... 0.909778 0.530302 0.850685 0.457997 1.000000 0.153259 0.909778 0.070105 0.850685 0.064748
cooccurrence_close_ratio_n1 0.023832 0.023832 0.023832 0.328474 0.640463 0.699828 -0.786163 -0.733716 -0.252815 -0.268398 ... 0.255664 0.724433 0.268879 0.656784 0.153259 1.000000 0.255664 0.895333 0.268879 0.791832
cooccurrence_close_count_n2 -0.027767 -0.027767 -0.027767 0.179281 0.728372 0.695477 -0.583789 -0.598518 0.014591 -0.169004 ... 1.000000 0.745167 0.954822 0.654655 0.909778 0.255664 1.000000 0.266881 0.954822 0.232364
cooccurrence_close_ratio_n2 -0.008782 -0.008782 -0.008782 0.302772 0.632048 0.677373 -0.759505 -0.736952 -0.220926 -0.243390 ... 0.266881 0.797931 0.270481 0.722184 0.070105 0.895333 0.266881 1.000000 0.270481 0.912122
cooccurrence_close_count_n3 -0.024523 -0.024523 -0.024523 0.172974 0.765048 0.730787 -0.609192 -0.634673 0.027904 -0.182026 ... 0.954822 0.732753 1.000000 0.754092 0.850685 0.268879 0.954822 0.270481 1.000000 0.310343
cooccurrence_close_ratio_n3 -0.070270 -0.070270 -0.070270 0.266992 0.615008 0.651725 -0.690567 -0.710414 -0.153370 -0.223456 ... 0.232364 0.720548 0.310343 0.801040 0.064748 0.791832 0.232364 0.912122 0.310343 1.000000

6 rows × 61 columns


In [7]:
def LongestMatchSize(obs_corpus, target_corpus):
    return dist_utils._longest_match_size(obs_corpus, target_corpus)

def LongestMatchRatio(obs_corpus, target_corpus):
    return dist_utils._longest_match_ratio(obs_corpus, target_corpus)

train['LongestMatchSize'] = train[['question1','question2']].apply(lambda x: LongestMatchSize(x[0],x[1]), axis=1)
train['LongestMatchRatio'] = train[['question1','question2']].apply(lambda x: LongestMatchRatio(x[0],x[1]), axis=1)
train.corr()[train.corr().index.str.contains('LongestMatch')]


Out[7]:
id qid1 qid2 is_duplicate jaccard_n123 dicedistence_n123 compression_dist edit_dist edit_dist_agg_n2 pos_of_question2_n1_in_question1_min ... intersect_close_count_n3 intersect_close_ratio_n3 cooccurrence_close_count_n1 cooccurrence_close_ratio_n1 cooccurrence_close_count_n2 cooccurrence_close_ratio_n2 cooccurrence_close_count_n3 cooccurrence_close_ratio_n3 LongestMatchSize LongestMatchRatio
LongestMatchSize 0.102081 0.102081 0.102081 0.218323 0.374156 0.405512 -0.492384 -0.485790 -0.150488 0.027599 ... 0.375600 0.408711 0.397615 0.380343 0.412382 0.350781 0.375600 0.319515 1.000000 0.440454
LongestMatchRatio 0.079977 0.079977 0.079977 0.213539 0.172969 0.233214 -0.420301 -0.349987 -0.246562 -0.079360 ... -0.092456 0.267227 -0.221972 0.710290 -0.119670 0.639684 -0.092456 0.530724 0.440454 1.000000

2 rows × 63 columns


In [9]:
'''
QuestionQuality
IsInSpellCheckingList
'''

In [10]:
from collections import defaultdict

def _get_df_dict(target_corpus, ngram=1, token_pattern=" "):
    d = defaultdict(lambda : 1)
    for target in target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
        for w in set(target_ngrams):
            d[w] += 1
    return d

def _get_idf(word, idf_dict, N):
    return np.log((N - idf_dict[word] + 0.5)/(idf_dict[word] + 0.5))

def cooc_tfidf_ngram(obs, target, ngram=1, threshold=0.85, ratio="ratio", token_pattern=" "):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
        if ratio == "count":
            val_list.append(s * _get_idf(w1, idf_dict, doc_num))
        elif ratio == "ratio":
            val_list.append(np_utils._try_divide(s, len(target_ngrams)) * _get_idf(w1, idf_dict, doc_num))
            
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list

doc_num = train.shape[0]

for ngram in [1,2]:
    idf_dict = _get_df_dict(np.concatenate((train['question1'].values , train['question2'].values)), ngram=ngram)
    for ratio in ['count','ratio']:
        for target_name in ['question1','question2']:
            for obs_name in ['question1','question2']:
                if target_name != obs_name:
                    pos = train[['question1','question2']].apply(lambda x: cooc_tfidf_ngram(
                                obs=x[obs_name],target=x[target_name], ngram=ngram,ratio=ratio), axis=1)
#                     train["cooc_tfidf_%s_n%s_%s_min" % (obs_name, ngram, ratio)] = map(np.min, pos)
                    train["cooc_tfidf_%s_n%s_%s_mean" % (obs_name, ngram, ratio)] = map(np.mean, pos)
                    train["cooc_tfidf_%s_n%s_%s_median" % (obs_name, ngram, ratio)] = map(np.median, pos)
                    train["cooc_tfidf_%s_n%s_%s_max" % (obs_name, ngram, ratio)] = map(np.max, pos)
                    train["cooc_tfidf_%s_n%s_%s_std" % (obs_name, ngram, ratio)] = map(np.std, pos)
train.corr()[train.corr().index.str.contains('cooc_tfidf')]


Out[10]:
id qid1 qid2 is_duplicate jaccard_n123 dicedistence_n123 compression_dist edit_dist edit_dist_agg_n2 pos_of_question2_n1_in_question1_min ... cooc_tfidf_question1_n2_count_max cooc_tfidf_question1_n2_count_std cooc_tfidf_question2_n2_ratio_mean cooc_tfidf_question2_n2_ratio_median cooc_tfidf_question2_n2_ratio_max cooc_tfidf_question2_n2_ratio_std cooc_tfidf_question1_n2_ratio_mean cooc_tfidf_question1_n2_ratio_median cooc_tfidf_question1_n2_ratio_max cooc_tfidf_question1_n2_ratio_std
cooc_tfidf_question2_n1_count_mean 0.003989 0.003989 0.003989 0.315111 0.635442 0.665384 -0.702794 -0.625465 -0.241676 -0.182138 ... 0.571941 0.570117 0.587249 0.519220 0.432240 0.436726 0.586654 0.440112 0.583396 0.582061
cooc_tfidf_question2_n1_count_median -0.090565 -0.090565 -0.090565 0.192691 0.619518 0.610902 -0.563681 -0.537952 -0.225618 -0.182404 ... 0.322563 0.323437 0.548403 0.598038 0.271173 0.273418 0.548101 0.518649 0.399391 0.410360
cooc_tfidf_question2_n1_count_max 0.086535 0.086535 0.086535 0.155608 0.341330 0.371928 -0.363684 -0.305797 -0.135749 0.103649 ... 0.547275 0.462243 0.224502 0.140407 0.288522 0.245106 0.224415 0.117888 0.346485 0.304483
cooc_tfidf_question2_n1_count_std 0.090671 0.090671 0.090671 0.257174 0.466729 0.507125 -0.546591 -0.473529 -0.262640 0.004090 ... 0.605320 0.547798 0.377138 0.238530 0.408312 0.385299 0.376335 0.228525 0.498080 0.459331
cooc_tfidf_question1_n1_count_mean 0.044628 0.044628 0.044628 0.323651 0.669951 0.699076 -0.703957 -0.640552 -0.017445 -0.074931 ... 0.566548 0.591664 0.588435 0.458586 0.582090 0.572508 0.588429 0.514155 0.424021 0.456587
cooc_tfidf_question1_n1_count_median -0.095041 -0.095041 -0.095041 0.174151 0.735778 0.718386 -0.636585 -0.647597 -0.010198 -0.208854 ... 0.317519 0.318884 0.586724 0.569745 0.423187 0.416085 0.587563 0.641572 0.255790 0.273033
cooc_tfidf_question1_n1_count_max 0.056639 0.056639 0.056639 0.222180 0.335312 0.359008 -0.351301 -0.315580 -0.049986 0.089702 ... 0.545610 0.449786 0.182511 0.114927 0.248850 0.215359 0.182170 0.121081 0.231717 0.205097
cooc_tfidf_question1_n1_count_std 0.077963 0.077963 0.077963 0.272164 0.478915 0.516322 -0.544003 -0.494883 -0.056859 0.086213 ... 0.623527 0.572896 0.334842 0.216113 0.389645 0.351953 0.334206 0.226593 0.359995 0.343779
cooc_tfidf_question2_n1_ratio_mean 0.019067 0.019067 0.019067 0.316319 0.499107 0.547125 -0.668777 -0.576776 -0.280002 -0.159105 ... 0.431774 0.501173 0.757793 0.548919 0.741336 0.735641 0.757621 0.579479 0.698421 0.726802
cooc_tfidf_question2_n1_ratio_median -0.038260 -0.038260 -0.038260 0.215351 0.491605 0.501658 -0.502786 -0.467245 -0.278656 -0.159788 ... 0.261734 0.298588 0.640449 0.594050 0.466169 0.467370 0.640353 0.602926 0.481934 0.511012
cooc_tfidf_question2_n1_ratio_max 0.082468 0.082468 0.082468 0.216302 0.278044 0.337780 -0.466871 -0.366985 -0.159656 0.095446 ... 0.370348 0.420107 0.539617 0.270808 0.725960 0.677235 0.539754 0.350971 0.567464 0.569405
cooc_tfidf_question2_n1_ratio_std 0.076636 0.076636 0.076636 0.272192 0.347231 0.409941 -0.570717 -0.466268 -0.254452 -0.024153 ... 0.400659 0.461787 0.637732 0.336993 0.791822 0.756462 0.637516 0.432838 0.655945 0.658664
cooc_tfidf_question1_n1_ratio_mean 0.017358 0.017358 0.017358 0.319090 0.501239 0.549621 -0.669997 -0.578030 -0.280122 -0.159114 ... 0.432541 0.502429 0.759647 0.551248 0.742969 0.737258 0.759471 0.582124 0.697751 0.726785
cooc_tfidf_question1_n1_ratio_median -0.088611 -0.088611 -0.088611 0.223085 0.593280 0.599659 -0.568388 -0.577833 -0.091789 -0.188903 ... 0.274046 0.310232 0.669991 0.596344 0.548571 0.529878 0.670773 0.697331 0.366295 0.393342
cooc_tfidf_question1_n1_ratio_max 0.051947 0.051947 0.051947 0.254895 0.286598 0.342347 -0.534488 -0.419630 -0.460916 -0.094526 ... 0.411195 0.420949 0.565921 0.364146 0.568890 0.552789 0.565256 0.312229 0.736899 0.703848
cooc_tfidf_question1_n1_ratio_std 0.027423 0.027423 0.027423 0.249456 0.341041 0.398961 -0.604110 -0.497149 -0.427288 -0.103805 ... 0.417358 0.446905 0.628896 0.426709 0.592366 0.566706 0.628089 0.363921 0.768202 0.736751
cooc_tfidf_question2_n2_count_mean -0.024889 -0.024889 -0.024889 0.229885 0.872660 0.878378 -0.815145 -0.802608 -0.130825 -0.248978 ... 0.670320 0.612324 0.798398 0.734069 0.501628 0.442700 0.797796 0.676297 0.630208 0.569504
cooc_tfidf_question2_n2_count_median -0.105260 -0.105260 -0.105260 0.138070 0.817822 0.779336 -0.647456 -0.672878 -0.135478 -0.188196 ... 0.350449 0.265345 0.664877 0.864673 0.242389 0.171566 0.665026 0.720828 0.372116 0.303772
cooc_tfidf_question2_n2_count_max 0.166799 0.166799 0.166799 0.221625 0.517533 0.577462 -0.573341 -0.552757 0.008579 -0.153474 ... 0.999666 0.923506 0.523277 0.290233 0.696250 0.644719 0.522527 0.284394 0.697752 0.655194
cooc_tfidf_question2_n2_count_std 0.191085 0.191085 0.191085 0.270591 0.476097 0.557691 -0.588965 -0.552504 -0.014438 -0.168895 ... 0.912893 0.960611 0.489599 0.220409 0.685800 0.756998 0.488523 0.211422 0.701300 0.744892
cooc_tfidf_question1_n2_count_mean -0.008312 -0.008312 -0.008312 0.256380 0.907179 0.911966 -0.825730 -0.825468 0.021102 -0.230536 ... 0.664029 0.617362 0.789959 0.678473 0.597334 0.542016 0.790179 0.736503 0.493222 0.449871
cooc_tfidf_question1_n2_count_median -0.114250 -0.114250 -0.114250 0.146932 0.857727 0.815842 -0.679334 -0.704490 0.013469 -0.181796 ... 0.337674 0.255166 0.680525 0.749994 0.347400 0.288649 0.681317 0.860870 0.249197 0.187456
cooc_tfidf_question1_n2_count_max 0.164318 0.164318 0.164318 0.221625 0.518104 0.578260 -0.573545 -0.552870 0.010680 -0.153098 ... 1.000000 0.924314 0.523290 0.290233 0.696264 0.644664 0.522572 0.284394 0.696716 0.654793
cooc_tfidf_question1_n2_count_std 0.181244 0.181244 0.181244 0.265361 0.464553 0.549429 -0.585971 -0.551962 0.003356 -0.148659 ... 0.924314 1.000000 0.503748 0.217442 0.732423 0.764353 0.503373 0.224613 0.693166 0.757246
cooc_tfidf_question2_n2_ratio_mean -0.030498 -0.030498 -0.030498 0.266197 0.683283 0.715731 -0.734070 -0.718687 -0.195003 -0.221924 ... 0.523290 0.503748 1.000000 0.837211 0.753278 0.639606 0.999854 0.855130 0.753010 0.663787
cooc_tfidf_question2_n2_ratio_median -0.099573 -0.099573 -0.099573 0.178463 0.683422 0.670539 -0.590686 -0.623215 -0.175944 -0.164457 ... 0.290233 0.217442 0.837211 1.000000 0.385004 0.262381 0.837486 0.882194 0.471987 0.359693
cooc_tfidf_question2_n2_ratio_max 0.119504 0.119504 0.119504 0.261222 0.412035 0.484714 -0.582165 -0.529609 -0.092885 -0.161629 ... 0.696264 0.732423 0.753278 0.385004 1.000000 0.938293 0.753116 0.487475 0.801990 0.793872
cooc_tfidf_question2_n2_ratio_std 0.157089 0.157089 0.157089 0.269801 0.376592 0.457606 -0.566856 -0.506089 -0.094504 -0.168780 ... 0.644664 0.764353 0.639606 0.262381 0.938293 1.000000 0.639237 0.360561 0.760209 0.841366
cooc_tfidf_question1_n2_ratio_mean -0.032089 -0.032089 -0.032089 0.264681 0.683501 0.715763 -0.733278 -0.718102 -0.195255 -0.221344 ... 0.522572 0.503373 0.999854 0.837486 0.753116 0.639237 1.000000 0.855901 0.752259 0.663353
cooc_tfidf_question1_n2_ratio_median -0.112492 -0.112492 -0.112492 0.204001 0.694708 0.682218 -0.604370 -0.633825 -0.102171 -0.167485 ... 0.284394 0.224613 0.855130 0.882194 0.487475 0.360561 0.855901 1.000000 0.400296 0.299679
cooc_tfidf_question1_n2_ratio_max 0.096778 0.096778 0.096778 0.193156 0.376741 0.444434 -0.568652 -0.498053 -0.289797 -0.191793 ... 0.696716 0.693166 0.753010 0.471987 0.801990 0.760209 0.752259 0.400296 1.000000 0.945353
cooc_tfidf_question1_n2_ratio_std 0.125250 0.125250 0.125250 0.211608 0.344111 0.422697 -0.564126 -0.488884 -0.272444 -0.183110 ... 0.654793 0.757246 0.663787 0.359693 0.793872 0.841366 0.663353 0.299679 0.945353 1.000000

32 rows × 95 columns


In [13]:
BM25_K1=config.BM25_K1
BM25_B=config.BM25_B

def _get_avg_ngram_doc_len(target_corpus, ngram=1, token_pattern=" "):
    lst = []
    for target in target_corpus:
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
        lst.append(len(target_ngrams))
    return np.mean(lst)

def bm25(obs, target, ngram=1, threshold=0.85, ratio="ratio", token_pattern=" ", b=None, k1=None, doc_len=None, idf_dict=idf_dict):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
    K = k1 * (1 - b + b * np_utils._try_divide(len(target_ngrams), doc_len))
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, threshold):
                s += 1.
        bm25 = s * _get_idf(w1, idf_dict, doc_num) * np_utils._try_divide(1 + k1, s + K)
        val_list.append(bm25)
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list

for ngram in [1,2]:
    idf_dict = _get_df_dict(np.concatenate((train['question1'].values , train['question2'].values)), ngram=ngram)
#     for ratio in ['count','ratio']:
    for target_name in ['question1','question2']:
        avg_target_len = _get_avg_ngram_doc_len(train[target_name].values, ngram=ngram)
        for obs_name in ['question1','question2']:
            if target_name != obs_name:
                pos = train[['question1','question2']].apply(lambda x: bm25(obs=x[obs_name],target=x[target_name], 
                                ngram=ngram,threshold=0.85,b=BM25_B, k1=BM25_K1, doc_len=avg_target_len), axis=1)
#                 train["bm25_%s_n%s_min" % (obs_name, ngram)] = map(np.min, pos)
                train["bm25_%s_n%s_mean" % (obs_name, ngram)] = map(np.mean, pos)
                train["bm25_%s_n%s_median" % (obs_name, ngram)] = map(np.median, pos)
                train["bm25_%s_n%s_max" % (obs_name, ngram)] = map(np.max, pos)
                train["bm25_%s_n%s_std" % (obs_name, ngram)] = map(np.std, pos)
train.corr()[train.corr().index.str.contains('bm25_')].sort_values(by='is_duplicate',ascending=0)


Out[13]:
id qid1 qid2 is_duplicate jaccard_n123 dicedistence_n123 compression_dist edit_dist edit_dist_agg_n2 pos_of_question2_n1_in_question1_min ... bm25_question1_n1_max bm25_question1_n1_std bm25_question2_n2_mean bm25_question2_n2_median bm25_question2_n2_max bm25_question2_n2_std bm25_question1_n2_mean bm25_question1_n2_median bm25_question1_n2_max bm25_question1_n2_std
bm25_question1_n1_mean 0.036480 0.036480 0.036480 0.352203 0.790041 0.849268 -0.865517 -0.832492 -0.094787 -0.255752 ... 0.571958 0.268813 0.786613 0.584403 0.715651 0.694923 0.844578 0.668199 0.670111 0.673950
bm25_question2_n1_mean 0.022915 0.022915 0.022915 0.340193 0.792846 0.850719 -0.887646 -0.850095 -0.230319 -0.302150 ... 0.636460 0.408147 0.846947 0.661692 0.689764 0.675161 0.826091 0.633913 0.721958 0.703803
bm25_question2_n1_median 0.035406 0.035406 0.035406 0.299429 0.693846 0.756189 -0.770685 -0.771992 -0.165842 -0.365725 ... 0.445785 0.234845 0.745906 0.543764 0.583438 0.623651 0.743594 0.559853 0.623996 0.623892
bm25_question1_n1_median 0.007516 0.007516 0.007516 0.296185 0.667910 0.724268 -0.752738 -0.749062 -0.009808 -0.304165 ... 0.355916 0.170710 0.657273 0.446939 0.599223 0.600207 0.721546 0.546440 0.532266 0.570665
bm25_question1_n2_mean -0.013548 -0.013548 -0.013548 0.274326 0.851427 0.875016 -0.835873 -0.827952 -0.062225 -0.239055 ... 0.395922 0.100099 0.960971 0.784017 0.727546 0.665076 1.000000 0.854926 0.699136 0.646522
bm25_question2_n2_std 0.187795 0.187795 0.187795 0.270694 0.451326 0.541306 -0.626486 -0.575094 -0.042077 -0.167574 ... 0.455790 0.360794 0.632415 0.269978 0.925471 1.000000 0.665076 0.315418 0.900906 0.959200
bm25_question1_n2_std 0.166054 0.166054 0.166054 0.268527 0.435053 0.527615 -0.625563 -0.569202 -0.122542 -0.174640 ... 0.474244 0.441958 0.672491 0.314522 0.917058 0.959200 0.646522 0.277646 0.936658 1.000000
bm25_question1_n1_max 0.103296 0.103296 0.103296 0.264274 0.365579 0.420987 -0.508355 -0.451527 -0.300477 0.039928 ... 1.000000 0.715579 0.420601 0.274850 0.472655 0.455790 0.395922 0.240243 0.515742 0.474244
bm25_question2_n2_mean -0.022076 -0.022076 -0.022076 0.246086 0.825447 0.848674 -0.828015 -0.812105 -0.157002 -0.245000 ... 0.420601 0.215320 1.000000 0.838139 0.700719 0.632415 0.960971 0.794369 0.736143 0.672491
bm25_question2_n2_max 0.162310 0.162310 0.162310 0.244414 0.500077 0.577379 -0.640644 -0.599235 -0.025180 -0.156416 ... 0.472655 0.361338 0.700719 0.354677 1.000000 0.925471 0.727546 0.391470 0.957137 0.917058
bm25_question2_n1_std 0.183857 0.183857 0.183857 0.243521 0.058436 0.144169 -0.283518 -0.205754 -0.084184 0.119714 ... 0.626138 0.599859 0.112170 -0.130609 0.460148 0.488118 0.186064 -0.043109 0.396099 0.422137
bm25_question1_n2_max 0.149727 0.149727 0.149727 0.240544 0.489941 0.567007 -0.637671 -0.588049 -0.129982 -0.184990 ... 0.515742 0.450158 0.736143 0.400822 0.957137 0.900906 0.699136 0.353776 1.000000 0.936658
bm25_question1_n1_std 0.155322 0.155322 0.155322 0.216456 0.042034 0.119070 -0.315549 -0.249060 -0.329025 0.088518 ... 0.715579 1.000000 0.215320 0.037981 0.361338 0.360794 0.100099 -0.132954 0.450158 0.441958
bm25_question2_n1_max 0.113692 0.113692 0.113692 0.187409 0.326878 0.383161 -0.440080 -0.393743 -0.155637 0.221699 ... 0.801848 0.634113 0.362720 0.207868 0.484026 0.442201 0.378468 0.226581 0.455086 0.448029
bm25_question1_n2_median -0.112815 -0.112815 -0.112815 0.169130 0.811521 0.783897 -0.668376 -0.696185 -0.024090 -0.180322 ... 0.240243 -0.132954 0.794369 0.842045 0.391470 0.315418 0.854926 1.000000 0.353776 0.277646
bm25_question2_n2_median -0.100979 -0.100979 -0.100979 0.152267 0.782296 0.756474 -0.644594 -0.673792 -0.152842 -0.182378 ... 0.274850 0.037981 0.838139 1.000000 0.354677 0.269978 0.784017 0.842045 0.400822 0.314522

16 rows × 111 columns


In [57]:
# ------------------------ Vector Space Features -------------------------------

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils, pkl_utils
from utils import logging_utils, time_utils

class VectorSpace:
    ## word based
    def _init_word_bow(self, ngram, vocabulary=None):
        bow = CountVectorizer(min_df=3,
                                max_df=0.75,
                                max_features=None,
                                # norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                vocabulary=vocabulary)
        return bow

    ## word based
    def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=3,
                                max_df=0.75,                                
                                max_features=None,
                                norm="l2",
                                strip_accents="unicode",
                                analyzer="word",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram),
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1,
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf

    ## char based
    def _init_char_tfidf(self, include_digit=False):
        chars = list(string.ascii_lowercase)
        if include_digit:
            chars += list(string.digits)        
        vocabulary = dict(zip(chars, range(len(chars))))
        tfidf = TfidfVectorizer(strip_accents="unicode",
                                analyzer="char",
                                norm=None,
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, 1), 
                                use_idf=0,
                                vocabulary=vocabulary)
        return tfidf

    ## char based ngram
    def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
        tfidf = TfidfVectorizer(min_df=3,
                                max_df=0.75, 
                                max_features=None, 
                                norm="l2",
                                strip_accents="unicode", 
                                analyzer="char",
                                token_pattern=r"\w{1,}",
                                ngram_range=(1, ngram), 
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1, 
                                # stop_words="english",
                                vocabulary=vocabulary)
        return tfidf

# ------------------------ LSA -------------------------------
class LSA_Ngram(VectorSpace):
    def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.ngram = ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter
        self.corpus = corpus
        self.target_corpus = target_corpus
        
    def word_transform(self):
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(self.corpus)
        X = tfidf.transform(self.obs_corpus)
#         word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)
    
    def char_transform(self):
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        tfidf.fit(self.corpus)
        X = tfidf.transform(self.obs_corpus)
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)

    def pair_transform(self):
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        tfidf.fit(self.corpus)
        X_obs = tfidf.transform(self.obs_corpus)
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        tfidf.fit(self.corpus)
        X_target = tfidf.transform(self.target_corpus)
        X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        X_svd = svd.fit_transform(X_tfidf)
        return X_svd
    
all_corpus = []
feats_corpus = ['question1','question2']
for f in feats_corpus:
    train[f] = train[f].astype(str)
    all_corpus += train[f].values.tolist()

for f in ['question1','question2']:
    lsa_word = LSA_Ngram(all_corpus,train[f], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_word.word_transform()
    break
    
for f in ['question1','question2']:
    lsa_char = LSA_Ngram(all_corpus,train[f], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_char.char_transform()
    break

lsa_pair = LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_pair.pair_transform()


[[ 0.48897685 -0.12364507 -0.12529131 ...,  0.19337968 -0.04179793
  -0.21828433]
 [ 0.76094565 -0.13066384 -0.24926977 ..., -0.19513835 -0.00652884
   0.01361816]
 [ 0.20219964  0.32453762 -0.08998923 ..., -0.13682609  0.11677467
   0.02283278]
 ..., 
 [ 0.13093228  0.32318256 -0.10861661 ...,  0.01381884 -0.01144202
  -0.05396418]
 [ 0.28598382  0.02573773  0.37589184 ..., -0.13925678  0.02044775
  -0.11142202]
 [ 0.36398638 -0.09518961 -0.13558644 ...,  0.04975152 -0.01402062
   0.02189748]]
[[ 0.30628278  0.13072922 -0.05537384 ...,  0.02647884 -0.07637475
   0.00863192]
 [ 0.28948935  0.18839956 -0.04557625 ..., -0.03926573 -0.08395679
  -0.00401218]
 [ 0.30066084 -0.16248022 -0.1117007  ..., -0.06727614 -0.0669044
   0.09360335]
 ..., 
 [ 0.22409702 -0.17455413 -0.12172682 ...,  0.12891339 -0.05281741
   0.05558532]
 [ 0.30431468  0.05538239  0.10953847 ..., -0.15892938 -0.1105081
   0.0027384 ]
 [ 0.28330364  0.18447348 -0.046028   ...,  0.19158112 -0.03420802
   0.02571635]]
[[ 0.66635843  0.32118802 -0.14746609 ...,  0.14943076 -0.03907724
   0.10606122]
 [ 0.6719319   0.19891899 -0.15506064 ..., -0.19943743  0.02545268
   0.10990824]
 [ 0.29317479 -0.3724526  -0.20710379 ...,  0.00764295  0.23170377
   0.14859051]
 ..., 
 [ 0.28281648 -0.45301731 -0.25510558 ...,  0.02362853  0.0803303
   0.11567539]
 [ 0.37670925  0.05930413  0.35040877 ..., -0.12451715  0.24743331
  -0.05622315]
 [ 0.48586083  0.22246215 -0.15996259 ...,  0.15267309 -0.20060042
  -0.0090168 ]]

In [79]:
class TSNE_LSA_Ngram(LSA_Ngram):
    def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
        LSA_Ngram.__init__(self, corpus, obs_corpus, target_corpus, ngram, svd_dim, svd_n_iter)

    def tsne_word_transform(self):
        X_svd = self.word_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne
    
    def tsne_char_transform(self):
        X_svd = self.char_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne
    
    def tsne_pair_transform(self):
        X_svd = self.pair_transform()
        X_scaled = StandardScaler().fit_transform(X_svd)
        X_tsne = TSNE().fit_transform(X_scaled)
        return X_tsne


for f in ['question1','question2']:
    lsa_word = TSNE_LSA_Ngram(all_corpus,train[f], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_word.tsne_word_transform()
    break
    
for f in ['question1','question2']:
    lsa_char = TSNE_LSA_Ngram(all_corpus,train[f], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
    print lsa_char.tsne_char_transform()
    break

lsa_pair = TSNE_LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_pair.tsne_pair_transform()


[[ -5.02173026e+01   9.81634541e+01]
 [ -5.50605519e+01   5.28976124e+01]
 [ -1.45345964e+02   1.26105509e+01]
 [ -9.71501000e+01  -1.51336779e+01]
 [  9.53674103e+01   5.00098621e+01]
 [  6.59470030e+01   4.15461335e+01]
 [ -5.28130951e+01  -6.40893766e+01]
 [ -7.26328449e+01  -9.78179242e+00]
 [ -3.06216697e+00   2.34854209e+01]
 [ -6.36726993e+01   2.97892017e+01]
 [  2.94998169e+01  -4.11528362e+01]
 [  9.72237633e+00  -2.31139806e+01]
 [ -4.29106814e+01   5.45876588e+00]
 [ -3.27922544e+00   1.20207713e+02]
 [ -1.36735814e+01   5.63581756e+00]
 [  2.06460624e+02   2.19009850e+02]
 [  1.02913555e+02   2.21966611e+01]
 [  4.45491606e+01  -4.41626501e+01]
 [ -7.17631231e+01   7.41048310e+01]
 [  7.48757185e+01   6.20560869e+01]
 [  4.01956832e+01  -2.81648141e+01]
 [  2.04307677e+01   8.57176030e+00]
 [  1.04242928e+01  -6.43013667e+01]
 [  8.84815880e+01  -1.76258105e+02]
 [ -1.31826118e+02   1.16035510e+02]
 [ -9.25258676e+01   3.67804484e+01]
 [ -4.66930999e+01   4.11563876e+01]
 [  6.04993745e+01  -1.43536914e+01]
 [  2.98701443e+01   1.49373892e+02]
 [ -1.24481931e+01  -7.26920433e+01]
 [ -2.28449194e+00  -6.02152329e+01]
 [  1.35216004e+02   2.82231421e+01]
 [ -1.20661287e+00  -1.09153721e+02]
 [ -5.57886437e+01  -5.91422863e+00]
 [  3.52824080e+00   3.85275306e+01]
 [  1.91599688e+02   2.00967792e+01]
 [ -1.18445349e+01  -9.48513238e+01]
 [  2.70514542e+00  -3.38170695e+01]
 [ -3.80141237e+02   1.34547759e+02]
 [ -1.03647170e+02   2.22067201e+01]
 [  1.65312165e+02  -2.03212500e+01]
 [ -9.62888689e+01  -3.25023466e+01]
 [ -2.15642911e+01   1.57718217e+01]
 [ -3.90135236e+01  -7.67351676e+00]
 [  7.89635279e+01   2.63709969e+01]
 [ -5.43380419e+01  -4.66201668e+01]
 [  6.59118136e+01  -5.17889024e+01]
 [  5.11045012e+01  -8.96572214e+01]
 [  5.31383614e+00   2.20458294e+02]
 [ -3.63370375e+01  -6.15556758e+01]
 [ -1.10964357e+02   3.76685368e+01]
 [  5.83323072e+01   1.24842597e+01]
 [  8.82167242e+01   1.19664458e+01]
 [ -4.53332374e+01  -2.19026698e+01]
 [ -9.92081023e+01   7.62913017e+01]
 [  3.88218787e+01   6.62267065e+01]
 [  1.37794239e+01   2.06780072e+01]
 [ -2.45414221e+01  -6.94565483e+01]
 [ -1.63139315e+02   1.26069551e+01]
 [ -2.62519428e+02  -4.23707160e+01]
 [ -3.11719726e+01  -9.64231543e+01]
 [  2.93597433e+01  -7.41586500e+00]
 [  6.33703570e+01  -1.73044080e+02]
 [  2.64682321e+01  -9.49105652e+01]
 [ -8.51344684e+01  -2.52580345e+01]
 [ -1.08058885e+02   2.86373015e+00]
 [ -1.62580551e+01  -7.30863768e+00]
 [  8.32859309e+01   7.49291609e+01]
 [ -3.74946130e+01  -1.19927637e+02]
 [  1.87675494e+01   3.60936490e+01]
 [  3.93159269e+01  -1.04246492e+02]
 [ -3.30590967e+01   2.11699127e+02]
 [ -8.51994549e+01  -1.16000051e+01]
 [ -1.09019817e+01   1.36753907e+02]
 [ -6.67883321e+01   1.66546406e+01]
 [ -3.20585716e+01   3.04906404e+01]
 [  7.30511461e+01   9.61216227e+01]
 [  1.09491728e+02  -1.70627920e+01]
 [ -1.09600212e+02  -3.41021638e+01]
 [ -3.23338087e+01   5.87967285e+01]
 [ -1.05075050e+02  -5.30245552e+01]
 [  3.11660612e+01   3.96728443e+01]
 [  1.24041397e+02  -3.27005285e+01]
 [  1.47201595e+01  -9.14561533e+01]
 [ -7.71756896e+01   9.63832905e+01]
 [ -1.06698291e+02  -2.14708317e+01]
 [ -3.00914969e+01   5.23179458e+00]
 [  4.29910209e+01   9.71468916e+01]
 [ -1.56396364e+02  -1.48985947e+02]
 [ -1.65964409e-01  -1.48332035e+01]
 [  8.88771194e+01  -3.68674149e+01]
 [  1.03538951e+02  -5.07133776e+01]
 [ -7.55420840e+01   6.30360367e+00]
 [ -5.45670455e+01  -8.09672521e+01]
 [ -1.83699933e+01   6.19523446e+01]
 [  6.70273706e+01  -4.21260285e+00]
 [  4.50776123e+01  -5.77407945e+01]
 [  5.24288480e+01  -3.21150798e+01]
 [ -1.44698285e+00   1.09794338e+01]
 [  6.25150400e+01  -3.44076501e+01]
 [ -3.35226079e+01   1.98323670e+01]
 [  1.23084703e+02  -5.41254573e+01]
 [ -7.72035301e+01   4.09033927e+01]
 [ -7.15570856e+01  -1.22970074e+02]
 [ -8.63250019e+01  -3.99773043e+01]
 [  7.44030101e+01   1.01926178e+01]
 [  6.74740328e+01  -9.23177589e+01]
 [ -3.72747664e+01   1.13630432e+02]
 [ -4.72452124e+01   2.33293189e+01]
 [ -8.93880381e+01   1.31573247e+01]
 [ -7.40634345e+01  -3.47365636e+01]
 [ -7.24929479e+01   5.15690386e+01]
 [ -1.10870185e+01  -4.38069786e+01]
 [ -2.21090291e+01  -8.31403999e+01]
 [  7.85066341e+01   3.97713528e+01]
 [  1.19893150e+02  -4.09001442e+02]
 [  1.26370130e+01   1.37887560e+02]
 [ -1.01686466e+01   7.67320611e+01]
 [  2.35446901e+01  -2.92277525e+01]
 [ -1.28160749e+02  -9.55692518e+01]
 [  6.58948670e+01   2.45196686e+01]
 [  1.09454484e+01   6.96711640e+01]
 [  4.54987982e+00   8.94737966e+01]
 [  2.16005118e+01  -7.32562132e+01]
 [  1.11962276e+02   7.09275966e+01]
 [  3.23780882e+01   4.88914852e+00]
 [ -5.47667886e+01   6.60210033e+01]
 [  2.85367708e+01  -1.69935076e+01]
 [  9.98083845e+01  -1.24375883e+00]
 [ -2.87954334e+01  -2.83996099e+01]
 [  3.64298186e+01  -7.69652550e+01]
 [  8.50367629e+01   1.26387786e+02]
 [ -1.96294047e+01  -5.40762409e+01]
 [  2.45093975e+01   8.54137091e+01]
 [ -7.66881576e+01  -8.31967873e+01]
 [ -7.93522924e+01   2.61970367e+01]
 [  5.16671109e+01   2.31402381e+01]
 [  8.90776228e+01  -1.96929552e+01]
 [ -7.51269732e+01  -6.33156175e+01]
 [ -3.60646209e+01   4.75194912e+01]
 [  1.36879652e+02  -1.40288637e+01]
 [  4.83186679e+01  -7.31590710e+01]
 [ -3.93781088e+01  -8.38940387e+01]
 [  5.43955325e+01   3.35604531e+01]
 [  1.74472462e+02   2.12768627e+02]
 [  5.86136516e+01   1.29034558e+00]
 [ -5.59443619e+01  -1.63460934e+02]
 [  8.15711881e+01  -3.01187153e+00]
 [ -9.88097605e+00   9.22995667e+01]
 [  6.07111998e+01   1.87887025e+02]
 [  4.62852676e+01  -8.27167661e+00]
 [ -7.29538282e+01  -2.23418053e+01]
 [  1.59835067e+01  -2.98504520e+00]
 [  3.81921814e+01   1.92031189e+01]
 [ -6.69005571e+01  -4.98517658e+01]
 [  5.26727501e+01   5.25431164e+01]
 [  6.79917387e+01  -6.84061659e+01]
 [  2.10578410e+01  -5.59705717e+01]
 [ -1.15932606e+02   2.03903532e+02]
 [  7.39843480e+01  -4.02046482e+01]
 [ -4.43617369e+01   7.53630233e+01]
 [ -6.22264267e+01  -7.04422049e+01]
 [  1.38485548e+01   1.05809743e+02]
 [ -3.04467164e+01  -5.27198982e+01]
 [  1.60495414e+02  -2.91885099e+02]
 [ -1.92605544e+01   4.52719367e+01]
 [  2.71275820e+00  -7.85365592e+01]
 [ -1.96607967e+01   1.06373860e+02]
 [ -6.16520518e+01  -1.09373513e+02]
 [ -1.30669474e+00  -4.75847202e+01]
 [ -8.70032990e+01   6.34162573e+01]
 [  9.63613032e+01   3.72054755e+01]
 [ -1.44476343e+01   3.12017979e+01]
 [ -6.63139387e+01   1.60712726e+02]
 [ -1.91232850e+01  -3.57361166e+01]
 [ -2.62928567e+01   7.60913326e+01]
 [  2.67973165e+01   7.31784376e+01]
 [ -2.31825985e+00   6.30008748e+01]
 [  6.72014857e+01  -1.19507186e+02]
 [ -2.80879539e+01  -1.56701489e+01]
 [ -2.68247344e+01  -4.26085644e+01]
 [  9.25073132e+01  -7.16459074e+01]
 [  4.40722686e+01   6.54724871e+00]
 [ -3.79302739e+01  -3.55295401e+01]
 [  6.57988112e+01  -2.03448697e+02]
 [ -1.12990202e+02   5.60114405e+01]
 [  3.31073807e+01   1.92556044e+02]
 [ -5.48445473e+01   8.22362101e+00]
 [  4.49938798e+01  -1.87500787e+02]
 [  2.64336943e+01   2.28053977e+01]
 [  5.11339277e+01   7.74846096e+01]
 [ -6.16130296e+01  -2.22594587e+01]
 [ -1.28163036e+00  -1.89277178e+00]
 [  1.41376863e+01  -4.18989222e+01]
 [ -4.06707412e+01  -4.69776451e+01]
 [ -4.32288034e+01   6.12225233e+01]
 [  7.24746686e+01  -1.96097097e+01]
 [  1.97822709e+01   5.35450765e+01]
 [ -1.41564519e+01  -2.22815107e+01]
 [ -5.44327264e+00   4.87377271e+01]
 [  3.69416432e+01   5.05413004e+01]]
[[  9.13183867e+01   3.43813093e+01]
 [  6.32039836e+01   1.31012309e+02]
 [  4.92936924e+01   3.80308284e+01]
 [  8.92695860e+01   5.67185020e+01]
 [ -7.55351766e+01   2.52899350e+01]
 [  3.09446556e+01   7.84641207e+01]
 [ -4.73319358e+01   2.18946078e+01]
 [  7.88886380e+00   1.88780445e+02]
 [  5.93478479e+01  -9.43862598e+01]
 [ -1.82420709e+02   1.19098342e+02]
 [ -8.27501892e+02  -7.95702994e+01]
 [ -4.73830534e+01   8.19883737e+01]
 [  3.54569092e+01   1.37694569e+02]
 [ -2.25053438e+01  -5.33170724e+00]
 [ -2.33556560e+01   5.37155005e+01]
 [ -8.58758031e+01  -1.44296721e+01]
 [  5.60329712e+01   8.27930082e+01]
 [ -5.83903679e+01  -5.22117954e+01]
 [  8.60466785e+01   4.53532056e+00]
 [  1.07573238e+01  -3.97343193e+01]
 [ -5.51702261e+01  -8.81269113e+01]
 [ -5.34469763e+01  -1.89657074e+02]
 [  6.51188588e+01   1.84100396e+00]
 [ -7.41742435e-01   1.57766477e+01]
 [  1.85281335e+01  -7.74775007e+01]
 [ -4.18378927e+01   5.31545223e+01]
 [  5.57055074e+01  -5.57444733e+01]
 [ -3.18702093e+01   7.86865459e+00]
 [  8.04553083e+01  -3.67970130e+01]
 [ -6.73371425e+01  -9.73862571e+00]
 [ -1.02670758e+02  -4.45819900e+01]
 [ -2.67304680e+01   6.85254476e+01]
 [  1.28259249e+02  -1.11830038e+02]
 [  2.94704037e+01   4.90290368e+01]
 [  3.07194269e+01  -1.18487666e+02]
 [ -4.60922936e+01  -5.14205753e+01]
 [ -8.51841748e+01   1.75508971e+02]
 [  8.71242884e+01   1.37845597e+02]
 [  6.45742529e+01   2.77289306e+01]
 [ -2.01939142e+01   3.51716666e+01]
 [ -4.89984448e+01  -7.35045511e+01]
 [  1.42459175e+02  -1.96812104e+00]
 [  3.43364875e+01   9.25835597e+00]
 [ -1.75829507e+00   1.08099920e+02]
 [  5.68950546e+01  -1.53800220e+02]
 [  6.96574349e+01  -1.14073640e+02]
 [  6.57155914e+01   4.46009327e+01]
 [  2.67596309e+01  -3.83837747e+01]
 [  5.21833476e+02   1.71890117e+02]
 [  9.87184081e+01  -2.09975067e+01]
 [  1.91817509e+02   1.62090348e+02]
 [  2.41836465e+02  -6.19249330e+01]
 [  1.26652915e+01  -2.44281469e+01]
 [  1.16213111e+02  -3.57316088e+01]
 [ -1.91667153e+01  -3.39858184e+01]
 [ -8.90608560e+01  -5.95784090e+01]
 [ -7.49495218e+01   9.00630019e+01]
 [ -1.07790308e+02   8.90536302e+00]
 [  1.61165176e+01   7.36461202e+01]
 [  9.84298091e+01  -1.17140043e+02]
 [  2.25282765e+01  -9.87449972e+01]
 [ -6.02960060e+01   3.43007813e+01]
 [  1.82096691e+02   6.20193979e+01]
 [ -3.14003585e+01  -1.39688450e+02]
 [ -3.74243531e+01   3.57985630e+01]
 [  3.40159302e+01   2.72639677e+01]
 [  4.33583482e+01  -2.95562904e+01]
 [  4.81292280e+00   4.82484468e+01]
 [  6.02833315e+01  -7.48267637e+01]
 [ -9.96882890e+01   5.52999369e+01]
 [  2.10318097e+01   1.63301562e+02]
 [  9.61792811e+01  -1.67107801e+02]
 [  5.02829883e+01  -9.25972547e+00]
 [ -8.02113090e+00  -1.74739722e+02]
 [ -8.15194759e+00   4.34545369e+01]
 [  3.02873701e+01  -1.93173205e+02]
 [  1.04717231e+01   1.25866054e+02]
 [  3.70910428e+01  -1.02377048e+02]
 [  9.74816215e+01   1.62292133e+02]
 [  1.68029586e+01   5.82979029e+01]
 [ -1.27439078e+02  -1.08828188e+02]
 [ -1.18589182e+02   6.61713997e+01]
 [ -1.63392594e+01   1.55977663e+02]
 [ -3.69928128e+01  -7.47654088e+01]
 [ -6.85877861e+01  -7.69469823e+01]
 [  1.79884162e+02   9.08344985e+01]
 [  5.23290468e+01   1.57380570e+02]
 [  4.91726149e+01   5.66951471e+01]
 [  6.26475171e+01  -2.55354838e+01]
 [ -1.61630696e+01  -1.00483109e+02]
 [ -6.12958659e+01  -1.13151178e+02]
 [  1.15987061e+02   1.25209972e+01]
 [ -7.87342496e+01   6.06494046e+01]
 [  1.63087498e+01   7.90275407e+00]
 [ -1.27160497e+01  -5.80776764e+01]
 [  9.96079949e+01  -2.30919203e+02]
 [ -4.02131900e+01  -7.24163302e+00]
 [ -1.41382265e+02  -2.67367661e+02]
 [ -1.26032909e+02   4.12319718e+01]
 [  1.43536294e+00  -7.07398687e+01]
 [  8.19036232e+01   7.65232409e+01]
 [ -1.16438466e+01  -1.49551529e+02]
 [ -1.59577664e+02  -5.99853612e+01]
 [ -1.99856811e+01   1.16954973e+02]
 [ -2.68614677e+01  -2.06517472e+01]
 [  2.34820686e+02  -3.02553009e+01]
 [ -4.08619179e+01   1.23337326e+02]
 [ -1.40850384e+02   1.97357440e+02]
 [  1.02216244e+02   2.20172278e+01]
 [ -1.01590647e+02  -7.65003144e+01]
 [  3.18367560e-01   3.07595173e+01]
 [  1.67129920e+01  -8.89069052e+00]
 [ -1.63754481e+01  -7.71841099e+01]
 [  1.10612560e+01   2.42357097e+02]
 [ -7.84269932e+00   6.51039657e+01]
 [ -1.25561926e+00  -1.13530282e+02]
 [ -5.14276578e+01  -4.18808512e+01]
 [ -1.36059741e+02  -4.28031909e+01]
 [ -4.06117452e+01   6.46397949e+01]
 [ -4.43054134e+01  -2.70818246e+01]
 [  1.12511919e+01   9.13313840e+01]
 [ -1.04510774e+02  -1.02259019e+02]
 [ -4.46930047e+01  -1.25940163e+02]
 [  3.29344215e+01  -6.41014639e+01]
 [  8.44986320e+01  -5.41743265e+01]
 [ -1.45486758e+00   7.99639347e+01]
 [  3.48989542e+01  -1.45604042e+01]
 [ -4.28837439e+01   1.45708555e+02]
 [ -4.97685287e+01  -1.20807736e+01]
 [  7.95510418e+01  -9.59159375e+01]
 [ -9.93274268e+01  -5.90928466e-01]
 [ -1.15437841e+02  -2.76304443e+01]
 [  7.66426616e+01  -1.90647173e+02]
 [  3.85009799e+01   9.41236411e+01]
 [  1.03204406e+02  -5.91151812e+01]
 [  1.91401032e+00  -5.32465257e-01]
 [  1.75884353e+01   3.79805933e+01]
 [ -3.66476640e+01   1.01888957e+02]
 [  8.15631749e+01   9.28344877e+01]
 [  2.03305825e+01  -5.71435093e+01]
 [ -1.38481650e+01   9.30069190e+00]
 [ -7.49018726e+01   4.59578230e+01]
 [  1.52120574e+01   2.30738744e+01]
 [ -3.53270795e+01  -1.09952288e+02]
 [  4.89443110e+01   1.55135990e+01]
 [  1.18397968e+02   4.98555726e+01]
 [  2.80024083e+02  -1.12205610e+02]
 [ -2.75585294e+01   8.44681203e+01]
 [ -8.31954636e+01  -1.07463765e+02]
 [  1.31404624e+02   2.15980388e+01]
 [ -8.80618411e+00  -1.11497513e+01]
 [  5.06214018e+01  -1.80439025e+02]
 [ -6.40387054e+01   1.29103095e+02]
 [ -2.98054177e+01  -4.88988402e+01]
 [ -7.37298929e+01  -8.57275873e-01]
 [ -1.35821822e+01   9.20239934e+01]
 [ -8.81792004e+01   7.63448761e+01]
 [ -1.29565486e+02  -5.99846726e+00]
 [ -6.19002785e+01   7.29034671e+01]
 [  1.47585651e+02  -1.65977008e+02]
 [  4.41628862e+01  -8.39390754e+01]
 [ -2.23019290e+02  -7.49359476e+01]
 [ -1.02564964e+02   8.89563515e+01]
 [  4.03199961e+00  -1.31659890e+02]
 [  7.46044877e+01   1.71350891e+01]
 [ -6.73119748e+01  -2.94356633e+01]
 [ -3.47437106e+00  -2.37895626e+01]
 [ -2.77374263e+01   2.21560833e+01]
 [  3.70433610e-01  -9.26895324e+01]
 [ -3.80335365e+01  -9.23082993e+01]
 [  4.17601972e+01   6.52021985e+01]
 [  6.14298775e+01   1.10756161e+02]
 [  7.72100892e+01  -1.54481714e+01]
 [  4.15900073e+01   1.14520805e+02]
 [ -1.07369340e+02   2.74871216e+01]
 [ -8.86420178e+01  -1.44336165e+02]
 [  2.13806351e+01   1.05784070e+02]
 [ -1.31784363e+02  -7.21397831e+01]
 [ -8.15695382e+01  -8.55897797e+01]
 [ -6.87100597e+01  -6.21945330e+01]
 [ -5.83356074e+01   5.36798136e+01]
 [  4.31847031e+01  -4.49882819e+01]
 [ -8.79580719e+01   3.41773517e+01]
 [ -1.13769290e+00  -4.84813983e+01]
 [  1.02617470e+02   1.02517812e+02]
 [  1.20313869e+02   8.26274799e+01]
 [  4.56038898e+01  -6.67150558e+01]
 [ -8.18159888e+01  -4.27129480e+01]
 [ -1.15908030e+02  -6.01148287e+01]
 [ -4.92503359e+01   6.48543192e+00]
 [ -6.39382681e+01   1.46516464e+01]
 [ -5.72509330e+01  -1.47807118e+02]
 [ -9.42444255e+01  -2.79549317e+01]
 [ -1.72228696e+02   9.13606844e+01]
 [  1.99154220e+01   2.29179776e+02]
 [  8.15189086e+01  -7.14905042e+01]
 [ -8.55694900e+01   1.23393898e+02]
 [ -9.01016284e+01   1.18113672e+01]
 [  7.12220128e+01   6.13434210e+01]
 [ -5.90610758e+01   1.02154065e+02]
 [  6.95289200e+01  -4.29585746e+01]]
[[ -1.06590701e+02   4.47612561e+01]
 [ -7.01882236e+01  -7.69884755e+01]
 [ -8.89079358e+01   1.16942591e+02]
 [ -1.84858716e+01   1.38487345e+01]
 [  6.36517184e+01  -3.62510475e+01]
 [  1.13958238e+02  -4.43968821e-01]
 [ -2.24758120e+02  -2.19418459e+02]
 [  7.65764171e+01   2.23417948e+02]
 [ -6.41849728e+01   6.89088603e+01]
 [  1.10293825e+02   1.81157562e+01]
 [  2.80235417e+01   7.15426170e+01]
 [ -4.76615738e+01   3.32701956e+01]
 [ -9.30519964e+01  -2.35603305e+01]
 [  2.14357246e+02   1.55359986e+02]
 [ -9.16013348e+00   6.29900073e+01]
 [  5.31786197e+01  -1.70037368e+01]
 [  1.03888681e+02   3.22470047e+01]
 [ -6.42063544e+01   4.84360144e+01]
 [  9.23945965e+01  -5.49818097e+01]
 [ -3.45746642e+01  -8.74695129e+01]
 [  9.01662219e+01   3.72220859e+01]
 [ -2.13497396e+00   2.02289207e-01]
 [  5.92494867e+01  -4.77613341e+00]
 [  8.81591384e+00  -3.76991272e+01]
 [  1.15605527e+02   1.72284931e+02]
 [  2.04249739e+01   9.69212949e+01]
 [ -5.65699602e+01  -6.47608114e+01]
 [ -5.86660533e+00   1.07525313e+02]
 [ -7.30069631e+01  -1.48586592e+01]
 [  8.39901376e+01  -1.26800781e+02]
 [  4.01923309e+01   1.07608799e+01]
 [ -4.16743541e+01   9.22458268e+01]
 [ -8.65335779e+01  -6.06580979e+01]
 [ -1.17029947e+02  -5.40458761e+01]
 [ -2.93070846e+01  -5.37198272e+01]
 [ -7.21140371e-01   1.28364685e+02]
 [  3.13510463e+00  -1.08083033e+02]
 [  8.41154152e+01   4.76938198e+00]
 [ -2.66696176e+01   5.92902351e+01]
 [  8.10610816e+01  -7.41841974e+01]
 [  7.08311709e+01  -4.98063289e+01]
 [ -1.81605813e+02  -8.42018485e+00]
 [  1.04335490e+02   7.03693389e+01]
 [ -2.04784706e+01   2.91857036e+01]
 [  2.71253240e+01   5.27147812e+01]
 [  3.76410308e+01  -7.03034941e+01]
 [ -1.80955859e+01   1.62664392e+02]
 [  2.89118739e+01   1.45120116e+01]
 [  1.23076568e+01   7.29253879e+01]
 [ -1.12127734e+01   4.85782773e+01]
 [  1.37866475e+01  -5.20374719e+01]
 [  2.10877825e+01   1.26643448e+02]
 [  6.16985927e+00  -1.46120688e+01]
 [  8.19044742e+01   7.34404509e+01]
 [ -1.10296384e+02   1.04254946e+02]
 [  9.37909849e+01  -3.16579048e+01]
 [ -1.62874575e+01  -6.09506364e+01]
 [  1.77952608e+02  -7.96017822e+01]
 [ -5.63925413e+01   2.08762896e+00]
 [ -9.67558207e+01  -1.06474292e+02]
 [  2.13222235e+01  -8.01910473e+01]
 [  1.80104587e+01   2.15834884e+01]
 [ -6.07767166e+01  -9.85682999e+01]
 [  7.36653510e+01  -5.95499413e+00]
 [ -2.69025444e+01   1.14753547e+02]
 [ -8.10774334e+01   3.87673729e+01]
 [ -3.41186012e+01  -4.44557964e+01]
 [ -8.72774846e+01   1.53989442e+01]
 [ -6.54128071e+01  -5.47177752e+01]
 [ -1.21824468e+02  -2.47415132e+01]
 [  1.75380115e+01   8.40587369e+01]
 [ -5.24052954e+01  -3.81260401e+01]
 [ -1.11469149e+01  -1.03854276e+02]
 [  1.89556148e+01   1.52904860e+02]
 [ -2.24138859e+02   2.37436531e+02]
 [  4.77818613e+01  -1.60046407e+02]
 [  2.78425780e+01  -8.56641623e+01]
 [ -5.09488588e+00   1.42371705e+01]
 [ -1.87403046e-02  -5.12674553e+01]
 [ -9.53072596e+00  -3.62094235e+01]
 [  2.16189883e+01  -1.14849896e+02]
 [ -1.79021945e+02  -3.92642360e+01]
 [  1.03069528e+01   2.93466034e+01]
 [  7.64856593e+01  -1.67119332e+01]
 [  4.23299416e+01   1.30858154e+02]
 [ -4.21942042e+01  -2.79912245e+00]
 [  4.86899139e+01   3.31135718e+01]
 [  3.81323483e+01  -6.49490935e+00]
 [ -2.28412962e+01  -8.84832447e+01]
 [ -1.63860051e+02   1.10381706e+02]
 [  1.28942594e+02   1.69429454e+01]
 [  3.78371887e+01  -4.11848956e+01]
 [ -6.51454698e+01  -4.10573828e+01]
 [  2.54221162e+01   3.68182190e+01]
 [ -1.39862331e+00  -7.99910934e+01]
 [ -8.67667070e+01  -4.14680426e+01]
 [  7.50025799e+01   3.36189985e+01]
 [  1.19371155e+02  -6.21195030e+01]
 [  6.32988054e+01   1.28401692e+02]
 [ -1.37069962e+01  -4.77538764e+01]
 [  1.23431789e+01   9.76671175e+00]
 [ -5.28087368e+01   5.60638671e+01]
 [  5.40300572e+01   4.51426659e+01]
 [ -3.66872891e+01  -7.68234273e+01]
 [ -1.21432043e+01  -1.62901954e+01]
 [ -1.47528486e+01  -3.27412526e+00]
 [ -3.69644979e+00  -6.60465745e+01]
 [ -1.05477299e+02  -1.56097644e+02]
 [  3.02226809e+01  -3.27394968e+01]
 [  5.20079341e+01   1.94302698e+01]
 [ -7.17637544e+01   1.77932287e+00]
 [  3.84562260e+01   2.65533461e+01]
 [  2.46474988e+01  -1.56102007e+01]
 [ -3.38527333e+01   2.70399189e+01]
 [  1.13245136e+01  -1.98741530e+00]
 [ -1.45585053e+01  -1.39696500e+02]
 [  7.69601912e+01  -2.88634670e+01]
 [ -4.08812768e+01   1.59840796e+01]
 [ -8.25550857e+01   8.01506455e+01]
 [  6.63907283e+01  -8.38733972e+01]
 [ -1.02993362e+00  -2.61400391e+01]
 [ -7.42908183e+01  -3.04965015e+01]
 [  1.61798623e+02   4.87287089e+01]
 [ -5.48673738e+01  -7.99642337e+01]
 [ -8.89155805e+01   4.45984588e+01]
 [  5.39606693e+01  -9.40987603e+01]
 [  1.76733982e+01  -6.61759814e+01]
 [  3.47356142e+01  -1.01452485e+02]
 [ -3.01071355e+00   8.91225942e+01]
 [ -3.32362199e+01   4.56214657e+01]
 [  1.27371998e+01   5.73841247e+01]
 [ -7.10226112e+01  -1.44833249e+02]
 [  1.62476438e+01  -2.49787273e+01]
 [  4.04782252e+01   1.09541318e+02]
 [  3.37794574e+01  -1.31093500e+02]
 [  7.26932901e+01   4.45152428e+01]
 [  9.25572482e+01   1.95467218e+01]
 [  6.35341565e+01   3.62253831e+01]
 [ -1.09520476e+02  -5.17316948e+00]
 [ -2.59742894e+01  -2.35472154e+01]
 [  5.14751560e+01  -7.63272615e+01]
 [ -1.15103699e+02   2.07103129e+01]
 [  2.46312561e+01  -2.03357892e+00]
 [  5.25389574e+01  -5.00349384e+01]
 [ -1.01179923e+02  -2.08801316e+02]
 [  5.74611552e+01   9.79026549e+01]
 [  5.73394188e+01  -1.11133462e+02]
 [  2.46086963e+00   1.82546758e+02]
 [ -5.53660696e+01  -2.22181139e+01]
 [ -2.92343318e+01  -1.12007984e+02]
 [  6.32978892e+01  -6.81114721e+01]
 [  3.84558543e+01   4.53998140e+01]
 [ -6.00795864e+01   1.47168423e+01]
 [ -1.17609355e+01   1.94074112e+02]
 [ -4.04532110e+01  -6.17903432e+01]
 [  4.93785958e+01  -3.13293948e+01]
 [ -2.25582467e+01   4.68716347e+01]
 [ -2.76567700e+01   4.57812954e+00]
 [ -9.18370905e+01  -2.15165058e+00]
 [  1.15448034e+01  -9.27348271e+01]
 [ -2.73595189e+02   1.40420935e+02]
 [ -6.65813980e+00   7.48897127e+01]
 [  4.73319284e+01   6.87680406e+01]
 [ -5.73662486e+01   3.12088117e+01]
 [ -5.63245238e+01  -9.10069898e+00]
 [ -3.01638833e+01  -9.59064546e+01]
 [  5.17948132e+01   4.14524265e+00]
 [ -4.85759867e+01  -5.03768518e+01]
 [  1.79473910e+02  -2.83566676e+02]
 [ -5.06165869e+01   7.53462025e+01]
 [ -5.40900614e+01  -1.22511113e+02]
 [  8.39417742e+01  -4.40035606e+01]
 [ -3.23729629e+00   2.72653024e+01]
 [ -4.02544314e+01   5.85920281e+01]
 [ -1.03960880e+01   3.95650713e+01]
 [ -2.33862594e+01  -7.69794723e+01]
 [  6.57097401e+01   8.18541573e+01]
 [  2.44494433e+01  -4.37776251e+01]
 [  9.41752277e+01   5.24817313e+01]
 [ -3.35769394e+01   7.28403672e+01]
 [ -6.70821819e+01   2.75064065e+01]
 [  4.42419727e+01   8.28411160e+01]
 [  3.44859066e+01  -5.56019878e+01]
 [ -2.30975687e+01   8.29624023e+01]
 [  4.44569599e+02  -8.10697821e+01]
 [  4.39738132e+01   5.62189773e+01]
 [ -2.90939795e+01  -1.20851910e+01]
 [  1.30855157e+02   3.89228078e+01]
 [  1.41429300e+02   9.89317133e+01]
 [  7.00005880e+01   1.54960706e+01]
 [  3.91011661e+01  -2.11496616e+01]
 [  3.68028716e+01   8.90029328e+01]
 [ -1.17001595e+02   8.26984316e+01]
 [  6.53232388e+01   6.15818142e+01]
 [ -1.49854566e+02  -1.16344236e+02]
 [ -2.08239603e+01  -3.35392110e+01]
 [  9.24888025e+01  -1.40702772e+01]
 [  8.66284849e+00   4.52869863e+01]
 [ -2.24728491e+02  -2.56810907e+01]
 [  1.08848784e+02  -1.85306587e+01]
 [ -3.89546396e+01  -3.04376464e+01]]

In [21]:
class LSA_Ngram_Cooc(VectorSpace):
    def __init__(self, obs_corpus, target_corpus, 
            obs_ngram=1, target_ngram=1, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus
        self.obs_ngram = obs_ngram
        self.target_ngram = target_ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter
        self.obs_ngram_str = ngram_utils._ngram_str_map[self.obs_ngram]
        self.target_ngram_str = ngram_utils._ngram_str_map[self.target_ngram]

    def _get_cooc_terms(self, lst1, lst2, join_str):
        out = [""] * len(lst1) * len(lst2)
        cnt =  0
        for item1 in lst1:
            for item2 in lst2:
                out[cnt] = item1 + join_str + item2
                cnt += 1
        res = " ".join(out)
        return res

    def transform(self):
        obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
        target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
        cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))

        tfidf = self._init_word_ngram_tfidf(ngram=1)
        X = tfidf.fit_transform(cooc_terms)
        svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)
    
    
lsa_word = LSA_Ngram_Cooc(train['question1'],train['question2'], svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_word.transform()


[[ 0.38716338 -0.11492736  0.02693557 ..., -0.21959318  0.04572143
  -0.20302537]
 [ 0.37860533 -0.17642094 -0.11915297 ..., -0.02106806  0.06484059
  -0.14097846]
 [ 0.16008797  0.26690364  0.07742443 ...,  0.00483183 -0.03306614
   0.19714163]
 ..., 
 [ 0.18403166  0.3767789   0.17852834 ..., -0.06044721 -0.00204786
  -0.02433432]
 [ 0.35875601 -0.10181138 -0.30319755 ...,  0.02949101  0.00761067
  -0.05966009]
 [ 0.43520514 -0.10988795  0.11824322 ..., -0.02219283 -0.1385344
  -0.03518994]]

In [25]:
# ------------------------ LSA Cosine Similarity -------------------------------
class LSA_Ngram_CosineSim(VectorSpace):
    def __init__(self, obs_corpus, target_corpus, ngram=3, svd_dim=100, svd_n_iter=5):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus
        self.ngram = ngram
        self.svd_dim = svd_dim
        self.svd_n_iter = svd_n_iter

    def word_transform(self):
        ## get common vocabulary
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## targetument tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim
    
    def char_transform(self):
        ## get common vocabulary
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## targetument tfidf
        tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim
    
cosinesim_word = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=3, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print cosinesim_word.word_transform()
cosinesim_char = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print cosinesim_char.char_transform()


[ 0.97847646  0.68061563  0.95171286  0.21872809  0.94611844  0.96902353
  0.25696486  0.59150984  0.9846641   0.20744675  0.48132165  0.32993468
  0.82545877  0.97468417  0.99838001  0.52968499  0.99922397  0.2458281
  0.78626417  0.99988169  0.72661768  0.47052346  0.49049765  0.20727479
  0.35482186  0.9999447   0.89821017  0.02040301  0.98584412  0.88299513
  0.92618964  0.05305127  0.98456905  0.10727389  0.98793669  0.53883786
  0.96040861  0.44068233  0.91302845  0.16338309  0.27571471  0.99434496
  0.99971668  0.72264042  0.99988953  0.19729982 -0.08330814  0.63705422
  0.93522447  0.45299138  0.88233123  0.78467812  0.19955324  0.88649557
  0.09486967  0.13843015  0.61367832  0.82848928  0.9149584   0.04698079
  0.00418346  0.92522448  0.93782364  0.95337823  0.59217851  0.33222821
  0.79530532  0.43176658  0.86225878  0.39578685  0.97123748  0.88583323
  0.9787098   0.4108435   0.98901547  0.80988778 -0.00455441  0.42917243
  0.14127949  0.95090123  0.53664426  0.35051511  0.78033864  0.97437975
  0.7531871   0.95623102  0.99936168  0.83833572  0.99999133  0.99984932
  0.43550679  0.56377602  0.26765733  0.84891584  0.1201079   0.29613934
  0.09444463  0.28360104  0.92889689  0.25187983  0.90504253  0.50553596
  0.90048071  0.60812189  0.7861426   0.87320403  0.26265359  0.84059766
  0.99427232  0.55902484  0.99955354  0.92016208  0.02025874  0.91300975
  0.24986481  0.92863196  0.82990626  0.57659715  0.99978879  0.29455396
  0.32919569  0.19173036  0.80045336  0.94576466  0.01522838  0.90846361
  0.98092129  0.75891853  0.54464329  0.99107582  0.16645548  0.24876907
  0.38068777  0.48472698  0.98417028  0.97846389  0.86901746  0.82526778
  0.72487132  0.69198736  0.73179923  0.23163869  0.38995163  0.99778074
  0.18034151  0.99982609  0.07680813  0.9852764   0.98228077  0.07700296
  0.01383005  0.29387107  0.89593184  0.43515343  0.14318085  0.92177108
  0.18833192 -0.14314279  0.77343815  0.950378    0.90489878  0.74904658
  0.68081138  0.31342257  0.97131816  0.99901336  0.31519039  0.86299769
  0.55107185  0.20002086  0.52189134  0.56188584  0.02268585  0.54532925
  0.65380586  0.78774665  0.9999381   0.10189365  0.76001005  1.
  0.11854473  0.41568726  0.85111363  0.57712473  0.82751471  0.99991268
  0.05387595  0.61529708  0.34640498  0.76817289  0.97042254  0.40596956
  0.82389423  0.43219719  0.3681637   0.49599878  0.04898087  0.91222096
  0.99963074  0.82201436  0.87131375]
[ 0.99178639  0.63801509  0.98273924  0.4847442   0.79744502  0.98561066
  0.46736319  0.35790612  0.99890627  0.67156868  0.73466789  0.66038146
  0.90862895  0.98665685  0.99799632  0.76940604  0.99937874  0.56742223
  0.86700263  0.99410509  0.71112488  0.84011148  0.91348753  0.65796207
  0.74664463  0.99500172  0.96584957  0.75086233  0.96454036  0.94847977
  0.89200944  0.48774426  0.99784299  0.80597738  0.99484844  0.84949343
  0.8779622   0.82543638  0.98682371  0.64949042  0.65693149  0.99908836
  0.99947643  0.8386149   0.99803415  0.56212099  0.5611447   0.89153242
  0.93992571  0.84395366  0.88601977  0.87953357  0.65444363  0.94854123
  0.28850104  0.55950766  0.81256027  0.87677     0.92905792  0.39836332
  0.71332258  0.77643685  0.86561375  0.9474073   0.82819708  0.78237369
  0.77580958  0.80822275  0.86562124  0.78262114  0.84662999  0.99864456
  0.98171312  0.75858626  0.99806616  0.91618098  0.76045661  0.82331919
  0.4208947   0.95516545  0.7861488   0.79527526  0.84987328  0.88955617
  0.92798058  0.94253326  0.99928598  0.98638385  0.99932128  0.99986008
  0.672289    0.80692915  0.54584914  0.9746655   0.56388286  0.95830926
  0.39924611  0.4505322   0.97422824  0.47469844  0.96745747  0.78161028
  0.83757936  0.81458691  0.77117754  0.96230215  0.47995281  0.98084649
  0.99784848  0.68651074  0.99675229  0.97544143  0.40743256  0.93637781
  0.82034272  0.74135312  0.90699147  0.74010863  0.99525442  0.39714472
  0.84013242  0.21110677  0.96592173  0.98795381  0.28164293  0.89829297
  0.97812002  0.8157163   0.73067408  0.98372464  0.3934995   0.4933461
  0.81520057  0.83278918  0.95715037  0.96785429  0.93023857  0.87521668
  0.83722341  0.82538296  0.82384616  0.50905848  0.56569297  0.98529562
  0.50011197  0.99917931  0.65652144  0.97901325  0.96357356  0.7886815
  0.62506885  0.68459096  0.94366129  0.75168654  0.36506271  0.93385706
  0.78202833  0.70801707  0.83250707  0.99157619  0.89543928  0.95665016
  0.85309646  0.68966543  0.98964196  0.99276634  0.9197685   0.91949687
  0.77531081  0.75951635  0.88899816  0.74909488  0.3020502   0.89798955
  0.72176574  0.89179829  0.9950308   0.48018462  0.92915563  0.96118741
  0.63384878  0.82229088  0.93736981  0.79407055  0.84998167  0.999668
  0.69653724  0.63606812  0.81216597  0.96627741  0.94285381  0.78132456
  0.89681399  0.80899747  0.60599696  0.82675927  0.59491751  0.80364397
  0.99818929  0.91149881  0.97911683]
Out[25]:
201

In [56]:
# ------------------- Char distribution -------------------
class CharDistribution(VectorSpace):
    def __init__(self, obs_corpus, target_corpus):
        self.obs_corpus = obs_corpus
        self.target_corpus = target_corpus

    def normalize(self, text):
        # pat = re.compile("[a-z0-9]")
        pat = re.compile("[a-z]")
        group = pat.findall(text.lower())
        if group is None:
            res = " "
        else:
            res = "".join(group)
            res += " "
        return res

    def preprocess(self, corpus):
        return [self.normalize(text) for text in corpus]

    def get_distribution(self):
        ## obs tfidf
        tfidf = self._init_char_tfidf()
        X_obs = tfidf.fit_transform(self.preprocess(self.obs_corpus)).todense()
        X_obs = np.asarray(X_obs)
        # apply laplacian smoothing
        s = 1.
        X_obs = (X_obs + s) / (np.sum(X_obs, axis=1)[:,None] + X_obs.shape[1]*s)
        ## targetument tfidf
        tfidf = self._init_char_tfidf()
        X_target = tfidf.fit_transform(self.preprocess(self.target_corpus)).todense()
        X_target = np.asarray(X_target)
        X_target = (X_target + s) / (np.sum(X_target, axis=1)[:,None] + X_target.shape[1]*s)
        return X_obs, X_target

class CharDistribution_transform(CharDistribution):
    def __init__(self, obs_corpus, target_corpus, const_A=1., const_B=1.):
        CharDistribution.__init__(self, obs_corpus, target_corpus)
        self.const_A = const_A
        self.const_B = const_B

    def ratio_transform(self):
        X_obs, X_target = self.get_distribution()
        ratio = (X_obs + self.const_A) / (X_target + self.const_B)
        return ratio

    def cosine_transform(self):
        X_obs, X_target = self.get_distribution()
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim

    def kl_transform(self):
        X_obs, X_target = self.get_distribution()
        kl = dist_utils._KL(X_obs, X_target)
        return kl
    
# cosinesim_word = CharDistribution(train['question1'],train['question2'])
# print cosinesim_word.get_distribution()
cosinesim_word = CharDistribution_transform(train['question1'],train['question2'])
print cosinesim_word.ratio_transform()
print cosinesim_word.cosine_transform()
print cosinesim_word.kl_transform()


[[ 1.00735043  0.99754127  0.99875356 ...,  0.99875356  0.99754127
   0.99875356]
 [ 0.98521285  0.99487179  0.99487179 ...,  1.00502355  1.02025118
   1.00502355]
 [ 0.99404319  0.97285068  1.00521221 ...,  0.99845201  0.99694423
   0.99845201]
 ..., 
 [ 0.99753695  0.99753695  0.99524376 ...,  0.99874608  0.99874608
   0.99874608]
 [ 0.98924731  0.98924731  0.99966044 ...,  0.99988438  0.99988438
   1.01052145]
 [ 1.01079219  0.98275058  0.99855769 ...,  0.9992674   0.99855769
   0.9992674 ]]
[ 0.9826394   0.94292652  0.95321004  0.81583971  0.92166164  0.97178847
  0.79229621  0.95374099  0.99052999  0.89268778  0.91977606  0.91901409
  0.97485929  0.99200608  0.99795932  0.9512718   0.99629752  0.926243
  0.94867303  0.98153152  0.90606067  0.94434925  0.94616624  0.79632086
  0.82330525  0.98999437  0.96656932  0.94472788  0.98221291  0.92317469
  0.9654985   0.940977    0.98609112  0.89835154  0.97657288  0.94763479
  0.95091058  0.93958255  0.91248902  0.94261539  0.88117256  0.9960804   1.
  0.90555094  0.98909412  0.91974531  0.88622389  0.9133389   0.97558949
  0.9825015   0.98395878  0.9591192   0.82945335  0.95453085  0.79285833
  0.8000618   0.90134218  0.9409703   0.92309775  0.88693031  0.88984149
  0.93816894  0.8450003   0.95307425  0.9727663   0.92600582  0.95759185
  0.88808914  0.96999663  0.85063566  0.96357409  0.99805908  0.98884759
  0.93451154  0.98943459  0.86820135  0.95715105  0.93493311  0.92211953
  0.93810028  0.92097015  0.93512304  0.93340953  0.83704845  0.97473559
  0.97212846  0.99345744  0.97376824  0.98891303  1.          0.94370181
  0.96496422  0.96525097  0.97128586  0.88758809  0.9903538   0.90328925
  0.92425091  0.97627213  0.79397987  0.97117606  0.88987898  0.96586673
  0.97629106  0.92842053  0.91627277  0.85899015  0.99788523  0.98255008
  0.94219273  0.93109797  0.98785811  0.91892329  0.94971112  0.92556984
  0.87510683  0.90485263  0.91072879  0.99022409  0.91993637  0.93521812
  0.92354275  0.97601221  0.95559281  0.89190351  0.91479731  0.97940351
  0.95568445  0.89068571  0.93733967  0.94675312  0.79867668  0.93045826
  0.94433024  0.94009611  0.92219341  0.95841588  0.94617516  0.91882212
  0.98553503  0.91104966  0.88525871  0.91555018  0.97647343  0.93952998
  0.9936686   0.90930733  0.94782149  0.90459782  0.93404988  0.88839122
  0.94315251  0.96645139  0.95198927  0.91979925  0.93349258  0.94737976
  0.94507341  0.93643739  0.97122132  0.94870586  0.87343877  0.93973374
  0.97730594  0.96889656  0.98317018  0.93747691  0.91878274  0.98221012
  0.85967408  0.91288749  0.9572127   0.94637189  0.87212696  0.98704295
  0.98394368  0.96588137  0.92609128  0.96728716  0.96770358  0.93951706
  0.97030339  0.96485741  0.96123795  0.93017941  0.99362467  0.89599749
  0.88631929  0.9141318   0.95305217  0.95704283  0.94421203  0.97524414
  0.95729402  0.9567795   0.98091934  0.94176265  0.96103267  0.99166253
  0.96115965  0.99195011]
[ 0.01682067  0.0796156   0.07171242  0.24855499  0.08986008  0.03641235
  0.2653929   0.05809005  0.0078962   0.13504188  0.09848521  0.10916667
  0.02422421  0.01125173  0.00552049  0.06523018  0.00358402  0.07983557
  0.08141213  0.03547613  0.10338761  0.0584186   0.06030454  0.2490528
  0.18951055  0.01270993  0.04039128  0.08092216  0.02063058  0.08427689
  0.0477473   0.07331234  0.01516779  0.14015784  0.03476908  0.0699751
  0.06384158  0.10487567  0.09109478  0.06183025  0.13964877  0.00756066
  0.          0.12732599  0.01498009  0.09571836  0.14842255  0.12736872
  0.03096037  0.01692633  0.02948748  0.0611181   0.19392275  0.06544355
  0.24281081  0.25804983  0.14928538  0.06932685  0.10367104  0.14826642
  0.12125243  0.10526323  0.17937113  0.05014049  0.03216703  0.0933903
  0.04792378  0.1438571   0.03491942  0.19332067  0.04587188  0.00168473
  0.0135663   0.09544511  0.01065786  0.16934618  0.06598154  0.08303051
  0.10519539  0.06684047  0.09113271  0.09000492  0.10294429  0.16354588
  0.03443088  0.04307841  0.01422265  0.03363462  0.01239627  0.
  0.07840696  0.05226996  0.04704558  0.03106125  0.12227486  0.01582191
  0.12117553  0.09845484  0.04731375  0.20984116  0.0436355   0.14312034
  0.05342368  0.02496784  0.09233166  0.08479122  0.18509517  0.00208024
  0.02146044  0.10545008  0.0804003   0.01143788  0.10280223  0.06405977
  0.09925797  0.1672342   0.12094009  0.12336238  0.01385762  0.13373036
  0.07815333  0.10910176  0.02254955  0.07375036  0.13456616  0.11863963
  0.02713982  0.05819545  0.14472208  0.08472487  0.06123127  0.21909618
  0.06500221  0.06616168  0.10199703  0.09380185  0.04222597  0.08184042
  0.14974309  0.01955891  0.12567014  0.11054515  0.09514862  0.03233559
  0.08875785  0.01039442  0.12396597  0.06958592  0.13508768  0.10383715
  0.16046734  0.07116574  0.04887657  0.05747437  0.09202446  0.08862129
  0.07707962  0.06567223  0.07362039  0.03610086  0.05998102  0.15175464
  0.06419945  0.02790579  0.03978076  0.01889939  0.09403292  0.08938616
  0.02492801  0.15753856  0.12399897  0.06383356  0.07382786  0.15322827
  0.01291446  0.01668082  0.04740949  0.08171892  0.03964108  0.04215899
  0.06869944  0.03926618  0.05914289  0.04270672  0.10857149  0.01052327
  0.14184433  0.17706804  0.10499039  0.08080161  0.04741281  0.06988544
  0.02301047  0.0546356   0.0675504   0.01784008  0.06892845  0.05075866
  0.01185953  0.06393028  0.0074332 ]

In [17]:
from nltk.corpus import wordnet as wn
from utils import dist_utils, ngram_utils, nlp_utils, pkl_utils
from utils import logging_utils, time_utils

# tune the token pattern to get a better correlation with y_train
token_pattern = r"(?u)\b\w\w+\b"
# token_pattern = r"\w{1,}"
# token_pattern = r"\w+"
# token_pattern = r"[\w']+"
# token_pattern = " " 

class WordNet_Similarity:
    """Double aggregation features"""
    def __init__(self, metric="path"):
#         super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
        self.metric = metric
        if self.metric == "path":
            self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
        elif self.metric == "lch":
            self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
        elif self.metric == "wup":
            self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
        else:
            raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric))
            
    def _maximum_similarity_for_two_synset_list(self, syn_list1, syn_list2):
        s = 0.
        if syn_list1 and syn_list2:
            for syn1 in syn_list1:
                for syn2 in syn_list2:
                    try:
                        _s = self.metric_func(syn1, syn2)
                    except:
                        _s = config.MISSING_VALUE_NUMERIC
                    if _s and _s > s:
                        s = _s
        return s

    def transform_one(self, obs, target):
        obs_tokens = nlp_utils._tokenize(obs, token_pattern)
        target_tokens = nlp_utils._tokenize(target, token_pattern)
        obs_synset_list = [wn.synsets((obs_token).decode('utf-8')) for obs_token in obs_tokens]
        target_synset_list = [wn.synsets((target_token).decode('utf-8')) for target_token in target_tokens]
        val_list = []
        for obs_synset in obs_synset_list:
            _val_list = []
            for target_synset in target_synset_list:
                _s = self._maximum_similarity_for_two_synset_list(obs_synset, target_synset)
                _val_list.append(_s)
            if len(_val_list) == 0:
                _val_list = [config.MISSING_VALUE_NUMERIC]
            val_list.append( max(_val_list) )
        if len(val_list) == 0:
            val_list = [[config.MISSING_VALUE_NUMERIC]]
        return np.mean(val_list)
    
t = train.sample(n=10)
wn_list = ["path", "lch", "wup"]
for wn_method in wn_list:
    wn_sim = WordNet_Similarity(metric=wn_method)
    t.apply(lambda x: wn_sim.transform_one(x['question1'],x['question2']), axis=1)


Out[17]:
290086    0.138095
72013     0.363054
366541    0.500000
67367     0.555556
89896     0.500000
217790    0.239914
275125    0.425397
608       0.513889
21845     0.532407
218776    0.127778
dtype: float64
Out[17]:
290086    1.271662
72013     1.721377
366541    1.999934
67367     2.427426
89896     1.818793
217790    1.640691
275125    1.723845
608       2.375150
21845     1.875742
218776    0.904142
dtype: float64
Out[17]:
290086    0.426494
72013     0.515385
366541    0.565171
67367     0.658333
89896     0.500000
217790    0.558175
275125    0.631258
608       0.706926
21845     0.625356
218776    0.270120
dtype: float64

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [4]:
from __future__ import division
import time, os, gc
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, f_classif
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction import text
from sklearn.metrics import log_loss

import cPickle
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
import config
stop_words = stopwords.words('english')

PATH = config.RAW_PATH
FEAT_PATH = config.FEAT_PATH
# train = pd.read_csv(PATH+'train.csv',nrows=config.TRAIN_SIZE)
# test = pd.read_csv(PATH+'test.csv',nrows=config.TEST_SIZE)
data = train#.ix[:100]

def wmd(s1,s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words("english")
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1,s2)

def norm_wmd(s1,s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words("english")
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return norm_model.wmdistance(s1, s2)


def sent2vec(s):
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

# data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
# data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
# data['diff_len'] = data.len_q1 - data.len_q2
# data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
# data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
# data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
# data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
# data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

model = gensim.models.KeyedVectors.load_word2vec_format(PATH+'GoogleNews-vectors-negative300.bin.gz', binary=True)
# data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# norm_model =  gensim.models.KeyedVectors.load_word2vec_format(PATH+'GoogleNews-vectors-negative300.bin.gz', binary=True)
# norm_model.init_sims(replace=True)
# data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
for i, q in tqdm(enumerate(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors  = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

# data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]

# data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
#                                                           np.nan_to_num(question2_vectors))]


# data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
# data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
# data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
# data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

# cPickle.dump(question1_vectors, open(FEAT_PATH+'q1_w2v.pkl', 'wb'), -1)
# cPickle.dump(question2_vectors, open(FEAT_PATH+'q2_w2v.pkl', 'wb'), -1)

# data.to_csv(FEAT_PATH+'ab_features.csv', index=False)

# data


2750086it [10:59, 4170.62it/s]
2750086it [11:02, 4149.21it/s]

In [5]:
from utils import dist_utils, ngram_utils, nlp_utils
data['RMSE_distance'] = [dist_utils._rmse(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]