In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from __future__ import division
import re, time, os, gc
import sys
import string
import numpy as np
import pandas as pd
import scipy
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils
from utils import logging_utils, time_utils, pkl_utils
df = pd.read_csv(config.RAW_PATH+'train.csv')
df['question1'] = df['question1'].astype(str)
df['question2'] = df['question2'].astype(str)
train = df.sample(n=100)[['question1', 'question2']]
In [3]:
train_orig = pd.read_csv(config.RAW_PATH+'train.csv', header=0)
test_orig = pd.read_csv(config.RAW_PATH+'test.csv', header=0)
train = pd.concat([train_orig[['question1', 'question2']], \
test_orig[['question1', 'question2']]], axis=0).reset_index(drop=True)
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)
In [2]:
def jaccard_ngram(obs, target, ngram=1, token_pattern=" "):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
return dist_utils._jaccard_coef(obs_ngrams, target_ngrams)
def dice_distance_ngram(obs, target, ngram=1, token_pattern=" "):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
return dist_utils._dice_dist(obs_ngrams, target_ngrams)
def compression_dist(obs, target):
return dist_utils._compression_dist(obs, target)
def edit_dist(obs, target):
return dist_utils._edit_dist(obs, target)
def compression_dist_ngram(obs, target, ngram=2, token_pattern=" "):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
val_list = []
for w1 in obs_ngrams:
_val_list = []
for w2 in target_ngrams:
s = dist_utils._compression_dist(w1, w2)
_val_list.append(s)
if len(_val_list) == 0:
_val_list = [-1]
val_list.append( max(_val_list) )
if len(val_list) == 0:
val_list = [-1]
return min(val_list)
def edit_dist_ngram(obs, target, ngram=2, token_pattern=" ", agg=[np.min, np.max]):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
val_list = []
for w1 in obs_ngrams:
_val_list = []
for w2 in target_ngrams:
s = dist_utils._edit_dist(w1, w2)
_val_list.append(s)
if len(_val_list) == 0:
_val_list = [-1]
val_list.append( agg[0](_val_list) )
if len(val_list) == 0:
val_list = [-1]
return float(agg[1](val_list))
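In [ ]:
# Illustrative sketch only (not part of the pipeline): the project helpers in dist_utils /
# ngram_utils are not shown here, so this toy code assumes _jaccard_coef and _dice_dist are the
# standard set-based coefficients over n-gram sets. All toy_* names are hypothetical stand-ins.
def toy_ngrams(tokens, n, join_str="_"):
    # contiguous word n-grams, e.g. ["how_do", "do_i", ...] for n=2
    return [join_str.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
def toy_jaccard(a, b):
    A, B = set(a), set(b)
    return len(A & B) / float(len(A | B)) if (A or B) else 0.0
def toy_dice(a, b):
    A, B = set(a), set(b)
    return 2.0 * len(A & B) / float(len(A) + len(B)) if (A or B) else 0.0
q1 = "how do i learn python fast".split(" ")
q2 = "what is the fastest way to learn python".split(" ")
print(toy_jaccard(toy_ngrams(q1, 2), toy_ngrams(q2, 2)))   # 1 shared bigram / 11 in the union
print(toy_dice(toy_ngrams(q1, 2), toy_ngrams(q2, 2)))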
In [3]:
for NGRAMS in [1,2,3]:
train['jaccard_n%s'%NGRAMS] = train.apply(lambda x: jaccard_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)
train['dice_distance_n%s'%NGRAMS] = train.apply(lambda x: dice_distance_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)
train['compression_dist'] = train.apply(lambda x: compression_dist(x['question1'],x['question2']), axis=1)
train['edit_dist'] = train.apply(lambda x: edit_dist(x['question1'],x['question2']), axis=1)
np_dict = { 'mean':np.mean, 'min':np.min, 'max':np.max, 'median':np.median, 'std':np.std }
for AGG_NGRAMS in [2,3]:
for agg1 in np_dict.keys():
for agg2 in np_dict.keys():
AGG_BY = agg1 + '_' + agg2
AGG_FUNC = [np_dict[agg1],np_dict[agg2]]
# train['compression_dist_agg_n%s'%AGG_NGRAMS] = train.apply(lambda x: compression_dist_ngram(x['question1'],x['question2'],ngram=AGG_NGRAMS), axis=1)
train['edit_dist_agg_n%s_%s'%(AGG_NGRAMS,AGG_BY)] = train.apply(lambda x:
edit_dist_ngram(x['question1'],x['question2'], ngram=AGG_NGRAMS, agg=AGG_FUNC), axis=1)
train.corr()
Out[3]:
In [266]:
import datetime
print datetime.datetime.now()
In [267]:
train.to_csv(config.RAW_PATH+'train_1111111.csv',index=False)
In [269]:
train_orig = pd.read_csv(config.RAW_PATH+'train.csv', header=0)
test_orig = pd.read_csv(config.RAW_PATH+'test.csv', header=0)
test_orig['is_duplicate'] = -1
train1 = pd.concat([train_orig[['question1', 'question2','is_duplicate']], \
test_orig[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop=True)
train['is_duplicate'] = train1['is_duplicate']
In [270]:
train[train['is_duplicate']>=0].corr()
Out[270]:
In [4]:
def get_position_list(obs, target, ngram=1, token_pattern=" "):
"""
Get the list of positions of obs in target
"""
obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
target_tokens = nlp_utils._tokenize(str(target), token_pattern)
obs = ngram_utils._ngrams(obs_tokens, ngram)
target = ngram_utils._ngrams(target_tokens, ngram)
pos_of_obs_in_target = [0]
if len(obs) != 0:
pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
if len(pos_of_obs_in_target) == 0:
pos_of_obs_in_target = [0]
return pos_of_obs_in_target, len(obs)
for ngram in [1,2]:
for target_name in ['question1','question2']:
for obs_name in ['question1','question2']:
if target_name != obs_name:
position = train[['question1','question2']].apply(lambda x: get_position_list(obs=x[obs_name],target=x[target_name],ngram=ngram), axis=1)
pos = [i[0] for i in position]
obs_len = [i[1] for i in position]
## stats feat on pos
train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np.min, pos)
train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np.mean, pos)
train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np.median, pos)
train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np.max, pos)
train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np.std, pos)
# stats feat on normalized_pos
train["norm_pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] ,obs_len)
train.corr()[train.corr().index.str.contains('pos_of')]
Out[4]:
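In [ ]:
# Toy illustration of the position features above, standalone so it needs none of the utils:
# for every obs token we record its 1-based position inside obs if it also occurs in target,
# then the cell above takes min/mean/median/max/std of those positions (and divides by len(obs)
# for the normalized variants).
import numpy as np
def toy_positions(obs_tokens, target_tokens):
    pos = [j for j, w in enumerate(obs_tokens, start=1) if w in target_tokens]
    return pos if pos else [0]
obs = "how do i learn python fast".split(" ")
target = "what is the fastest way to learn python".split(" ")
p = toy_positions(obs, target)
print(p)                                     # [4, 5] -> "learn" and "python" are shared
print([np.min(p), np.mean(p), np.max(p)])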
In [5]:
# class Count_Ngram_BaseEstimator:
# def __init__(self, idx=-1, ngram=1, aggr="", threshold=config.STR_MATCH_THRESHOLD):
# self.idx = idx
# self.ngram = ngram
# self.threshold = threshold
# def _get_match_count(self, obs, target):
# cnt = 0
# if (len(obs) != 0) and (len(target) != 0):
# if self.idx == -1:
# for obs_word in obs:
# for word in target:
# if dist_utils._is_str_match(word, obs_word, self.threshold):
# cnt += 1
# else:
# for word in target:
# if dist_utils._is_str_match(word, obs[self.idx], self.threshold):
# cnt += 1
# return cnt
# def count_close_ngram(self, obs, target, token_pattern=" "):
# obs_tokens = nlp_utils._tokenize(obs, token_pattern)
# target_tokens = nlp_utils._tokenize(target, token_pattern)
# obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
# target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
# return self._get_match_count(obs_ngrams, target_ngrams)
def count_close_ngram(obs, target, idx=-1, ratio='count', ngram=123, aggr="", token_pattern=" ", threshold=config.STR_MATCH_THRESHOLD):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs = ngram_utils._ngrams(obs_tokens, ngram)
target = ngram_utils._ngrams(target_tokens, ngram)
cnt = 0
if (len(obs) != 0) and (len(target) != 0):
if idx == -1:
for obs_word in obs:
for word in target:
if dist_utils._is_str_match(word, obs_word, threshold):
cnt += 1
else:
for word in target:
if dist_utils._is_str_match(word, obs[idx], threshold):
cnt += 1
if ratio == 'count':
return cnt
else: return np_utils._try_divide(cnt, (len(obs)+len(target))/2.0)
# count1 = Count_Ngram_BaseEstimator(threshold=0.65)
# train['intersect_count'] = train[['question1','question2']].apply(lambda x:
# count1.count_close_ngram(x[0],x[1]), axis=1)
NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
for ratio in RATIO:
train['intersect_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x:
count_close_ngram(x[0],x[1],threshold=0.7,ngram=ngram,ratio=ratio), axis=1)
train.corr()[train.corr().index.str.contains('intersect_close')]
Out[5]:
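In [ ]:
# dist_utils._is_str_match is not shown; a plausible stand-in (an assumption, not the project's
# actual helper) is to threshold difflib's similarity ratio, so near-duplicates like
# "colour"/"color" count as a match while unrelated tokens do not.
import difflib
def toy_is_str_match(a, b, threshold=0.7):
    if threshold == 1.0:
        return a == b
    return difflib.SequenceMatcher(None, a, b).ratio() >= threshold
print(toy_is_str_match("colour", "color", 0.7))   # True
print(toy_is_str_match("python", "java", 0.7))    # False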
In [6]:
def cooccurrence_ngram(obs, target, ngram=1, threshold=0.8, ratio='ratio', token_pattern=" "):
"""
Get the count cooccurrence_ngram in obs and target
"""
obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
target_tokens = nlp_utils._tokenize(str(target), token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
s = 0.
for w1 in obs_ngrams:
for w2 in target_ngrams:
if dist_utils._is_str_match(w1, w2, threshold):
s += 1.
if ratio == 'count':
return s
else: return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams))
NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
for ratio in RATIO:
train['cooccurrence_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x:
cooccurrence_ngram(x[0],x[1],threshold=0.7,ngram=ngram,ratio=ratio), axis=1)
# train.corr().ix[-1*len(NGRAMS)*len(RATIO):]
train.corr()[train.corr().index.str.contains('cooccurrence')]
Out[6]:
In [7]:
def LongestMatchSize(obs_corpus, target_corpus):
return dist_utils._longest_match_size(obs_corpus, target_corpus)
def LongestMatchRatio(obs_corpus, target_corpus):
return dist_utils._longest_match_ratio(obs_corpus, target_corpus)
train['LongestMatchSize'] = train[['question1','question2']].apply(lambda x: LongestMatchSize(x[0],x[1]), axis=1)
train['LongestMatchRatio'] = train[['question1','question2']].apply(lambda x: LongestMatchRatio(x[0],x[1]), axis=1)
train.corr()[train.corr().index.str.contains('LongestMatch')]
Out[7]:
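In [ ]:
# _longest_match_size / _longest_match_ratio are presumably built on difflib's longest common
# block; this standalone sketch is an assumption about those helpers, not their actual code.
import difflib
def toy_longest_match_size(a, b):
    m = difflib.SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    return m.size
def toy_longest_match_ratio(a, b):
    return toy_longest_match_size(a, b) / float(min(len(a), len(b)))
a = "how do i learn python fast"
b = "what is the fastest way to learn python"
print(toy_longest_match_size(a, b))    # length of the longest shared substring (" learn python")
print(toy_longest_match_ratio(a, b))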
In [9]:
'''
QuestionQuality
IsInSpellCheckingList
'''
In [10]:
from collections import defaultdict
def _get_df_dict(target_corpus, ngram=1, token_pattern=" "):
d = defaultdict(lambda : 1)
for target in target_corpus:
target_tokens = nlp_utils._tokenize(target, token_pattern)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
for w in set(target_ngrams):
d[w] += 1
return d
def _get_idf(word, idf_dict, N):
return np.log((N - idf_dict[word] + 0.5)/(idf_dict[word] + 0.5))
def cooc_tfidf_ngram(obs, target, ngram=1, threshold=0.85, ratio="ratio", token_pattern=" "):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
val_list = []
for w1 in obs_ngrams:
s = 0.
for w2 in target_ngrams:
if dist_utils._is_str_match(w1, w2, threshold):
s += 1.
if ratio == "count":
val_list.append(s * _get_idf(w1, idf_dict, doc_num))
elif ratio == "ratio":
val_list.append(np_utils._try_divide(s, len(target_ngrams)) * _get_idf(w1, idf_dict, doc_num))
if len(val_list) == 0:
val_list = [config.MISSING_VALUE_NUMERIC]
return val_list
doc_num = train.shape[0]
for ngram in [1,2]:
idf_dict = _get_df_dict(np.concatenate((train['question1'].values , train['question2'].values)), ngram=ngram)
for ratio in ['count','ratio']:
for target_name in ['question1','question2']:
for obs_name in ['question1','question2']:
if target_name != obs_name:
pos = train[['question1','question2']].apply(lambda x: cooc_tfidf_ngram(
obs=x[obs_name],target=x[target_name], ngram=ngram,ratio=ratio), axis=1)
# train["cooc_tfidf_%s_n%s_%s_min" % (obs_name, ngram, ratio)] = map(np.min, pos)
train["cooc_tfidf_%s_n%s_%s_mean" % (obs_name, ngram, ratio)] = map(np.mean, pos)
train["cooc_tfidf_%s_n%s_%s_median" % (obs_name, ngram, ratio)] = map(np.median, pos)
train["cooc_tfidf_%s_n%s_%s_max" % (obs_name, ngram, ratio)] = map(np.max, pos)
train["cooc_tfidf_%s_n%s_%s_std" % (obs_name, ngram, ratio)] = map(np.std, pos)
train.corr()[train.corr().index.str.contains('cooc_tfidf')]
Out[10]:
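In [ ]:
# _get_idf above is the BM25-style idf log((N - df + 0.5) / (df + 0.5)), with document
# frequencies collected in a defaultdict that starts at 1. Standalone numeric sketch on a toy
# corpus: very common terms can get a negative weight with this variant, rare terms stay positive.
import numpy as np
from collections import defaultdict
toy_corpus = ["how do i learn python",
              "what is the best way to learn python",
              "is python hard to learn",
              "why is python so popular",
              "how do i cook rice"]
df = defaultdict(lambda: 1)
for doc in toy_corpus:
    for w in set(doc.split(" ")):
        df[w] += 1
N = len(toy_corpus)
for w in ["python", "rice"]:
    print("%s: %.3f" % (w, np.log((N - df[w] + 0.5) / (df[w] + 0.5))))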
In [13]:
BM25_K1=config.BM25_K1
BM25_B=config.BM25_B
def _get_avg_ngram_doc_len(target_corpus, ngram=1, token_pattern=" "):
lst = []
for target in target_corpus:
target_tokens = nlp_utils._tokenize(target, token_pattern)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
lst.append(len(target_ngrams))
return np.mean(lst)
def bm25(obs, target, ngram=1, threshold=0.85, ratio="ratio", token_pattern=" ", b=None, k1=None, doc_len=None, idf_dict=None):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
K = k1 * (1 - b + b * np_utils._try_divide(len(target_ngrams), doc_len))
val_list = []
for w1 in obs_ngrams:
s = 0.
for w2 in target_ngrams:
if dist_utils._is_str_match(w1, w2, threshold):
s += 1.
bm25 = s * _get_idf(w1, idf_dict, doc_num) * np_utils._try_divide(1 + k1, s + K)
val_list.append(bm25)
if len(val_list) == 0:
val_list = [config.MISSING_VALUE_NUMERIC]
return val_list
for ngram in [1,2]:
idf_dict = _get_df_dict(np.concatenate((train['question1'].values , train['question2'].values)), ngram=ngram)
# for ratio in ['count','ratio']:
for target_name in ['question1','question2']:
avg_target_len = _get_avg_ngram_doc_len(train[target_name].values, ngram=ngram)
for obs_name in ['question1','question2']:
if target_name != obs_name:
# pass the per-ngram idf_dict explicitly: a def-time default would keep using the dict built in the previous cell
pos = train[['question1','question2']].apply(lambda x: bm25(obs=x[obs_name],target=x[target_name],
ngram=ngram,threshold=0.85,b=BM25_B, k1=BM25_K1, doc_len=avg_target_len, idf_dict=idf_dict), axis=1)
# train["bm25_%s_n%s_min" % (obs_name, ngram)] = map(np.min, pos)
train["bm25_%s_n%s_mean" % (obs_name, ngram)] = map(np.mean, pos)
train["bm25_%s_n%s_median" % (obs_name, ngram)] = map(np.median, pos)
train["bm25_%s_n%s_max" % (obs_name, ngram)] = map(np.max, pos)
train["bm25_%s_n%s_std" % (obs_name, ngram)] = map(np.std, pos)
train.corr()[train.corr().index.str.contains('bm25_')].sort_values(by='is_duplicate',ascending=False)
Out[13]:
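In [ ]:
# The bm25() above scores each obs n-gram as idf * s * (1 + k1) / (s + K) with
# K = k1 * (1 - b + b * len(target) / avg_len), i.e. the classic Okapi BM25 term weight where
# s is the (fuzzy) term frequency in target. Quick numeric check with made-up values
# (k1 and b here are hypothetical stand-ins for config.BM25_K1 / config.BM25_B):
k1, b = 1.6, 0.75
idf, s = 1.2, 2.0                  # toy idf weight and term frequency
target_len, avg_len = 8.0, 10.0    # toy target length and average document length
K = k1 * (1 - b + b * target_len / avg_len)
print(idf * s * (1 + k1) / (s + K))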
In [57]:
# ------------------------ Vector Space Features -------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils, pkl_utils
from utils import logging_utils, time_utils
class VectorSpace:
## word based
def _init_word_bow(self, ngram, vocabulary=None):
bow = CountVectorizer(min_df=3,
max_df=0.75,
max_features=None,
# norm="l2",
strip_accents="unicode",
analyzer="word",
token_pattern=r"\w{1,}",
ngram_range=(1, ngram),
vocabulary=vocabulary)
return bow
## word based
def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
tfidf = TfidfVectorizer(min_df=3,
max_df=0.75,
max_features=None,
norm="l2",
strip_accents="unicode",
analyzer="word",
token_pattern=r"\w{1,}",
ngram_range=(1, ngram),
use_idf=1,
smooth_idf=1,
sublinear_tf=1,
# stop_words="english",
vocabulary=vocabulary)
return tfidf
## char based
def _init_char_tfidf(self, include_digit=False):
chars = list(string.ascii_lowercase)
if include_digit:
chars += list(string.digits)
vocabulary = dict(zip(chars, range(len(chars))))
tfidf = TfidfVectorizer(strip_accents="unicode",
analyzer="char",
norm=None,
token_pattern=r"\w{1,}",
ngram_range=(1, 1),
use_idf=0,
vocabulary=vocabulary)
return tfidf
## char based ngram
def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
tfidf = TfidfVectorizer(min_df=3,
max_df=0.75,
max_features=None,
norm="l2",
strip_accents="unicode",
analyzer="char",
token_pattern=r"\w{1,}",
ngram_range=(1, ngram),
use_idf=1,
smooth_idf=1,
sublinear_tf=1,
# stop_words="english",
vocabulary=vocabulary)
return tfidf
# ------------------------ LSA -------------------------------
class LSA_Ngram(VectorSpace):
def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
self.obs_corpus = obs_corpus
self.ngram = ngram
self.svd_dim = svd_dim
self.svd_n_iter = svd_n_iter
self.corpus = corpus
self.target_corpus = target_corpus
def word_transform(self):
tfidf = self._init_word_ngram_tfidf(self.ngram)
tfidf.fit(self.corpus)
X = tfidf.transform(self.obs_corpus)
# word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
svd = TruncatedSVD(n_components = self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
return svd.fit_transform(X)
def char_transform(self):
tfidf = self._init_char_ngram_tfidf(self.ngram)
tfidf.fit(self.corpus)
X = tfidf.transform(self.obs_corpus)
svd = TruncatedSVD(n_components = self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
return svd.fit_transform(X)
def pair_transform(self):
## tfidf
tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
tfidf.fit(self.corpus)
X_obs = tfidf.transform(self.obs_corpus)
tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
tfidf.fit(self.corpus)
X_target = tfidf.transform(self.target_corpus)
X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
## svd
svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
X_svd = svd.fit_transform(X_tfidf)
return X_svd
all_corpus = []
feats_corpus = ['question1','question2']
for f in feats_corpus:
train[f] = train[f].astype(str)
all_corpus += train[f].values.tolist()
for f in ['question1','question2']:
lsa_word = LSA_Ngram(all_corpus,train[f], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_word.word_transform()
break
for f in ['question1','question2']:
lsa_char = LSA_Ngram(all_corpus,train[f], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_char.char_transform()
break
lsa_pair = LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_pair.pair_transform()
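In [ ]:
# Minimal standalone LSA sketch mirroring LSA_Ngram.word_transform, with a toy corpus, default
# vectorizer settings and a tiny n_components instead of config.SVD_DIM, so it runs on its own:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
toy_corpus = ["how do i learn python",
              "what is the best way to learn python",
              "is python hard to learn",
              "how do i cook rice",
              "best rice cooker to buy"]
tfidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"\w{1,}")
X = tfidf.fit_transform(toy_corpus)
svd = TruncatedSVD(n_components=2, n_iter=5, random_state=2017)
print(svd.fit_transform(X))        # each row is a dense 2-d "topic" embedding of a question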
In [79]:
class TSNE_LSA_Ngram(LSA_Ngram):
def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
LSA_Ngram.__init__(self, corpus, obs_corpus, target_corpus, ngram, svd_dim, svd_n_iter)
def tsne_word_transform(self):
X_svd = self.word_transform()
X_scaled = StandardScaler().fit_transform(X_svd)
X_tsne = TSNE().fit_transform(X_scaled)
return X_tsne
def tsne_char_transform(self):
X_svd = self.char_transform()
X_scaled = StandardScaler().fit_transform(X_svd)
X_tsne = TSNE().fit_transform(X_scaled)
return X_tsne
def tsne_pair_transform(self):
X_svd = self.pair_transform()
X_scaled = StandardScaler().fit_transform(X_svd)
X_tsne = TSNE().fit_transform(X_scaled)
return X_tsne
for f in ['question1','question2']:
lsa_word = TSNE_LSA_Ngram(all_corpus,train[f], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_word.tsne_word_transform()
break
for f in ['question1','question2']:
lsa_char = TSNE_LSA_Ngram(all_corpus,train[f], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_char.tsne_char_transform()
break
lsa_pair = TSNE_LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=2, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_pair.tsne_pair_transform()
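In [ ]:
# Standalone sketch of the tsne_*_transform steps: scale the SVD output, then embed it in 2-D.
# A random matrix stands in for X_svd; perplexity is lowered because there are so few samples.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
rng = np.random.RandomState(0)
X_svd = rng.rand(30, 10)           # pretend: 30 questions x 10 SVD components
X_scaled = StandardScaler().fit_transform(X_svd)
X_tsne = TSNE(n_components=2, perplexity=5, random_state=0).fit_transform(X_scaled)
print(X_tsne.shape)                # (30, 2)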
In [21]:
class LSA_Ngram_Cooc(VectorSpace):
def __init__(self, obs_corpus, target_corpus,
obs_ngram=1, target_ngram=1, svd_dim=100, svd_n_iter=5):
self.obs_corpus = obs_corpus
self.target_corpus = target_corpus
self.obs_ngram = obs_ngram
self.target_ngram = target_ngram
self.svd_dim = svd_dim
self.svd_n_iter = svd_n_iter
self.obs_ngram_str = ngram_utils._ngram_str_map[self.obs_ngram]
self.target_ngram_str = ngram_utils._ngram_str_map[self.target_ngram]
def _get_cooc_terms(self, lst1, lst2, join_str):
out = [""] * len(lst1) * len(lst2)
cnt = 0
for item1 in lst1:
for item2 in lst2:
out[cnt] = item1 + join_str + item2
cnt += 1
res = " ".join(out)
return res
def transform(self):
obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
tfidf = self._init_word_ngram_tfidf(ngram=1)
X = tfidf.fit_transform(cooc_terms)
svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
return svd.fit_transform(X)
lsa_word = LSA_Ngram_Cooc(train['question1'],train['question2'], svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print lsa_word.transform()
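In [ ]:
# What _get_cooc_terms builds: every (obs n-gram, target n-gram) pair joined into one pseudo-token,
# which the class then feeds to a unigram tf-idf + SVD. Quick standalone peek:
def toy_cooc_terms(lst1, lst2, join_str="X"):
    return " ".join(w1 + join_str + w2 for w1 in lst1 for w2 in lst2)
print(toy_cooc_terms(["learn", "python"], ["learn", "java"]))
# -> learnXlearn learnXjava pythonXlearn pythonXjava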
In [25]:
# ------------------------ LSA Cosine Similarity -------------------------------
class LSA_Ngram_CosineSim(VectorSpace):
def __init__(self, obs_corpus, target_corpus, ngram=3, svd_dim=100, svd_n_iter=5):
self.obs_corpus = obs_corpus
self.target_corpus = target_corpus
self.ngram = ngram
self.svd_dim = svd_dim
self.svd_n_iter = svd_n_iter
def word_transform(self):
## get common vocabulary
tfidf = self._init_word_ngram_tfidf(self.ngram)
tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
vocabulary = tfidf.vocabulary_
## obs tfidf
tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
X_obs = tfidf.fit_transform(self.obs_corpus)
## target tfidf
tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
X_target = tfidf.fit_transform(self.target_corpus)
## svd
svd = TruncatedSVD(n_components = self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
svd.fit(scipy.sparse.vstack((X_obs, X_target)))
X_obs = svd.transform(X_obs)
X_target = svd.transform(X_target)
## cosine similarity
sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
sim = np.asarray(sim).squeeze()
return sim
def char_transform(self):
## get common vocabulary
tfidf = self._init_char_ngram_tfidf(self.ngram)
tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
vocabulary = tfidf.vocabulary_
## obs tfidf
tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
X_obs = tfidf.fit_transform(self.obs_corpus)
## target tfidf
tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
X_target = tfidf.fit_transform(self.target_corpus)
## svd
svd = TruncatedSVD(n_components=self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
svd.fit(scipy.sparse.vstack((X_obs, X_target)))
X_obs = svd.transform(X_obs)
X_target = svd.transform(X_target)
## cosine similarity
sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
sim = np.asarray(sim).squeeze()
return sim
cosinesim_word = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=3, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print cosinesim_word.word_transform()
cosinesim_char = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=5, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
print cosinesim_char.char_transform()
Out[25]:
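In [ ]:
# dist_utils._cosine_sim is not shown; the usual definition is dot(x, y) / (||x|| * ||y||),
# which sklearn's cosine_similarity also computes. Toy check (an assumption about the helper):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
x = np.array([1.0, 2.0, 0.0])
y = np.array([2.0, 4.0, 1.0])
print(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))
print(cosine_similarity(x.reshape(1, -1), y.reshape(1, -1))[0, 0])   # same value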
In [56]:
# ------------------- Char distribution -------------------
class CharDistribution(VectorSpace):
def __init__(self, obs_corpus, target_corpus):
self.obs_corpus = obs_corpus
self.target_corpus = target_corpus
def normalize(self, text):
# pat = re.compile("[a-z0-9]")
pat = re.compile("[a-z]")
group = pat.findall(text.lower())
if len(group) == 0:
res = " "
else:
res = "".join(group)
res += " "
return res
def preprocess(self, corpus):
return [self.normalize(text) for text in corpus]
def get_distribution(self):
## obs tfidf
tfidf = self._init_char_tfidf()
X_obs = tfidf.fit_transform(self.preprocess(self.obs_corpus)).todense()
X_obs = np.asarray(X_obs)
# apply laplacian smoothing
s = 1.
X_obs = (X_obs + s) / (np.sum(X_obs, axis=1)[:,None] + X_obs.shape[1]*s)
## target tfidf
tfidf = self._init_char_tfidf()
X_target = tfidf.fit_transform(self.preprocess(self.target_corpus)).todense()
X_target = np.asarray(X_target)
X_target = (X_target + s) / (np.sum(X_target, axis=1)[:,None] + X_target.shape[1]*s)
return X_obs, X_target
class CharDistribution_transform(CharDistribution):
def __init__(self, obs_corpus, target_corpus, const_A=1., const_B=1.):
CharDistribution.__init__(self, obs_corpus, target_corpus)
self.const_A = const_A
self.const_B = const_B
def ratio_transform(self):
X_obs, X_target = self.get_distribution()
ratio = (X_obs + self.const_A) / (X_target + self.const_B)
return ratio
def cosine_transform(self):
X_obs, X_target = self.get_distribution()
## cosine similarity
sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
sim = np.asarray(sim).squeeze()
return sim
def kl_transform(self):
X_obs, X_target = self.get_distribution()
kl = dist_utils._KL(X_obs, X_target)
return kl
# cosinesim_word = CharDistribution(train['question1'],train['question2'])
# print cosinesim_word.get_distribution()
cosinesim_word = CharDistribution_transform(train['question1'],train['question2'])
print cosinesim_word.ratio_transform()
print cosinesim_word.cosine_transform()
print cosinesim_word.kl_transform()
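In [ ]:
# The char-distribution features Laplace-smooth the letter counts, (count + s) / (total + 26 * s),
# and then compare the two distributions; scipy's entropy(p, q) gives the KL divergence
# sum(p * log(p / q)), presumably what dist_utils._KL computes row-wise (an assumption, the
# helper is not shown). Toy version with 4 pseudo-letters:
import numpy as np
from scipy.stats import entropy
counts_q1 = np.array([3.0, 0.0, 1.0, 2.0])
counts_q2 = np.array([2.0, 1.0, 0.0, 3.0])
s = 1.0
p = (counts_q1 + s) / (counts_q1.sum() + len(counts_q1) * s)
q = (counts_q2 + s) / (counts_q2.sum() + len(counts_q2) * s)
print(entropy(p, q))               # KL divergence of the two smoothed distributions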
In [17]:
from nltk.corpus import wordnet as wn
from utils import dist_utils, ngram_utils, nlp_utils, pkl_utils
from utils import logging_utils, time_utils
# tune the token pattern to get a better correlation with y_train
token_pattern = r"(?u)\b\w\w+\b"
# token_pattern = r"\w{1,}"
# token_pattern = r"\w+"
# token_pattern = r"[\w']+"
# token_pattern = " "
class WordNet_Similarity:
"""Double aggregation features"""
def __init__(self, metric="path"):
# super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
self.metric = metric
if self.metric == "path":
self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
elif self.metric == "lch":
self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
elif self.metric == "wup":
self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
else:
raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric))
def _maximum_similarity_for_two_synset_list(self, syn_list1, syn_list2):
s = 0.
if syn_list1 and syn_list2:
for syn1 in syn_list1:
for syn2 in syn_list2:
try:
_s = self.metric_func(syn1, syn2)
except:
_s = config.MISSING_VALUE_NUMERIC
if _s and _s > s:
s = _s
return s
def transform_one(self, obs, target):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_synset_list = [wn.synsets((obs_token).decode('utf-8')) for obs_token in obs_tokens]
target_synset_list = [wn.synsets((target_token).decode('utf-8')) for target_token in target_tokens]
val_list = []
for obs_synset in obs_synset_list:
_val_list = []
for target_synset in target_synset_list:
_s = self._maximum_similarity_for_two_synset_list(obs_synset, target_synset)
_val_list.append(_s)
if len(_val_list) == 0:
_val_list = [config.MISSING_VALUE_NUMERIC]
val_list.append( max(_val_list) )
if len(val_list) == 0:
val_list = [[config.MISSING_VALUE_NUMERIC]]
return np.mean(val_list)
t = train.sample(n=10)
wn_list = ["path", "lch", "wup"]
for wn_method in wn_list:
wn_sim = WordNet_Similarity(metric=wn_method)
t.apply(lambda x: wn_sim.transform_one(x['question1'],x['question2']), axis=1)
Out[17]:
Out[17]:
Out[17]:
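In [ ]:
# Quick WordNet sanity check for the metrics used above (requires the nltk 'wordnet' corpus,
# e.g. via nltk.download('wordnet')):
from nltk.corpus import wordnet as wn
syn_car = wn.synsets("car")[0]
syn_auto = wn.synsets("automobile")[0]
print(wn.path_similarity(syn_car, syn_auto))   # 1.0 when both resolve to the same synset
print(wn.wup_similarity(wn.synsets("dog")[0], wn.synsets("cat")[0]))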
In [4]:
from __future__ import division
import time, os, gc
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, f_classif
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction import text
from sklearn.metrics import log_loss
import cPickle
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
import config
stop_words = stopwords.words('english')
PATH = config.RAW_PATH
FEAT_PATH = config.FEAT_PATH
# train = pd.read_csv(PATH+'train.csv',nrows=config.TRAIN_SIZE)
# test = pd.read_csv(PATH+'test.csv',nrows=config.TEST_SIZE)
data = train  # .ix[:100]
def wmd(s1,s2):
s1 = str(s1).lower().split()
s2 = str(s2).lower().split()
stop_words = stopwords.words("english")
s1 = [w for w in s1 if w not in stop_words]
s2 = [w for w in s2 if w not in stop_words]
return model.wmdistance(s1,s2)
def norm_wmd(s1,s2):
s1 = str(s1).lower().split()
s2 = str(s2).lower().split()
stop_words = stopwords.words("english")
s1 = [w for w in s1 if w not in stop_words]
s2 = [w for w in s2 if w not in stop_words]
return norm_model.wmdistance(s1, s2)
def sent2vec(s):
words = str(s).lower().decode('utf-8')
words = word_tokenize(words)
words = [w for w in words if not w in stop_words]
words = [w for w in words if w.isalpha()]
M = []
for w in words:
try:
M.append(model[w])
except:
continue
M = np.array(M)
v = M.sum(axis=0)
return v / np.sqrt((v ** 2).sum())
# data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
# data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
# data['diff_len'] = data.len_q1 - data.len_q2
# data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
# data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
# data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
# data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
# data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
# data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
model = gensim.models.KeyedVectors.load_word2vec_format(PATH+'GoogleNews-vectors-negative300.bin.gz', binary=True)
# data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
# norm_model = gensim.models.KeyedVectors.load_word2vec_format(PATH+'GoogleNews-vectors-negative300.bin.gz', binary=True)
# norm_model.init_sims(replace=True)
# data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)
question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
for i, q in tqdm(enumerate(data.question1.values)):
question1_vectors[i, :] = sent2vec(q)
question2_vectors = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
question2_vectors[i, :] = sent2vec(q)
# data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
# np.nan_to_num(question2_vectors))]
# data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
# np.nan_to_num(question2_vectors))]
# data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
# np.nan_to_num(question2_vectors))]
# data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
# np.nan_to_num(question2_vectors))]
# data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
# np.nan_to_num(question2_vectors))]
# data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
# np.nan_to_num(question2_vectors))]
# data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
# np.nan_to_num(question2_vectors))]
# data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
# data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
# data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
# data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
# cPickle.dump(question1_vectors, open(FEAT_PATH+'q1_w2v.pkl', 'wb'), -1)
# cPickle.dump(question2_vectors, open(FEAT_PATH+'q2_w2v.pkl', 'wb'), -1)
# data.to_csv(FEAT_PATH+'ab_features.csv', index=False)
# data
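In [ ]:
# The commented-out block above feeds the two sentence vectors to scipy's pairwise distances;
# standalone toy with random 300-d vectors (no need for the GoogleNews model) showing the calls:
import numpy as np
from scipy.spatial.distance import cosine, cityblock, euclidean, braycurtis
rng = np.random.RandomState(0)
v1, v2 = rng.rand(300), rng.rand(300)     # stand-ins for sent2vec(question1) / sent2vec(question2)
print(cosine(v1, v2))        # cosine distance = 1 - cosine similarity
print(cityblock(v1, v2))     # L1 / Manhattan distance
print(euclidean(v1, v2))     # L2 distance
print(braycurtis(v1, v2))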
In [5]:
from utils import dist_utils, ngram_utils, nlp_utils
data['RMSE_distance'] = [dist_utils._rmse(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
np.nan_to_num(question2_vectors))]
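In [ ]:
# dist_utils._rmse is not shown; presumably it is sqrt(mean((x - y) ** 2)) over the two sentence
# vectors. Toy check (an assumption about the helper):
import numpy as np
x = np.array([1.0, 2.0, 3.0])
y = np.array([1.0, 0.0, 4.0])
print(np.sqrt(np.mean((x - y) ** 2)))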