In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from __future__ import division
import re, time, os, gc, datetime
import sys
import string
import numpy as np
import pandas as pd
import scipy
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils
from utils import logging_utils, time_utils, pkl_utils
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix,hstack
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy import sparse as ssp
from sklearn.datasets import dump_svmlight_file,load_svmlight_file
from sklearn.utils import resample,shuffle
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
import distance
from sklearn.model_selection import KFold
from nltk.stem.wordnet import WordNetLemmatizer
from multiprocessing import Pool, cpu_count
cpu_num = cpu_count()
num_partitions = cpu_num #number of partitions to split dataframe
num_cores = cpu_num #number of cores on your machine
print cpu_num
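# Helper: split the dataframe into num_partitions chunks, apply func to each chunk in a
# separate worker process, and concatenate the per-chunk results back into one dataframe.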
def parallelize_dataframe(df, func):
df_split = np.array_split(df, num_partitions)
pool = Pool(num_cores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
In [23]:
train_orig = pd.read_csv(config.RAW_PATH+'train.csv', header=0)#.sample(n=1000)
test_orig = pd.read_csv(config.RAW_PATH+'test.csv', header=0)#.sample(n=1000)
test_orig['is_duplicate'] = -1
train = pd.concat([train_orig[['question1', 'question2','is_duplicate']], \
test_orig[['question1', 'question2','is_duplicate']]], axis=0).reset_index(drop=True)
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)
print train.shape
del train_orig, test_orig
In [24]:
from text_clean import *
train.head()
train['question1'] = train['question1'].apply(lambda x:substitute_thousands(x))
train['question2'] = train['question2'].apply(lambda x:substitute_thousands(x))
train['question1'] = train['question1'].apply(lambda x:text_to_wordlist(x))
train['question2'] = train['question2'].apply(lambda x:text_to_wordlist(x))
# train = abbr_clean(train)
train.head()
train['question1'] = train['question1'].astype(str).apply(lambda x:stem_str(x.lower(),
lemmatize=True, stem=False, stops=stops_eng))
train['question2'] = train['question2'].astype(str).apply(lambda x:stem_str(x.lower(),
lemmatize=True, stem=False, stops=stops_eng))
train['question1'] = train['question1'].astype(str)
train['question2'] = train['question2'].astype(str)
train.head()
Out[24]:
Out[24]:
Out[24]:
In [4]:
def jaccard_ngram(obs, target, ngram=1, token_pattern=" "):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
return dist_utils._jaccard_coef(obs_ngrams, target_ngrams)
def dicedistance_ngram(obs, target, ngram=1, token_pattern=" "):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
return dist_utils._dice_dist(obs_ngrams, target_ngrams)
def compression_dist(obs, target):
return dist_utils._compression_dist(obs, target)
def edit_dist(obs, target):
return dist_utils._edit_dist(obs, target)
def compression_dist_ngram(obs, target, ngram=2, token_pattern=" "):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
val_list = []
for w1 in obs_ngrams:
_val_list = []
for w2 in target_ngrams:
s = dist_utils._compression_dist(w1, w2)
_val_list.append(s)
if len(_val_list) == 0:
_val_list = [-1]
val_list.append( max(_val_list) )
if len(val_list) == 0:
val_list = [-1]
return min(val_list)
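# edit_dist_ngram aggregates twice: agg[0] reduces the edit distances from one obs n-gram to
# all target n-grams, then agg[1] reduces those per-obs values across all obs n-grams.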
def edit_dist_ngram(obs, target, ngram=2, token_pattern=" ", agg=[np.min, np.max]):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
val_list = []
for w1 in obs_ngrams:
_val_list = []
for w2 in target_ngrams:
s = dist_utils._edit_dist(w1, w2)
_val_list.append(s)
if len(_val_list) == 0:
_val_list = [-1]
val_list.append( agg[0](_val_list) )
if len(val_list) == 0:
val_list = [-1]
return float(agg[1](val_list))
In [5]:
np_dict = { 'mean':np.mean, 'min':np.min, 'max':np.max, 'median':np.median, 'std':np.std }
def multiply_columns(train):
for NGRAMS in [1,2,3]:
train['jaccard_n%s'%NGRAMS] = train.apply(lambda x: jaccard_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)
train['dicedistance_n%s'%NGRAMS] = train.apply(lambda x: dicedistance_ngram(x['question1'],x['question2'],ngram=NGRAMS), axis=1)
train['compression_dist'] = train.apply(lambda x: compression_dist(x['question1'],x['question2']), axis=1)
train['edit_dist'] = train.apply(lambda x: edit_dist(x['question1'],x['question2']), axis=1)
return train
train = parallelize_dataframe(train, multiply_columns)
print train.shape
def multiply_columns(train):
for AGG_NGRAMS in [1,2,3]:
for agg1 in ["mean", "max", "min", "median"]:
for agg2 in np_dict.keys():
AGG_BY = agg1 + '_' + agg2
AGG_FUNC = [np_dict[agg1],np_dict[agg2]]
# train['compression_dist_agg_n%s'%AGG_NGRAMS] = train.apply(lambda x: compression_dist_ngram(x['question1'],x['question2'],ngram=AGG_NGRAMS), axis=1)
train['edit_dist_agg_n%s_%s'%(AGG_NGRAMS,AGG_BY)] = train.apply(lambda x:
edit_dist_ngram(x['question1'],x['question2'], ngram=AGG_NGRAMS, agg=AGG_FUNC), axis=1)
return train
train = parallelize_dataframe(train, multiply_columns)
print train.shape
print datetime.datetime.now()
In [14]:
train.corr()
Out[14]:
In [6]:
def get_position_list(obs, target, ngram=1, token_pattern=" "):
"""
Get the list of positions of obs in target
"""
obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
target_tokens = nlp_utils._tokenize(str(target), token_pattern)
obs = ngram_utils._ngrams(obs_tokens, ngram)
target = ngram_utils._ngrams(target_tokens, ngram)
pos_of_obs_in_target = [0]
if len(obs) != 0:
pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
if len(pos_of_obs_in_target) == 0:
pos_of_obs_in_target = [0]
return pos_of_obs_in_target, len(obs)
def count_close_ngram(obs, target, idx=-1, ratio='count', ngram=123, aggr="", token_pattern=" ", threshold=config.STR_MATCH_THRESHOLD):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs = ngram_utils._ngrams(obs_tokens, ngram)
target = ngram_utils._ngrams(target_tokens, ngram)
cnt = 0
if (len(obs) != 0) and (len(target) != 0):
if idx == -1:
for obs_word in obs:
for word in target:
if dist_utils._is_str_match(word, obs_word, threshold):
cnt += 1
else:
for word in target:
if dist_utils._is_str_match(word, obs[idx], threshold):
cnt += 1
if ratio == 'count':
return cnt
else: return np_utils._try_divide(cnt, (len(obs)+len(target))/2.0)
def cooccurrence_ngram(obs, target, ngram=1, threshold=0.8, ratio='ratio', token_pattern=" "):
"""
Count the co-occurring n-grams between obs and target (or their ratio, normalized by the product of n-gram counts)
"""
obs_tokens = nlp_utils._tokenize(str(obs), token_pattern)
target_tokens = nlp_utils._tokenize(str(target), token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
s = 0.
for w1 in obs_ngrams:
for w2 in target_ngrams:
if dist_utils._is_str_match(w1, w2, threshold):
s += 1.
if ratio == 'count':
return s
else: return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams))
def LongestMatchSize(obs_corpus, target_corpus):
return dist_utils._longest_match_size(obs_corpus, target_corpus)
def LongestMatchRatio(obs_corpus, target_corpus):
return dist_utils._longest_match_ratio(obs_corpus, target_corpus)
from collections import defaultdict
def _get_df_dict(target_corpus, ngram=1, token_pattern=" "):
d = defaultdict(lambda : 1)
for target in target_corpus:
target_tokens = nlp_utils._tokenize(target, token_pattern)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
for w in set(target_ngrams):
d[w] += 1
return d
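# _get_idf computes a BM25-style IDF: log((N - df(w) + 0.5) / (df(w) + 0.5)), where N is the
# number of documents and df(w) the document frequency; the defaultdict above starts every
# count at 1, so unseen n-grams never hit a zero denominator.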
def _get_idf(word, idf_dict, N):
return np.log((N - idf_dict[word] + 0.5)/(idf_dict[word] + 0.5))
def cooc_tfidf_ngram(obs, target, idf_dict=None, ngram=1, threshold=0.8, ratio="ratio", token_pattern=" ", AGG_FUNC=None):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
val_list = []
for w1 in obs_ngrams:
s = 0.
for w2 in target_ngrams:
if dist_utils._is_str_match(w1, w2, threshold):
s += 1.
if ratio == "count":
val_list.append(s * _get_idf(w1, idf_dict, doc_num))
elif ratio == "ratio":
val_list.append(np_utils._try_divide(s, len(target_ngrams)) * _get_idf(w1, idf_dict, doc_num))
if len(val_list) == 0:
val_list = [config.MISSING_VALUE_NUMERIC]
return AGG_FUNC(val_list)
def _get_avg_ngram_doc_len(target_corpus, ngram=1, token_pattern=" "):
lst = []
for target in target_corpus:
target_tokens = nlp_utils._tokenize(target, token_pattern)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
lst.append(len(target_ngrams))
return np.mean(lst)
def bm25(obs, target, ngram=1, threshold=0.8, ratio="ratio", token_pattern=" ", b=None, k1=None, doc_len=None, idf_dict=None, AGG_FUNC=None):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_ngrams = ngram_utils._ngrams(obs_tokens, ngram)
target_ngrams = ngram_utils._ngrams(target_tokens, ngram)
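# BM25 length normalization: K = k1 * (1 - b + b * len(target_ngrams) / doc_len), where doc_len
# is the average n-gram document length, so longer-than-average targets are penalized.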
K = k1 * (1 - b + b * np_utils._try_divide(len(target_ngrams), doc_len))
val_list = []
for w1 in obs_ngrams:
s = 0.
for w2 in target_ngrams:
if dist_utils._is_str_match(w1, w2, threshold):
s += 1.
bm25 = s * _get_idf(w1, idf_dict, doc_num) * np_utils._try_divide(1 + k1, s + K)
val_list.append(bm25)
if len(val_list) == 0:
val_list = [config.MISSING_VALUE_NUMERIC]
return AGG_FUNC(val_list)
In [7]:
np_dict = { 'mean':np.mean, 'min':np.min, 'max':np.max, 'median':np.median, 'std':np.std }
doc_num = train.shape[0]
df_all=train[['question1','question2']].copy()
BM25_K1=config.BM25_K1
BM25_B=config.BM25_B
def multiply_columns(train):
for ngram in [1,2]:
for target_name in ['question1','question2']:
for obs_name in ['question1','question2']:
if target_name != obs_name:
position = train[['question1','question2']].apply(lambda x: get_position_list(obs=x[obs_name],target=x[target_name],ngram=ngram), axis=1)
pos = [i[0] for i in position]
obs_len = [i[1] for i in position]
## stats feat on pos
train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np.min, pos)
train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np.mean, pos)
train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np.median, pos)
train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np.max, pos)
train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np.std, pos)
# stats feat on normalized_pos
train["norm_pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_min" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_mean" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_median" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_max" % (obs_name, ngram, target_name)], obs_len)
train["norm_pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] = map(np_utils._try_divide, train["pos_of_%s_n%s_in_%s_std" % (obs_name, ngram, target_name)] ,obs_len)
NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
for ratio in RATIO:
train['intersect_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x:
count_close_ngram(x[0],x[1],threshold=0.8,ngram=ngram,ratio=ratio), axis=1)
NGRAMS=[1,2,3]
RATIO=['count','ratio']
for ngram in NGRAMS:
for ratio in RATIO:
train['cooccurrence_close_%s_n%s'%(ratio,ngram)] = train[['question1','question2']].apply(lambda x:
cooccurrence_ngram(x[0],x[1],threshold=0.8,ngram=ngram,ratio=ratio), axis=1)
train['LongestMatchSize'] = train[['question1','question2']].apply(lambda x: LongestMatchSize(x[0],x[1]), axis=1)
train['LongestMatchRatio'] = train[['question1','question2']].apply(lambda x: LongestMatchRatio(x[0],x[1]), axis=1)
return train
def multiply_cooc(train):
for agg in np_dict.keys():
AGG_FUNC = np_dict[agg]
train["cooc_tfidf_%s_n%s_%s_%s" % (obs_name, ngram, ratio,agg)] = train[['question1','question2']].apply(lambda x:
cooc_tfidf_ngram( obs=x[obs_name],target=x[target_name], threshold=0.8,
idf_dict=idf_dict, ngram=ngram, ratio=ratio, AGG_FUNC=AGG_FUNC), axis=1)
return train
def multiply_bm25(train):
for agg in np_dict.keys():
AGG_FUNC = np_dict[agg]
train["bm25_tfidf_%s_n%s_%s_%s" % (obs_name, ngram, ratio,agg)] = train[['question1','question2']].apply(lambda x:
bm25(obs=x[obs_name],target=x[target_name], ngram=ngram,threshold=0.8,b=BM25_B,
k1=BM25_K1, idf_dict=idf_dict, doc_len=avg_target_len, AGG_FUNC=AGG_FUNC), axis=1)
# train["bm25_%s_n%s_min" % (obs_name, ngram)] = map(np.min, pos)
# train["bm25_%s_n%s_mean" % (obs_name, ngram)] = map(np.mean, pos)
# train["bm25_%s_n%s_median" % (obs_name, ngram)] = map(np.median, pos)
# train["bm25_%s_n%s_max" % (obs_name, ngram)] = map(np.max, pos)
# train["bm25_%s_n%s_std" % (obs_name, ngram)] = map(np.std, pos)
return train
for ngram in [1,2,3]:
idf_dict = _get_df_dict(np.concatenate((df_all['question1'].values , df_all['question2'].values)), ngram=ngram)
for ratio in ['count','ratio']:
for target_name in ['question1','question2']:
for obs_name in ['question1','question2']:
if target_name != obs_name:
train = parallelize_dataframe(train, multiply_cooc)
print "cooc_tfidf_%s_n%s_%s" % (obs_name, ngram, ratio)
print 'cooc: {}'.format(train.shape)
for ngram in [1,2,3]:
idf_dict = _get_df_dict(np.concatenate((df_all['question1'].values , df_all['question2'].values)), ngram=ngram)
# for ratio in ['count','ratio']:
for target_name in ['question1','question2']:
avg_target_len = _get_avg_ngram_doc_len(df_all[target_name].values, ngram=ngram)
for obs_name in ['question1','question2']:
if target_name != obs_name:
train = parallelize_dataframe(train, multiply_bm25)
print "bm25_tfidf_%s_n%s_%s" % (obs_name, ngram, ratio)
print 'bm25: {}'.format(train.shape)
train = parallelize_dataframe(train, multiply_columns)
print 'position, intersection close, longest match'
print train.shape
print datetime.datetime.now()
del df_all, idf_dict
gc.collect()
Out[7]:
In [17]:
train[train['is_duplicate']>=0].corr()
Out[17]:
In [8]:
train.to_csv(config.FEAT_PATH+'feature_base_lemmer.csv',index=False)
In [9]:
train = train[['is_duplicate','question1', 'question2']]
gc.collect()
Out[9]:
In [2]:
train = pd.read_csv(config.FEAT_PATH+'feature_vect_lemmer.csv')
In [3]:
# ------------------------ Vector Space Features -------------------------------
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import config
from utils import dist_utils, ngram_utils, nlp_utils, np_utils, pkl_utils
from utils import logging_utils, time_utils
class VectorSpace:
## word based
def _init_word_bow(self, ngram, vocabulary=None):
bow = CountVectorizer(min_df=config.MIN_DF,
max_df=config.MAX_DF,
max_features=None,
# norm="l2",
strip_accents="unicode",
analyzer="word",
token_pattern=r"\w{1,}",
ngram_range=(1, ngram),
vocabulary=vocabulary)
return bow
## word based
def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
tfidf = TfidfVectorizer(min_df=config.MIN_DF,
max_df=config.MAX_DF,
max_features=None,
norm="l2",
strip_accents="unicode",
analyzer="word",
token_pattern=r"\w{1,}",
ngram_range=(1, ngram),
use_idf=1,
smooth_idf=1,
sublinear_tf=1,
# stop_words="english",
vocabulary=vocabulary)
return tfidf
## char based
def _init_char_tfidf(self, include_digit=False):
chars = list(string.ascii_lowercase)
if include_digit:
chars += list(string.digits)
vocabulary = dict(zip(chars, range(len(chars))))
tfidf = TfidfVectorizer(strip_accents="unicode",
analyzer="char",
norm=None,
token_pattern=r"\w{1,}",
ngram_range=(1, 1),
use_idf=0,
vocabulary=vocabulary)
return tfidf
## char based ngram
def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
tfidf = TfidfVectorizer(min_df=config.MIN_DF,
max_df=config.MAX_DF,
max_features=None,
norm="l2",
strip_accents="unicode",
analyzer="char",
token_pattern=r"\w{1,}",
ngram_range=(1, ngram),
use_idf=1,
smooth_idf=1,
sublinear_tf=1,
# stop_words="english",
vocabulary=vocabulary)
return tfidf
# ------------------------ LSA -------------------------------
class LSA_Ngram(VectorSpace):
def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
self.obs_corpus = obs_corpus
self.ngram = ngram
self.svd_dim = svd_dim
self.svd_n_iter = svd_n_iter
self.corpus = corpus
self.target_corpus = target_corpus
def word_transform(self):
tfidf = self._init_word_ngram_tfidf(self.ngram)
tfidf.fit(self.corpus)
X = tfidf.transform(self.obs_corpus)
# word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
svd = TruncatedSVD(n_components = self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
return svd.fit_transform(X)
def char_transform(self):
tfidf = self._init_char_ngram_tfidf(self.ngram)
tfidf.fit(self.corpus)
X = tfidf.transform(self.obs_corpus)
svd = TruncatedSVD(n_components = self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
return svd.fit_transform(X)
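# pair_transform stacks the obs and target tfidf matrices column-wise (hstack) and fits a single
# SVD, so both questions of a pair are projected into one shared latent space.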
def pair_transform(self):
## tfidf
tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
tfidf.fit(self.corpus)
X_obs = tfidf.transform(self.obs_corpus)
tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
tfidf.fit(self.corpus)
X_target = tfidf.transform(self.target_corpus)
X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
## svd
svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
X_svd = svd.fit_transform(X_tfidf)
return X_svd
all_corpus = []
feats_corpus = ['question1','question2']
for f in feats_corpus:
train[f] = train[f].astype(str)
all_corpus += train[f].values.tolist()
print len(all_corpus)
In [ ]:
w_ngram = 3
c_ngram = 10
svd_dim = config.SVD_DIM
lsa_word = LSA_Ngram(all_corpus,train['question1'], ngram=w_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_word_q1 = lsa_word.word_transform()
print 'lsa_word_q1'
lsa_word = LSA_Ngram(all_corpus,train['question2'], ngram=w_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_word_q2 = lsa_word.word_transform()
print 'lsa_word_q2'
lsa_pair = LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=w_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_pair = lsa_pair.pair_transform()
print 'lsa_pair'
lsa_char = LSA_Ngram(all_corpus,train['question1'], ngram=c_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_char_q1 = lsa_char.char_transform()
print 'lsa_char_q1'
lsa_char = LSA_Ngram(all_corpus,train['question2'], ngram=c_ngram, svd_dim=config.SVD_DIM, svd_n_iter=config.SVD_N_ITER)
lsa_char_q2 = lsa_char.char_transform()
print 'lsa_char_q2'
lsa_w_q1_df = pd.DataFrame(lsa_word_q1,columns=['lsa_wn%s_q1_%s'%(str(w_ngram),i) for i in range(svd_dim)])
lsa_w_q2_df = pd.DataFrame(lsa_word_q2,columns=['lsa_wn%s_q2_%s'%(str(w_ngram),i) for i in range(svd_dim)])
lsa_c_q1_df = pd.DataFrame(lsa_char_q1,columns=['lsa_cn%s_q1_%s'%(str(c_ngram),i) for i in range(svd_dim)])
lsa_c_q2_df = pd.DataFrame(lsa_char_q2,columns=['lsa_cn%s_q2_%s'%(str(c_ngram),i) for i in range(svd_dim)])
lsa_pair_df = pd.DataFrame(lsa_pair,columns=['lsa_pn%s_%s'%(str(w_ngram),i) for i in range(svd_dim)])
train = pd.concat([train, lsa_w_q1_df, lsa_w_q2_df, lsa_c_q1_df, lsa_c_q2_df, lsa_pair_df], axis=1)
print 'lsa shape: {}'.format(train.shape)
del lsa_w_q1_df,lsa_w_q2_df,lsa_c_q1_df,lsa_c_q2_df,lsa_pair_df
gc.collect()
In [19]:
train[train['is_duplicate']!=-1].corr()
Out[19]:
In [27]:
class TSNE_LSA_Ngram(LSA_Ngram):
def __init__(self, corpus, obs_corpus, target_corpus=None, ngram=3, svd_dim=100, svd_n_iter=5):
LSA_Ngram.__init__(self, corpus, obs_corpus, target_corpus, ngram, svd_dim, svd_n_iter)
def tsne_word_transform(self):
X_svd = self.word_transform()
X_scaled = StandardScaler().fit_transform(X_svd)
X_tsne = TSNE(init='pca').fit_transform(X_scaled)
return X_tsne
def tsne_char_transform(self):
X_svd = self.char_transform()
X_scaled = StandardScaler().fit_transform(X_svd)
X_tsne = TSNE().fit_transform(X_scaled)
return X_tsne
def tsne_pair_transform(self):
X_svd = self.pair_transform()
X_scaled = StandardScaler().fit_transform(X_svd)
X_tsne = TSNE().fit_transform(X_scaled)
return X_tsne
def nmf_word_transform(self):
X_svd = self.word_transform()
X_scaled = MinMaxScaler().fit_transform(X_svd)
X_nmf = NMF(n_components=2, init='random', random_state=np.random.randint(1, 10000)).fit_transform(X_scaled)
return X_nmf
def nmf_char_transform(self):
X_svd = self.char_transform()
X_scaled = MinMaxScaler().fit_transform(X_svd)
X_nmf = NMF(n_components=2, init='random', random_state=np.random.randint(1, 10000)).fit_transform(X_scaled)
return X_nmf
def nmf_pair_transform(self):
X_svd = self.pair_transform()
X_scaled = MinMaxScaler().fit_transform(X_svd)
X_nmf = NMF(n_components=2, init='random', random_state=np.random.randint(1, 10000)).fit_transform(X_scaled)
return X_nmf
# svd_dim = config.SVD_DIM
# svd_dim = 10
############# tsne ###################
# for NGRAM in [2,3]:
# for COL in ['question1','question2']:
# model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
# lsa_word = model.tsne_word_transform()
# lsa_word = pd.DataFrame(lsa_word,columns=['tsne_w_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
# train = pd.concat([train, lsa_word], axis=1)
# print 'tsne_word_transform: {}'.format(train.shape)
# for NGRAM in [5,10]:
# for COL in ['question1','question2']:
# model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim,
# svd_n_iter=config.SVD_N_ITER)
# lsa_word = model.tsne_char_transform()
# lsa_word = pd.DataFrame(lsa_word,columns=['tsne_c_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
# train = pd.concat([train, lsa_word], axis=1)
# print 'tsne_char_transform: {}'.format(train.shape)
# for NGRAM in [1,2]:
# model = TSNE_LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=NGRAM,
# svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
# tsne_pair = model.tsne_pair_transform()
# tsne_pair = pd.DataFrame(tsne_pair,columns=['tsne_p_n%s_%s'%(str(NGRAM),i) for i in range(2)])
# train = pd.concat([train, tsne_pair], axis=1)
# print 'tsne_pair_transform: {}'.format(train.shape)
############# nmf ###################
for NGRAM in [2,3]:
for COL in ['question1','question2']:
model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
lsa_word = model.nmf_word_transform()
lsa_word = pd.DataFrame(lsa_word,columns=['nmf_w_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
train = pd.concat([train, lsa_word], axis=1)
print 'nmf_word_transform: {}'.format(train.shape)
for NGRAM in [5,10]:
for COL in ['question1','question2']:
model = TSNE_LSA_Ngram(all_corpus, train[COL], ngram=NGRAM, svd_dim=svd_dim,
svd_n_iter=config.SVD_N_ITER)
lsa_word = model.nmf_char_transform()
lsa_word = pd.DataFrame(lsa_word,columns=['nmf_c_%s_n%s_%s'%(COL,str(NGRAM),i) for i in range(2)])
train = pd.concat([train, lsa_word], axis=1)
print 'nmf_char_transform: {}'.format(train.shape)
for NGRAM in [1,2,3]:
model = TSNE_LSA_Ngram(all_corpus,train['question1'], target_corpus=train['question2'], ngram=NGRAM,
svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
tsne_pair = model.nmf_pair_transform()
tsne_pair = pd.DataFrame(tsne_pair,columns=['nmf_p_n%s_%s'%(str(NGRAM),i) for i in range(2)])
train = pd.concat([train, tsne_pair], axis=1)
print 'nmf_pair_transform: {}'.format(train.shape)
del lsa_word, tsne_pair
gc.collect()
Out[27]:
In [7]:
train[train['is_duplicate']!=-1].corr()
Out[7]:
In [28]:
class LSA_Ngram_Cooc(VectorSpace):
def __init__(self, obs_corpus, target_corpus,
obs_ngram=1, target_ngram=1, svd_dim=100, svd_n_iter=5):
self.obs_corpus = obs_corpus
self.target_corpus = target_corpus
self.obs_ngram = obs_ngram
self.target_ngram = target_ngram
self.svd_dim = svd_dim
self.svd_n_iter = svd_n_iter
self.obs_ngram_str = ngram_utils._ngram_str_map[self.obs_ngram]
self.target_ngram_str = ngram_utils._ngram_str_map[self.target_ngram]
def _get_cooc_terms(self, lst1, lst2, join_str):
out = [""] * len(lst1) * len(lst2)
cnt = 0
for item1 in lst1:
for item2 in lst2:
out[cnt] = item1 + join_str + item2
cnt += 1
res = " ".join(out)
return res
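# transform builds co-occurrence "terms" from the Cartesian product of obs and target n-grams
# (joined with "X"), then runs tfidf + SVD over those pair terms.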
def transform(self):
obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
tfidf = self._init_word_ngram_tfidf(ngram=1)
X = tfidf.fit_transform(cooc_terms)
svd = TruncatedSVD(n_components=self.svd_dim, n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
return svd.fit_transform(X)
NGRAMS=[1,2]
for ngram1 in NGRAMS:
for ngram2 in NGRAMS:
lsa_word = LSA_Ngram_Cooc(train['question1'],train['question2'], svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER,
obs_ngram=ngram1, target_ngram=ngram2)
lsa_cooc = lsa_word.transform()
lsa_cooc = pd.DataFrame(lsa_cooc,columns=['svd_cooc_on%s_tn%s_%s'%(str(ngram1),str(ngram2),i) for i in range(svd_dim)])
train = pd.concat([train, lsa_cooc], axis=1)
print 'svd_word_cooc_transform: {}'.format(train.shape)
del lsa_cooc, lsa_word
gc.collect()
Out[28]:
In [ ]:
# ------------------------ LSA Cosine Similarity -------------------------------
class LSA_Ngram_CosineSim(VectorSpace):
def __init__(self, obs_corpus, target_corpus, ngram=3, svd_dim=100, svd_n_iter=5):
self.obs_corpus = obs_corpus
self.target_corpus = target_corpus
self.ngram = ngram
self.svd_dim = svd_dim
self.svd_n_iter = svd_n_iter
def word_transform(self):
## get common vocabulary
tfidf = self._init_word_ngram_tfidf(self.ngram)
tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
vocabulary = tfidf.vocabulary_
## obs tfidf
tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
X_obs = tfidf.fit_transform(self.obs_corpus)
## target tfidf
tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
X_target = tfidf.fit_transform(self.target_corpus)
## svd
svd = TruncatedSVD(n_components = self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
svd.fit(scipy.sparse.vstack((X_obs, X_target)))
X_obs = svd.transform(X_obs)
X_target = svd.transform(X_target)
## cosine similarity
sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
sim = np.asarray(sim).squeeze()
return sim
def char_transform(self):
## get common vocabulary
tfidf = self._init_char_ngram_tfidf(self.ngram)
tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
vocabulary = tfidf.vocabulary_
## obs tfidf
tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
X_obs = tfidf.fit_transform(self.obs_corpus)
## target tfidf
tfidf = self._init_char_ngram_tfidf(self.ngram, vocabulary)
X_target = tfidf.fit_transform(self.target_corpus)
## svd
svd = TruncatedSVD(n_components=self.svd_dim,
n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
svd.fit(scipy.sparse.vstack((X_obs, X_target)))
X_obs = svd.transform(X_obs)
X_target = svd.transform(X_target)
## cosine similarity
sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
sim = np.asarray(sim).squeeze()
return sim
svd_dim = config.SVD_DIM
# for NGRAM in [2,3]:
# cosinesim_word = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
# cosinesim_word = cosinesim_word.word_transform()
# train['svd_cosine_w_n%s_%s'%(str(NGRAM),config.SVD_DIM)] = cosinesim_word
# print 'cosinesim_word_svd: {}'.format(train.shape)
for NGRAM in [10]:
cosinesim_char = LSA_Ngram_CosineSim(train['question1'],train['question2'], ngram=NGRAM, svd_dim=svd_dim, svd_n_iter=config.SVD_N_ITER)
cosinesim_char = cosinesim_char.char_transform()
train['svd_cosine_c_n%s_%s'%(str(NGRAM),config.SVD_DIM)] = cosinesim_char
print 'cosinesim_char_svd: {}'.format(train.shape)
In [4]:
# ------------------- Char distribution -------------------
class CharDistribution(VectorSpace):
def __init__(self, obs_corpus, target_corpus):
self.obs_corpus = obs_corpus
self.target_corpus = target_corpus
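# normalize keeps only the lowercase letters of a text (plus a trailing space, so no document is
# empty) before the char count distributions are computed in get_distribution below.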
def normalize(self, text):
# pat = re.compile("[a-z0-9]")
pat = re.compile("[a-z]")
group = pat.findall(text.lower())
if not group:
res = " "
else:
res = "".join(group)
res += " "
return res
def preprocess(self, corpus):
return [self.normalize(text) for text in corpus]
def get_distribution(self):
## obs tfidf
tfidf = self._init_char_tfidf()
X_obs = tfidf.fit_transform(self.preprocess(self.obs_corpus)).todense()
X_obs = np.asarray(X_obs)
# apply laplacian smoothing
s = 1.
X_obs = (X_obs + s) / (np.sum(X_obs, axis=1)[:,None] + X_obs.shape[1]*s)
## target tfidf
tfidf = self._init_char_tfidf()
X_target = tfidf.fit_transform(self.preprocess(self.target_corpus)).todense()
X_target = np.asarray(X_target)
X_target = (X_target + s) / (np.sum(X_target, axis=1)[:,None] + X_target.shape[1]*s)
return X_obs, X_target
class CharDistribution_transform(CharDistribution):
def __init__(self, obs_corpus, target_corpus, const_A=1., const_B=1.):
CharDistribution.__init__(self, obs_corpus, target_corpus)
self.const_A = const_A
self.const_B = const_B
self.X_obs, self.X_target = self.get_distribution()
def ratio_transform(self):
# X_obs, X_target = self.get_distribution()
ratio = (self.X_obs + self.const_A) / (self.X_target + self.const_B)
return ratio
def cosine_transform(self):
# X_obs, X_target = self.get_distribution()
## cosine similarity
sim = list(map(dist_utils._cosine_sim, self.X_obs, self.X_target))
sim = np.asarray(sim).squeeze()
return sim
def kl_transform(self):
# X_obs, X_target = self.get_distribution()
kl = dist_utils._KL(self.X_obs, self.X_target)
return kl
# cosinesim_word = CharDistribution(train['question1'],train['question2'])
# print cosinesim_word.get_distribution()
cosinesim_word = CharDistribution_transform(train['question1'],train['question2'])
cosinesim_word_ratio = cosinesim_word.ratio_transform()
cosinesim_word_ratio_shape = cosinesim_word_ratio.shape[1]
cosinesim_word_ratio = pd.DataFrame(cosinesim_word_ratio, columns=['cosinesim_word_ratio_%s'%(i) for i in range(cosinesim_word_ratio_shape)])
# train = pd.concat([train, cosinesim_word_df], axis=1)
train['char_distribution_ratio_mean'] = cosinesim_word_ratio.mean(axis=1)
train['char_distribution_ratio_min'] = cosinesim_word_ratio.min(axis=1)
train['char_distribution_ratio_max'] = cosinesim_word_ratio.max(axis=1)
train['char_distribution_ratio_std'] = cosinesim_word_ratio.std(axis=1)
train['char_distribution_cosine'] = cosinesim_word.cosine_transform()
train['char_distribution_kl'] = cosinesim_word.kl_transform()
print 'char_distribution: {}'.format(train.shape)
In [ ]:
from nltk.corpus import wordnet as wn
from utils import dist_utils, ngram_utils, nlp_utils, pkl_utils
from utils import logging_utils, time_utils
# tune the token pattern to get a better correlation with y_train
token_pattern = r"(?u)\b\w\w+\b"
# token_pattern = r"\w{1,}"
# token_pattern = r"\w+"
# token_pattern = r"[\w']+"
token_pattern = " "
class WordNet_Similarity:
"""Double aggregation features"""
def __init__(self, metric="path"):
# super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
self.metric = metric
if self.metric == "path":
self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
elif self.metric == "lch":
self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
elif self.metric == "wup":
self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
else:
raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric))
def _maximum_similarity_for_two_synset_list(self, syn_list1, syn_list2):
s = 0.
if syn_list1 and syn_list2:
for syn1 in syn_list1:
for syn2 in syn_list2:
try:
_s = self.metric_func(syn1, syn2)
except:
_s = config.MISSING_VALUE_NUMERIC
if _s and _s > s:
s = _s
return s
def transform_one(self, obs, target):
obs_tokens = nlp_utils._tokenize(obs, token_pattern)
target_tokens = nlp_utils._tokenize(target, token_pattern)
obs_synset_list = [wn.synsets((obs_token).decode('utf-8')) for obs_token in obs_tokens]
target_synset_list = [wn.synsets((target_token).decode('utf-8')) for target_token in target_tokens]
val_list = []
for obs_synset in obs_synset_list:
_val_list = []
for target_synset in target_synset_list:
_s = self._maximum_similarity_for_two_synset_list(obs_synset, target_synset)
_val_list.append(_s)
if len(_val_list) == 0:
_val_list = [config.MISSING_VALUE_NUMERIC]
val_list.append( _val_list )
if len(val_list) == 0:
val_list = [[config.MISSING_VALUE_NUMERIC]]
return val_list
wn_list = ["lch", "path", "wup"]
# wn_list = ["wup"]
np_dict = { 'mean':np.mean, 'min':np.min, 'max':np.max, 'median':np.median, 'std':np.std }
agg1 = ["mean", "max", "min", "median"]
agg2 = ["mean", "std", "max", "min", "median"]
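# Double aggregation: transform_one returns a matrix of maximum synset similarities
# (obs tokens x target tokens); AGG1 reduces each row over the target axis, and AGG2 then
# reduces those per-token values across the obs tokens.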
def wn_sim(train):
wn_sim = WordNet_Similarity(metric=wn_method)
wn_matrix = train.apply(lambda x: wn_sim.transform_one(x['question1'],x['question2']), axis=1)
for AGG1 in agg1:
for AGG2 in agg2:
train['WordNet_%s_%s_%s'%(wn_method,AGG1,AGG2)] = [np_dict[AGG2](np_dict[AGG1](wn_row,axis=1)) for wn_row in wn_matrix]
return train
for wn_method in wn_list:
train = parallelize_dataframe(train, wn_sim)
print '{} ==> WordNet_Similarity {}: {}'.format(datetime.datetime.now(), wn_method, train.shape)
train.to_csv(config.FEAT_PATH+'feature_vect_lemmer.csv',index=False)
gc.collect()
In [5]:
train[train['is_duplicate']!=-1].corr()
Out[5]:
In [3]:
train.to_csv(config.FEAT_PATH+'feature_vect_lemmer.csv',index=False)
In [12]:
train.head()
In [11]:
print '{} ==> WordNet_Similarity {}: {}'.format(datetime.datetime.now(), 1, 2)