Based on Abhishek Thakur's features published on GitHub and the Kaggle forum.
This utility package imports numpy, pandas, matplotlib, and a helper kg module into the root namespace.
In [1]:
from pygoose import *
In [2]:
import os
import warnings
In [3]:
import gensim
In [4]:
from fuzzywuzzy import fuzz
In [5]:
from nltk import word_tokenize
from nltk.corpus import stopwords
In [6]:
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
Automatically discover the paths to various data folders and compose the project structure.
In [7]:
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [8]:
feature_list_id = '3rdparty_abhishek'
The path to the saved GoogleNews Word2Vec model.
In [9]:
google_news_model_path = os.path.join(project.aux_dir, 'word2vec', 'GoogleNews-vectors-negative300.bin.gz')
Original question datasets.
In [10]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('').drop(['id', 'qid1', 'qid2'], axis=1)
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('').drop(['test_id'], axis=1)
In [11]:
stop_words = stopwords.words('english')
Raw implementations from Abhishek below (excluding the features we already have in other notebooks):
In [12]:
def wmd(model, s1, s2):
    # Lowercase and tokenize both questions on whitespace, drop English stop
    # words (using the module-level stop_words list defined above), then
    # compute the Word Mover's Distance between them.
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)
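As a quick illustration (not part of the original pipeline), this is how wmd() would be called on a toy pair of questions; the sentences are made up, and the model is loaded the same way as in extend_with_features() further below:
In [ ]:
# Illustrative sketch only: toy sentences, model loaded exactly as in
# extend_with_features() below (this download/load is large and slow).
model = gensim.models.KeyedVectors.load_word2vec_format(google_news_model_path, binary=True)
wmd(model, 'How do I learn Python?', 'What is the best way to learn Python?')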
In [13]:
def norm_wmd(model, s1, s2):
    # Identical preprocessing to wmd(); meant to be called with a model whose
    # word vectors have been L2-normalized via init_sims(replace=True), which
    # makes this the "normalized" Word Mover's Distance.
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)
In [14]:
def sent2vec(model, s):
    # Sum the word vectors of all in-vocabulary, alphabetic, non-stop-word
    # tokens and L2-normalize the result into a single sentence vector.
    words = word_tokenize(s.lower())
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            # Skip out-of-vocabulary words.
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    # If no tokens survive the filters, this divides 0 by 0 and yields NaNs,
    # which are zeroed out downstream via np.nan_to_num.
    return v / np.sqrt((v ** 2).sum())
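A similar sketch for sent2vec(): it returns one L2-normalized 300-dimensional vector per question, so two such vectors can be compared directly with the distance functions imported above. The sentences are hypothetical, and `model` is assumed to be loaded as in the previous sketch:
In [ ]:
# Illustrative sketch only; assumes `model` from the previous sketch.
v1 = sent2vec(model, 'How can I become a good geologist?')
v2 = sent2vec(model, 'What should I do to be a great geologist?')
cosine(v1, v2)  # expected to be relatively small for semantically close questions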
In [15]:
def extend_with_features(data):
    # Simple lexical overlap and fuzzy string matching features.
    data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
    data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)

    # Word Mover's Distance on the raw GoogleNews vectors.
    model = gensim.models.KeyedVectors.load_word2vec_format(google_news_model_path, binary=True)
    data['wmd'] = data.apply(lambda x: wmd(model, x['question1'], x['question2']), axis=1)

    # WMD again, this time on a second copy of the model with L2-normalized vectors.
    norm_model = gensim.models.KeyedVectors.load_word2vec_format(google_news_model_path, binary=True)
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)

    # 300-dimensional sentence embeddings for both questions.
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in progressbar(enumerate(data.question1.values), total=len(data)):
        question1_vectors[i, :] = sent2vec(model, q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in progressbar(enumerate(data.question2.values), total=len(data)):
        question2_vectors[i, :] = sent2vec(model, q)

    # Replace the NaNs produced by empty sentences with zeros.
    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)

    # Pairwise distances between the two sentence embeddings.
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(question1_vectors, question2_vectors)]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(question1_vectors, question2_vectors)]

    # Distribution statistics of each embedding.
    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]
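As a sanity check on the less common distances used above, minkowski(x, y, 3) computes the p = 3 Minkowski distance, (sum(|x_i - y_i| ** 3)) ** (1 / 3); the vectors below are arbitrary:
In [ ]:
# Sanity check with arbitrary vectors: scipy's minkowski(u, v, p) is
# (sum(|u_i - v_i| ** p)) ** (1 / p).
x, y = np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.0])
assert np.isclose(minkowski(x, y, 3), np.cbrt(np.sum(np.abs(x - y) ** 3)))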
In [16]:
# Silence warnings (e.g., RuntimeWarnings from distance functions on all-zero vectors).
warnings.filterwarnings('ignore')
In [ ]:
extend_with_features(df_train)
In [ ]:
extend_with_features(df_test)
In [ ]:
df_train.drop(['is_duplicate', 'question1', 'question2'], axis=1, inplace=True)
df_test.drop(['question1', 'question2'], axis=1, inplace=True)
In [ ]:
X_train = np.array(df_train.values, dtype='float64')
X_test = np.array(df_test.values, dtype='float64')
In [ ]:
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)
In [ ]:
df_train.describe().T
In [ ]:
feature_names = [
'abh_common_words',
'abh_fuzz_qratio',
'abh_fuzz_WRatio',
'abh_wmd',
'abh_norm_wmd',
'abh_cosine_distance',
'abh_cityblock_distance',
'abh_jaccard_distance',
'abh_canberra_distance',
'abh_euclidean_distance',
'abh_minkowski_distance',
'abh_braycurtis_distance',
'abh_skew_q1vec',
'abh_skew_q2vec',
'abh_kur_q1vec',
'abh_kur_q2vec',
]
In [ ]:
project.save_features(X_train, X_test, feature_names, feature_list_id)