Features are extracted to measure the similarity between two given questions. The various types of features are as follows:
Note: this notebook is Python 3 compatible.
All the related information about the features can be found in this great post.
In [1]:
# packages
import argparse
import functools
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
We are trying to create more features on top of the features generated by Abhishek on Kaggle, which can be downloaded from here.
In [2]:
# Functions
def word_match_share(row, stops=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2)) / (len(q1words) + len(q2words))
    return R
def jaccard(row):
    wic = set(row['question1']).intersection(set(row['question2']))
    uw = set(row['question1']).union(row['question2'])
    if len(uw) == 0:
        uw = [1]
    return len(wic) / len(uw)

def common_words(row):
    return len(set(row['question1']).intersection(set(row['question2'])))

def total_unique_words(row):
    return len(set(row['question1']).union(row['question2']))

def total_unq_words_stop(row, stops):
    return len([x for x in set(row['question1']).union(row['question2']) if x not in stops])
def wc_diff(row):
    return abs(len(row['question1']) - len(row['question2']))

def wc_ratio(row):
    l1 = len(row['question1']) * 1.0
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    # l1 / l2 is falsy only when l1 == 0, so this returns l2 / l1, or 0.0 for an empty question1
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique(row):
    return abs(len(set(row['question1'])) - len(set(row['question2'])))

def wc_ratio_unique(row):
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def wc_diff_unique_stop(row, stops=None):
    return abs(len([x for x in set(row['question1']) if x not in stops]) - len([x for x in set(row['question2']) if x not in stops]))

def wc_ratio_unique_stop(row, stops=None):
    l1 = len([x for x in set(row['question1']) if x not in stops]) * 1.0
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2
def same_start_word(row):
    if not row['question1'] or not row['question2']:
        return np.nan
    return int(row['question1'][0] == row['question2'][0])

def char_diff(row):
    return abs(len(''.join(row['question1'])) - len(''.join(row['question2'])))

def char_ratio(row):
    l1 = len(''.join(row['question1']))
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 / l2:
        return l2 / l1
    else:
        return l1 / l2

def char_diff_unique_stop(row, stops=None):
    return abs(len(''.join([x for x in set(row['question1']) if x not in stops])) - len(''.join([x for x in set(row['question2']) if x not in stops])))

def get_weight(count, eps=10000, min_count=2):
    # inverse-frequency weight: rarer words get larger weights, eps smooths the denominator,
    # and words seen fewer than min_count times are ignored
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        if word not in stops:
            q1words[word] = 1
    for word in row['question2']:
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row, weights=None):
    q1words = {}
    q2words = {}
    for word in row['question1']:
        q1words[word] = 1
    for word in row['question2']:
        q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R
def build_features(data, stops, weights):
    X = pd.DataFrame()
    f = functools.partial(word_match_share, stops=stops)
    X['word_match'] = data.apply(f, axis=1, raw=True)  # 1
    f = functools.partial(tfidf_word_match_share, weights=weights)
    X['tfidf_wm'] = data.apply(f, axis=1, raw=True)  # 2
    f = functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)
    X['tfidf_wm_stops'] = data.apply(f, axis=1, raw=True)  # 3
    X['jaccard'] = data.apply(jaccard, axis=1, raw=True)  # 4
    X['wc_diff'] = data.apply(wc_diff, axis=1, raw=True)  # 5
    X['wc_ratio'] = data.apply(wc_ratio, axis=1, raw=True)  # 6
    X['wc_diff_unique'] = data.apply(wc_diff_unique, axis=1, raw=True)  # 7
    X['wc_ratio_unique'] = data.apply(wc_ratio_unique, axis=1, raw=True)  # 8
    f = functools.partial(wc_diff_unique_stop, stops=stops)
    X['wc_diff_unq_stop'] = data.apply(f, axis=1, raw=True)  # 9
    f = functools.partial(wc_ratio_unique_stop, stops=stops)
    X['wc_ratio_unique_stop'] = data.apply(f, axis=1, raw=True)  # 10
    X['same_start'] = data.apply(same_start_word, axis=1, raw=True)  # 11
    X['char_diff'] = data.apply(char_diff, axis=1, raw=True)  # 12
    f = functools.partial(char_diff_unique_stop, stops=stops)
    X['char_diff_unq_stop'] = data.apply(f, axis=1, raw=True)  # 13
    X['common_words'] = data.apply(common_words, axis=1, raw=True)  # 14
    X['total_unique_words'] = data.apply(total_unique_words, axis=1, raw=True)  # 15
    f = functools.partial(total_unq_words_stop, stops=stops)
    X['total_unq_words_stop'] = data.apply(f, axis=1, raw=True)  # 16
    X['char_ratio'] = data.apply(char_ratio, axis=1, raw=True)  # 17
    return X
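Before building the full feature matrix, it is easy to sanity-check a few of these functions on a single toy question pair. The snippet below is only an illustrative sketch: the tokenized questions and weights are made up for the example, and a plain dict stands in for a DataFrame row since the functions only index the row by column name.
In [ ]:
# toy sanity check of some of the feature functions (illustrative only)
from collections import Counter
from nltk.corpus import stopwords

stops = set(stopwords.words("english"))
row = {'question1': 'how do i learn python quickly'.split(),
       'question2': 'what is the best way to learn python'.split()}

counts = Counter(row['question1'] + row['question2'])
weights = {word: get_weight(count, min_count=1) for word, count in counts.items()}

print(word_match_share(row, stops=stops))   # share of non-stopwords common to both questions
print(jaccard(row))                         # token-set intersection over union
print(wc_diff(row), wc_ratio(row))          # word-count difference and ratio
print(tfidf_word_match_share_stops(row, stops=stops, weights=weights))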
In [5]:
# processing
print ('loading features')
df_train = pd.read_csv('train_features.csv', encoding="ISO-8859-1")
X_train_ab = df_train.iloc[:, 2:-1]
X_train_ab = X_train_ab.drop('euclidean_distance', axis=1)
X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)
print ('loading actual data')
df_train = pd.read_csv('train.csv')
df_train = df_train.fillna(' ')
df_test = pd.read_csv('test.csv')
ques = pd.concat([df_train[['question1', 'question2']],
                  df_test[['question1', 'question2']]], axis=0).reset_index(drop=True)

q_dict = defaultdict(set)
for i in range(ques.shape[0]):
    q_dict[ques.question1[i]].add(ques.question2[i])
    q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return len(q_dict[row['question1']])

def q2_freq(row):
    return len(q_dict[row['question2']])

def q1_q2_intersect(row):
    return len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']])))
print ('making leaky features')
df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)
df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)
test_leaky = df_test.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]
del df_test
train_leaky = df_train.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]
# explore
stops = set(stopwords.words("english"))
df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())
words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}
print('Building Features')
X_train = build_features(df_train, stops, weights)
print ('concatenating')
X_train = pd.concat((X_train, X_train_ab, train_leaky), axis=1)
y_train = df_train['is_duplicate'].values
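The "leaky" features above describe the question co-occurrence graph built over train and test together: q1_freq and q2_freq count how many distinct questions each question has been paired with anywhere in the data, and q1_q2_intersect counts how many of those neighbours the two questions share. A toy sketch of the same idea (the question identifiers are invented purely for illustration):
In [ ]:
# toy illustration of the leaky graph features
from collections import defaultdict

pairs = [('q_a', 'q_b'), ('q_a', 'q_c'), ('q_b', 'q_c'), ('q_b', 'q_d')]

toy_dict = defaultdict(set)
for q1, q2 in pairs:
    toy_dict[q1].add(q2)
    toy_dict[q2].add(q1)

# for the pair ('q_a', 'q_b'):
#   q1_freq         = 2  (q_a has been paired with q_b and q_c)
#   q2_freq         = 3  (q_b has been paired with q_a, q_c and q_d)
#   q1_q2_intersect = 1  (the shared neighbour is q_c)
print(len(toy_dict['q_a']), len(toy_dict['q_b']), len(toy_dict['q_a'] & toy_dict['q_b']))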
In [ ]:
# test processing
print('Building Test Features')
df_test = pd.read_csv('test_features.csv', encoding="ISO-8859-1")
x_test_ab = df_test.iloc[:, 2:-1]
x_test_ab = x_test_ab.drop('euclidean_distance', axis=1)
x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)
print ('reading test data')
df_test = pd.read_csv('test.csv')
df_test = df_test.fillna(' ')
df_test['question1'] = df_test['question1'].map(lambda x: str(x).lower().split())
df_test['question2'] = df_test['question2'].map(lambda x: str(x).lower().split())
print('going to function')
x_test = build_features(df_test, stops, weights)
print('concatenating')
x_test = pd.concat((x_test, x_test_ab, test_leaky), axis=1)
In [ ]:
X_train.to_csv('X_train.csv',index=False)
x_test.to_csv('X_test.csv',index = False)
In [2]:
trn_features = pd.read_csv('X_train.csv')
In [2]:
import nltk
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.optimize import minimize
stops = set(stopwords.words("english"))
import multiprocessing
import difflib
import numpy as np
import pandas as pd
import datetime
import operator
In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [4]:
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
cvect = CountVectorizer(stop_words='english', ngram_range=(1, 1))
tfidf_txt = pd.Series(train['question1'].tolist() + train['question2'].tolist() + test['question1'].tolist() + test['question2'].tolist()).astype(str)
tfidf.fit_transform(tfidf_txt)
cvect.fit_transform(tfidf_txt)
def diff_ratios(st1, st2):
    seq = difflib.SequenceMatcher()
    seq.set_seqs(str(st1).lower(), str(st2).lower())
    return seq.ratio()

def add_word_count(x, df, word):
    x['q1_' + word] = df['question1'].apply(lambda x: (word in str(x).lower()) * 1)
    x['q2_' + word] = df['question2'].apply(lambda x: (word in str(x).lower()) * 1)
    x[word + '_both'] = x['q1_' + word] * x['q2_' + word]

def get_features(df_features):
    print('nouns...')
    df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    print('tfidf...')
    df_features['z_tfidf_sum1'] = df_features.question1.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_sum2'] = df_features.question2.map(lambda x: np.sum(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean1'] = df_features.question1.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_mean2'] = df_features.question2.map(lambda x: np.mean(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len1'] = df_features.question1.map(lambda x: len(tfidf.transform([str(x)]).data))
    df_features['z_tfidf_len2'] = df_features.question2.map(lambda x: len(tfidf.transform([str(x)]).data))
    print('Caps features')
    df_features['caps_count_q1'] = df_features['question1'].apply(lambda x: sum(1 for i in str(x) if i.isupper()))
    df_features['caps_count_q2'] = df_features['question2'].apply(lambda x: sum(1 for i in str(x) if i.isupper()))
    df_features['diff_caps'] = df_features['caps_count_q1'] - df_features['caps_count_q2']
    df_features['exactly_same'] = (df_features['question1'] == df_features['question2']).astype(int)
    df_features['duplicated'] = df_features.duplicated(['question1', 'question2']).astype(int)
    print('some more')
    add_word_count(df_features, df_features, 'how')
    add_word_count(df_features, df_features, 'what')
    add_word_count(df_features, df_features, 'which')
    add_word_count(df_features, df_features, 'who')
    add_word_count(df_features, df_features, 'where')
    add_word_count(df_features, df_features, 'when')
    add_word_count(df_features, df_features, 'why')
    return df_features.fillna(0.0)
In [5]:
trn_features = get_features(train)
In [6]:
tst_features = get_features(test)
In [22]:
def get_noun(df_features):
    print('shared nouns')
    df_features['z_noun_match'] = df_features.apply(lambda r: sum([1 for w in r.question1_nouns if w in r.question2_nouns]), axis=1)
    return df_features.fillna(0.0)
In [23]:
trn_features = get_noun(trn_features)
tst_features = get_noun(tst_features)
In [ ]:
# removing unnecessary columns from train and test data
trn_features = trn_features.iloc[:, 8:]   # .ix is removed in recent pandas; iloc keeps the positional slicing
tst_features = tst_features.iloc[:, 5:]
In [3]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
In [34]:
X_train.shape, X_test.shape
Out[34]:
In [35]:
X_train = pd.concat((X_train, trn_features), axis=1)
X_test = pd.concat((X_test, tst_features), axis=1)
X_train.shape, X_test.shape
Out[35]:
In [ ]:
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
In [5]:
X_train.columns.values
Out[5]:
In [6]:
X_train.shape
Out[6]:
In total, 79 features are generated, along with the page rank features, which are generated in page_rank.ipynb.
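page_rank.ipynb itself is not shown here. For reference, the sketch below shows one common way such question-graph PageRank features can be computed; it is an assumption about that notebook rather than its actual code, and the column names q1_pagerank / q2_pagerank are hypothetical.
In [ ]:
# illustrative sketch only -- page_rank.ipynb may compute these features differently
import networkx as nx
import pandas as pd

df_train = pd.read_csv('train.csv').fillna(' ')
df_test = pd.read_csv('test.csv').fillna(' ')

# every question is a node; every (question1, question2) pair is an edge
g = nx.Graph()
for df in (df_train, df_test):
    for q1, q2 in zip(df['question1'], df['question2']):
        g.add_edge(q1, q2)

pr = nx.pagerank(g)  # dict: question text -> PageRank score

for df in (df_train, df_test):
    df['q1_pagerank'] = df['question1'].map(pr)
    df['q2_pagerank'] = df['question2'].map(pr)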