Based on the kernel "XGB with whq jaccard" by David Solis.
The pygoose utility package imports numpy, pandas, matplotlib, and a helper kg module into the root namespace.
In [1]:
from pygoose import *
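For readers who prefer explicit imports, the wildcard above is roughly equivalent to the following sketch (an assumption based on the description above; pygoose's actual re-exports may differ slightly):

# Approximately what `from pygoose import *` brings into scope.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pygoose import kg  # project discovery and feature persistence helpers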
NLTK tools
In [2]:
import nltk
In [3]:
from collections import Counter
from nltk.corpus import stopwords
In [4]:
nltk.download('stopwords')
Automatically discover the paths to various data folders and compose the project structure.
In [5]:
project = kg.Project.discover()
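A quick way to verify the discovery worked (a sketch; the attribute name follows its usage later in this notebook):

print(project.data_dir)  # folder that should contain train.csv and test.csv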
Identifier for storing these features on disk and referring to them later.
In [6]:
feature_list_id = '3rdparty_dasolmar_whq'
Original question sets.
In [7]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')
NLTK built-in stopwords.
In [8]:
stops = set(stopwords.words("english"))
In [9]:
# If a word appears only once, we ignore it completely (likely a typo).
# eps is a smoothing constant that shrinks the weight of extremely rare words.
def get_weight(count, eps=10000, min_count=2):
    return 0 if count < min_count else 1 / (count + eps)
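A quick worked example of the weighting, using the defaults above: a word seen once is zeroed out, and more frequent words receive progressively smaller positive weights.

get_weight(1)       # 0 -- below min_count, treated as a likely typo
get_weight(2)       # 1 / 10002, about 9.998e-05
get_weight(50000)   # 1 / 60000, about 1.667e-05 -- very common word, small weight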
In [10]:
def add_word_count(x, df, word):
    x['das_q1_' + word] = df['question1'].apply(lambda q: (word in str(q).lower()) * 1)
    x['das_q2_' + word] = df['question2'].apply(lambda q: (word in str(q).lower()) * 1)
    x['das_' + word + '_both'] = x['das_q1_' + word] * x['das_q2_' + word]
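As a toy illustration (not part of the pipeline), applying the helper for 'why' yields three binary columns:

toy_df = pd.DataFrame({'question1': ['Why is the sky blue?'],
                       'question2': ['How high is the sky?']})
toy_x = pd.DataFrame()
add_word_count(toy_x, toy_df, 'why')
# toy_x: das_q1_why = 1, das_q2_why = 0, das_why_both = 0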
In [11]:
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}
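As a sanity check (illustrative; the exact numbers depend on the training data), very frequent words should end up with the smallest nonzero weights:

for w in ['the', 'what', 'sky']:
    print(w, counts[w], weights.get(w, 0))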
In [12]:
def word_shares(row):
    q1_list = str(row['question1']).lower().split()
    q1 = set(q1_list)
    q1words = q1.difference(stops)
    if len(q1words) == 0:
        return '0:0:0:0:0:0:0:0'

    q2_list = str(row['question2']).lower().split()
    q2 = set(q2_list)
    q2words = q2.difference(stops)
    if len(q2words) == 0:
        return '0:0:0:0:0:0:0:0'

    # Position-wise word matches, normalized by the longer question's length.
    words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0] == i[1]) / max(len(q1_list), len(q2_list))

    q1stops = q1.intersection(stops)
    q2stops = q2.intersection(stops)

    q1_2gram = set(zip(q1_list, q1_list[1:]))
    q2_2gram = set(zip(q2_list, q2_list[1:]))
    shared_2gram = q1_2gram.intersection(q2_2gram)

    shared_words = q1words.intersection(q2words)
    shared_weights = [weights.get(w, 0) for w in shared_words]
    q1_weights = [weights.get(w, 0) for w in q1words]
    q2_weights = [weights.get(w, 0) for w in q2words]
    total_weights = q1_weights + q2_weights  # fixed: the original summed q1_weights twice

    R1 = np.sum(shared_weights) / np.sum(total_weights)  # tfidf share
    R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words))  # count (Jaccard) share
    R31 = len(q1stops) / len(q1words)  # stopword ratio in q1
    R32 = len(q2stops) / len(q2words)  # stopword ratio in q2

    Rcosine_denominator = np.sqrt(np.dot(q1_weights, q1_weights)) * np.sqrt(np.dot(q2_weights, q2_weights))
    Rcosine = np.dot(shared_weights, shared_weights) / Rcosine_denominator

    if len(q1_2gram) + len(q2_2gram) == 0:
        R2gram = 0
    else:
        R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))

    return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)
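To see the encoding (a toy illustration; the numbers depend on the corpus weights computed above), the function packs eight values into a colon-separated string:

toy_row = pd.Series({'question1': 'Why is the sky blue?',
                     'question2': 'Why is the sky so blue?'})
print(word_shares(toy_row))
# fields: R1 (tfidf share) : R2 (count share) : shared word count :
#         q1 stop ratio : q2 stop ratio : 2-gram share : cosine : hamming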
In [13]:
df = pd.concat([df_train, df_test])
df['word_shares'] = df.apply(word_shares, axis=1)  # raw=True (as in the original kernel) would pass bare ndarrays and break the name-based indexing in word_shares
x = pd.DataFrame()

# Note: field 0 of word_shares is the tfidf share (R1) and field 1 the count share (R2),
# so the two names below are swapped relative to their content; kept as in the original kernel.
x['das_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
x['das_word_match_2root'] = np.sqrt(x['das_word_match'])
x['das_tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
x['das_shared_count'] = df['word_shares'].apply(lambda x: float(x.split(':')[2]))
x['das_stops1_ratio'] = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
x['das_stops2_ratio'] = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
x['das_shared_2gram'] = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
x['das_cosine'] = df['word_shares'].apply(lambda x: float(x.split(':')[6]))
x['das_words_hamming'] = df['word_shares'].apply(lambda x: float(x.split(':')[7]))
x['das_diff_stops_r'] = np.abs(x['das_stops1_ratio'] - x['das_stops2_ratio'])

x['das_len_q1'] = df['question1'].apply(lambda x: len(str(x)))
x['das_len_q2'] = df['question2'].apply(lambda x: len(str(x)))
x['das_diff_len'] = np.abs(x['das_len_q1'] - x['das_len_q2'])

x['das_caps_count_q1'] = df['question1'].apply(lambda x: sum(1 for i in str(x) if i.isupper()))
x['das_caps_count_q2'] = df['question2'].apply(lambda x: sum(1 for i in str(x) if i.isupper()))
x['das_diff_caps'] = np.abs(x['das_caps_count_q1'] - x['das_caps_count_q2'])

x['das_len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
x['das_len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
x['das_diff_len_char'] = np.abs(x['das_len_char_q1'] - x['das_len_char_q2'])

x['das_len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
x['das_len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
x['das_diff_len_word'] = np.abs(x['das_len_word_q1'] - x['das_len_word_q2'])

x['das_avg_word_len1'] = x['das_len_char_q1'] / x['das_len_word_q1']
x['das_avg_word_len2'] = x['das_len_char_q2'] / x['das_len_word_q2']
x['das_diff_avg_word'] = np.abs(x['das_avg_word_len1'] - x['das_avg_word_len2'])

# x['exactly_same'] = (df['question1'] == df['question2']).astype(int)
# x['duplicated'] = df.duplicated(['question1','question2']).astype(int)

whq_words = ['how', 'what', 'which', 'who', 'where', 'when', 'why']
for whq in whq_words:
    add_word_count(x, df, whq)
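Before continuing, a quick look at the assembled frame confirms the features materialized:

print(x.shape)   # rows = len(df_train) + len(df_test)
print(x.head())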
In [14]:
whq_columns_q1 = ['das_q1_' + whq for whq in whq_words]
whq_columns_q2 = ['das_q2_' + whq for whq in whq_words]
In [15]:
x['whq_count_q1'] = x[whq_columns_q1].sum(axis=1)
x['whq_count_q2'] = x[whq_columns_q2].sum(axis=1)
x['whq_count_diff'] = np.abs(x['whq_count_q1'] - x['whq_count_q2'])
In [16]:
feature_names = list(x.columns.values)
print("Features: {}".format(feature_names))
In [17]:
X_train = x[:df_train.shape[0]].values
X_test = x[df_train.shape[0]:].values
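A final consistency check (pd.concat above preserves row order, so the slice boundary is exactly len(df_train)):

assert X_train.shape[0] == len(df_train)
assert X_test.shape[0] == len(df_test)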
In [18]:
project.save_features(X_train, X_test, feature_names, feature_list_id)