In [ ]:
# imports
import sys, os, argparse, logging  # NOQA
# re, pickle, numpy, pandas and h5py are used below; imported explicitly in case
# the star import from twBase does not re-export them
import re
import pickle
import string
from pprint import pprint
from twBase import *  # NOQA
from twQuoraRun import *  # NOQA
from tqdm import tqdm
import numpy as np
import pandas as pd
import h5py
import spacy
#import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer  # used when stem_words=True in text_to_wordlist
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]
NLP = spacy.load('en1G')
#NLP = spacy.load('en')
# Allow relative imports from the directory above the cwd
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# constants
#scriptPath = print(os.path.abspath(__file__))
In [ ]:
params = {
"DATA": {
"BASE_DIR": "./data",
"GLOVE_DIR": './data/glove.6B',
"DATA_FN": './data/train.csv',
"DATA_TEST_FN": './data/test.csv',
"H5DATA_FN": None,
"H5DATA_TEST_FN": None,
"EMBED_FN": './data/emb.300.200k.npy',
"isSample": False,
"embedOnly": False,
},
"SPACY": {
"MODEL": "en1G", # en1G
#"MODEL": "en", # en1G
},
"MODEL": {
"MAX_SEQUENCE_LENGTH": 40, # 1000
"MAX_NB_WORDS": 200000,
"EMBED_DIM": 300,
"BASIC_FEAT_DIM": 26,
#"NAME": "brnn.maxp.cos.1.60",
"SUFFIX": "1.40",
"NAME": None,
#"CLASS": "xxx",
#"CLASS": "CosModel",
#"CLASS": "twEmbeddingModel",
#"CLASS": "MultiModel",
#"CLASS": "MultiSiameseModel",
"CLASS": "MultiSiameseModelAll",
"isLoad": None,
"RNN_TOPOLOGY": [100],
"TOPOLOGY": [512, 256, 1024],
"DROPOUT": [0.4, 0.4],
"REGULIZER": [0.0, 0.0], # 0.005
"OPTIMIZER": ['Adam', {"lr": 0.001}]
},
'SIMILARITY': {
#'mode': 'cosine',
'mode': 'euclidean',
'gamma': 1,
'c': 1,
'd': 2,
'dropout': 0.5,
},
"TRAINING": {
"DEVICE": "/cpu:0",
"VALIDATION_SPLIT": 0.2,
"BATCH_SIZE": 2048,
"EPOCHS": 50,
"PATIENCE": 10,
"DECAY_EPOCH": 25,
"isShuffle": False,
"VERBOSITY": 1
},
"OUTPUT": {
"BASE_DIR": "./data/out",
"MODEL_DIR": None
}
}
# params generated by jinja
###{{params}}###
P = Struct(params)
P.DATA.H5DATA_FN = "./data/train.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS) # build name
P.DATA.H5DATA_TEST_FN = "./data/test.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS) # build name
P.MODEL.NAME = "{}.{}".format(P.MODEL.CLASS, P.MODEL.SUFFIX) # build name
P.OUTPUT.MODEL_DIR = os.path.join(P.OUTPUT.BASE_DIR,
"{}.{}.{}".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS),
P.MODEL.NAME) # build output dir
P
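Struct comes from twBase and is not shown in this notebook; as a hypothetical stand-in for readers without twBase, a minimal sketch of the assumed behaviour (recursive attribute-style access over a nested dict) looks like this:
In [ ]:
# Hypothetical stand-in for twBase.Struct (not the real class), assuming it only
# wraps a nested dict with attribute access
class StructSketch(object):
    def __init__(self, d):
        for key, value in d.items():
            setattr(self, key, StructSketch(value) if isinstance(value, dict) else value)

# e.g. StructSketch(params).MODEL.EMBED_DIM -> 300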
In [ ]:
dname = 'test'
df = pd.read_csv('./data/{}.csv'.format(dname), nrows=None)
df.info()
In [ ]:
dname = 'test'
# path = os.path.join(P.DATA.BASE_DIR, '{}.feat2.pckl'.format(dname))
path = os.path.join('/mnt/data', '{}.feat2.pckl'.format(dname))  # override: load from the mounted data volume
df = pickle.load(open(path, 'rb'))
df.info()
In [ ]:
path = os.path.join(P.DATA.BASE_DIR,'df.test.40.200k.store.h5')
s = pd.HDFStore(path, mode='r')
df = s['df']
df.info()
In [ ]:
%%time
path = os.path.join(P.DATA.BASE_DIR,'df.test.40.200k.hdf5')
f = h5py.File(path, "r")
df['q1_feats'] = list(f['q1_feats'])
df['q2_feats'] = list(f['q2_feats'])
df.info()
In [ ]:
df.info()
In [ ]:
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop words and to stem words.
    # Expand contractions first; the catch-all substitution below strips apostrophes,
    # so these rules must run before it.
    text = re.sub(r"what's", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'s", " ", text)
    # Replace any remaining non-alphanumeric character with a space
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r" m ", " am ", text)
text = re.sub(r"60k", " 60000 ", text)
text = re.sub(r" e g ", " eg ", text)
text = re.sub(r" b g ", " bg ", text)
text = re.sub(r"\0s", "0", text)
text = re.sub(r" 9 11 ", "911", text)
text = re.sub(r"e-mail", "email", text)
text = re.sub(r"\s{2,}", " ", text)
text = re.sub(r"quikly", "quickly", text)
text = re.sub(r" usa ", " America ", text)
text = re.sub(r" USA ", " America ", text)
text = re.sub(r" u s ", " America ", text)
text = re.sub(r" uk ", " England ", text)
text = re.sub(r" UK ", " England ", text)
text = re.sub(r"india", "India", text)
text = re.sub(r"switzerland", "Switzerland", text)
text = re.sub(r"china", "China", text)
text = re.sub(r"chinese", "Chinese", text)
text = re.sub(r"imrovement", "improvement", text)
text = re.sub(r"intially", "initially", text)
text = re.sub(r"quora", "Quora", text)
text = re.sub(r" dms ", "direct messages ", text)
text = re.sub(r"demonitization", "demonetization", text)
text = re.sub(r"actived", "active", text)
text = re.sub(r"kms", " kilometers ", text)
text = re.sub(r"KMs", " kilometers ", text)
text = re.sub(r" cs ", " computer science ", text)
text = re.sub(r" upvotes ", " up votes ", text)
text = re.sub(r" iPhone ", " phone ", text)
text = re.sub(r"\0rs ", " rs ", text)
text = re.sub(r"calender", "calendar", text)
text = re.sub(r"ios", "operating system", text)
text = re.sub(r"gps", "GPS", text)
text = re.sub(r"gst", "GST", text)
text = re.sub(r"programing", "programming", text)
text = re.sub(r"bestfriend", "best friend", text)
text = re.sub(r"dna", "DNA", text)
text = re.sub(r"III", "3", text)
text = re.sub(r"the US", "America", text)
text = re.sub(r"Astrology", "astrology", text)
text = re.sub(r"Method", "method", text)
text = re.sub(r"Find", "find", text)
text = re.sub(r"banglore", "Banglore", text)
text = re.sub(r" J K ", " JK ", text)
text = re.sub(r"\'s", " 's ", text)
text = re.sub(r",", " ", text)
text = re.sub(r"\.", " ", text)
text = re.sub(r"!", " ! ", text)
text = re.sub(r"\^", " ^ ", text)
text = re.sub(r"\+", " + ", text)
text = re.sub(r"\-", " - ", text)
text = re.sub(r"\=", " = ", text)
# Remove punctuation from text
text = ''.join([c for c in text if c not in SYMBOLS])
# Optionally, remove stop words
if remove_stop_words:
text = text.split()
        text = [w for w in text if w not in STOPLIST]
text = " ".join(text)
# Optionally, shorten words to their stems
if stem_words:
text = text.split()
stemmer = SnowballStemmer('english')
stemmed_words = [stemmer.stem(word) for word in text]
text = " ".join(stemmed_words)
    # Return the cleaned text as a single string
    return text
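A quick sanity check of text_to_wordlist on a made-up question (hypothetical input; the exact output depends on STOPLIST and on the substitution rules above):
In [ ]:
# Hypothetical example input, not a row from the Quora data
sample = "What's the best way to learn programing in the UK?"
print(text_to_wordlist(sample, remove_stop_words=False))
print(text_to_wordlist(sample, remove_stop_words=True))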
In [ ]:
def clean_data(P, df, remove_stop_words):
    # Fill missing questions before casting to str, otherwise NaN becomes the literal string 'nan'
    df['question1'] = df['question1'].fillna('empty')
    df['question2'] = df['question2'].fillna('empty')
    df['question1'] = df['question1'].astype(str)
    df['question2'] = df['question2'].astype(str)
log.info('Cleaning q1')
cleanedTexts = []
for text in tqdm(df['question1']):
cleanedTexts.append(text_to_wordlist(text, remove_stop_words=remove_stop_words))
df['q1_cleaned'] = cleanedTexts
log.info('Cleaning q2')
cleanedTexts = []
for text in tqdm(df['question2']):
cleanedTexts.append(text_to_wordlist(text, remove_stop_words=remove_stop_words))
df['q2_cleaned'] = cleanedTexts
path = os.path.join(P.DATA.BASE_DIR, '{}.clean.pckl'.format(dname))
with open(path, 'wb') as f:
pickle.dump(df, f)
return df
In [ ]:
df = clean_data(P, df, remove_stop_words = False)
In [ ]:
def create_feat1(P, df):
with tqdm(total=150) as pbar:
df['len_q1'] = df.question1.apply(lambda x: len(str(x)))
pbar.update(10)
df['len_q2'] = df.question2.apply(lambda x: len(str(x)))
pbar.update(10)
df['diff_len'] = df.len_q1 - df.len_q2
pbar.update(10)
df['len_char_q1'] = df.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
pbar.update(10)
df['len_char_q2'] = df.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
pbar.update(10)
df['len_word_q1'] = df.question1.apply(lambda x: len(str(x).split()))
pbar.update(10)
df['len_word_q2'] = df.question2.apply(lambda x: len(str(x).split()))
pbar.update(10)
df['common_words'] = df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
pbar.update(10)
df['fuzz_qratio'] = df.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
pbar.update(10)
df['fuzz_WRatio'] = df.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
pbar.update(10)
df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
pbar.update(10)
df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
pbar.update(10)
df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
pbar.update(10)
df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
pbar.update(10)
df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
pbar.update(10)
path = os.path.join(P.DATA.BASE_DIR, '{}.feat1.pckl'.format(dname))
with open(path, 'wb') as f:
pickle.dump(df, f)
return df
In [ ]:
df = create_feat1(P, df)
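For intuition on what the fuzz_* columns in create_feat1 measure, a small standalone check on one made-up pair (hypothetical strings, not rows from the data):
In [ ]:
# Hypothetical pair to illustrate the fuzzywuzzy ratios used above
a = "How can I learn to cook quickly?"
b = "What is the fastest way to learn cooking?"
print(fuzz.QRatio(a, b), fuzz.WRatio(a, b), fuzz.partial_ratio(a, b),
      fuzz.token_set_ratio(a, b), fuzz.token_sort_ratio(a, b))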
In [ ]:
def create_word2tfidf(P):
nrows = 100 if P.DATA.isSample else None
    # The test data is largely machine generated and too noisy, so only the training questions are used
    # sources = [P.DATA.DATA_FN, P.DATA.DATA_TEST_FN]
    sources = [P.DATA.DATA_FN]
questions = []
for source in sources:
df = pd.read_csv(source, nrows=nrows)
df['question1'] = df['question1'].astype(str)
df['question2'] = df['question2'].astype(str)
# merge texts
questions.extend(list(df['question1']))
questions.extend(list(df['question2']))
tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)
# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
path = os.path.join(P.DATA.BASE_DIR, "word2tfidf.pckl")
with open(path, 'wb') as f:
pickle.dump(word2tfidf, f)
log.info("Created and saved.", fn=path)
In [ ]:
create_word2tfidf(P)
In [ ]:
def sent2vec(sents, word2tfidf):
vecs1 = []
log.info("Creating vectors out of questions")
for qu in tqdm(sents):
doc = NLP(qu)
mean_vec = np.zeros([len(doc), 300])
for word in doc:
# word2vec
vec = word.vector
            # fetch the word's idf weight (0 if the word was unseen by the vectorizer)
            try:
                idf = word2tfidf[str(word)]
            except KeyError:
                idf = 0
            # accumulate the idf-weighted word vector; every row of mean_vec gets the
            # same update, so the row-wise mean below reduces to the weighted sum
            mean_vec += vec * idf
        mean_vec = mean_vec.mean(axis=0)
vecs1.append(mean_vec)
return vecs1
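A minimal usage sketch of sent2vec (hypothetical sentences; it returns one 300-d idf-weighted vector per input, using the word2tfidf table created earlier):
In [ ]:
# Hypothetical spot check of sent2vec with the pickled idf table
with open(os.path.join(P.DATA.BASE_DIR, "word2tfidf.pckl"), 'rb') as f:
    word2tfidf = pickle.load(f)
vecs = sent2vec(["How do I learn Python", "What is machine learning"], word2tfidf)
print(len(vecs), vecs[0].shape)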
In [ ]:
def create_feat2(P, df):
with open(os.path.join(P.DATA.BASE_DIR, "word2tfidf.pckl"), 'rb') as f:
word2tfidf = pickle.load(f)
log.info("Word2tfidf loaded.")
vecs = sent2vec(list(df['q1_cleaned']), word2tfidf)
df['q1_feats'] = vecs
vecs = sent2vec(list(df['q2_cleaned']), word2tfidf)
df['q2_feats'] = vecs
path = os.path.join(P.DATA.BASE_DIR, '{}.feat2.pckl'.format(dname))
with open(path, 'wb') as f:
pickle.dump(df, f)
return df
In [ ]:
df = create_feat2(P, df)
In [ ]:
def create_feat3(P, df):
    with tqdm(total=150) as pbar:
# prep array: shape (?, 300), out of (?,) + (300,)
q1 = np.concatenate(df['q1_feats']).reshape((len(df), P.MODEL.EMBED_DIM))
q1 = np.nan_to_num(q1)
pbar.update(20)
q2 = np.concatenate(df['q2_feats']).reshape((len(df), P.MODEL.EMBED_DIM))
q2 = np.nan_to_num(q2)
pbar.update(20)
        df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(q1, q2)]
        pbar.update(10)
df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(q1, q2)]
pbar.update(10)
df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(q1, q2)]
pbar.update(10)
df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(q1, q2)]
pbar.update(10)
df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(q1, q2)]
pbar.update(10)
df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(q1, q2)]
pbar.update(10)
df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(q1, q2)]
pbar.update(10)
df['skew_q1vec'] = [skew(x) for x in q1]
pbar.update(10)
df['skew_q2vec'] = [skew(x) for x in q2]
pbar.update(10)
df['kur_q1vec'] = [kurtosis(x) for x in q1]
pbar.update(10)
df['kur_q2vec'] = [kurtosis(x) for x in q2]
pbar.update(10)
#path = os.path.join(P.DATA.BASE_DIR, '{}.feat3.pckl'.format(dname))
#with open(path, 'wb') as f:
# pickle.dump(df, f)
return df
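As a reference for the distance columns above, a hypothetical two-vector check of the scipy functions used in create_feat3:
In [ ]:
# Hypothetical vectors to illustrate the pairwise distance features
u = np.array([1.0, 0.0, 1.0])
v = np.array([0.5, 0.5, 1.0])
print(cosine(u, v), cityblock(u, v), canberra(u, v),
      euclidean(u, v), minkowski(u, v, 3), braycurtis(u, v))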
In [ ]:
df = create_feat3(P, df)
In [ ]:
path = '/mnt/data/test.feat3.pckl'
with open(path, 'wb') as f:
pickle.dump(df, f)
In [ ]:
df.info()
In [ ]:
def get_featuresSpacy(docs, max_length):
    '''
    Convert spaCy docs into fixed-length sequences of token frequency ranks,
    used as indices into the embedding layer. Also returns a word->rank map.
    '''
docs = list(docs)
word2ix = dict()
Xs = np.zeros((len(docs), max_length), dtype='int32')
for i, doc in enumerate(docs):
j = 0
for token in doc:
word2ix[token.orth_] = token.rank
if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank  # frequency rank; note 0 is also the padding value, so rank 0 is ambiguous
j += 1
if j >= max_length:
break
return Xs, word2ix
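A small usage sketch of get_featuresSpacy (hypothetical sentences; each row is truncated or zero-padded to max_length and filled with token frequency ranks):
In [ ]:
# Hypothetical check: two short docs mapped to fixed-length rank sequences
docs = list(NLP.pipe(["How do I learn Python", "What is machine learning"]))
Xs, word2ix = get_featuresSpacy(docs, P.MODEL.MAX_SEQUENCE_LENGTH)
print(Xs.shape)   # (2, MAX_SEQUENCE_LENGTH)
print(Xs[0][:10])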
In [ ]:
def create_feat4(P, df):
q1 = df['q1_cleaned'].values
q2 = df['q2_cleaned'].values
texts = np.concatenate([q1, q2], axis=0)
log.info('Questions q1, q2 read', nq1=len(q1), nq2=len(q2), ntexts=len(texts))
data, word2ix = get_featuresSpacy(list(NLP.pipe(texts, parse=False, tag=False, entity=False, n_threads=-1, batch_size=10000)), P.MODEL.MAX_SEQUENCE_LENGTH)
#if isTest: # Predictions
# path = "{}/word2ixSpacy_test.pckl".format(P.DATA.BASE_DIR)
#else: # Training
# path = "{}/word2ixSpacy.pckl".format(P.DATA.BASE_DIR)
#with open(path, 'wb') as f:
# pickle.dump(word2ix, f)
#log.info("Saved", data=path)
q1_data = data[:len(q1)]
q2_data = data[len(q1):]
log.info('Shape of data tensors:', q1_data=q1_data.shape, q2_data=q2_data.shape)
df['q1_embed'] = list(q1_data)
df['q2_embed'] = list(q2_data)
path = os.path.join(P.DATA.BASE_DIR, '{}.feat4.pckl'.format(dname))
with open(path, 'wb') as f:
pickle.dump(df, f)
return df
In [ ]:
df = create_feat4(P, df)
In [ ]:
df.info()
In [ ]:
def create_h5(P, df, isTest=False, isSample=False, embed_only=False):
#embed_only = P.DATA.embedOnly
if not embed_only:
cols = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_qratio',
'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio',
'fuzz_token_sort_ratio', 'q1_feats', 'q2_feats', 'cosine_distance',
'cityblock_distance', 'jaccard_distance', 'canberra_distance',
'euclidean_distance', 'minkowski_distance', 'braycurtis_distance',
'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec', 'q1_embed',
'q2_embed']
basic_features = [ 'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_qratio',
'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio',
'fuzz_token_sort_ratio', 'cosine_distance',
'cityblock_distance', 'jaccard_distance', 'canberra_distance',
'euclidean_distance', 'minkowski_distance', 'braycurtis_distance',
'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']
#X_basic = df[basic_features].values # (?, 27)
X1_feats = df['q1_feats'].values
X1_feats = np.concatenate(X1_feats).reshape((len(df), len(X1_feats[0])))
X2_feats = df['q2_feats'].values
X2_feats = np.concatenate(X2_feats).reshape((len(df), len(X2_feats[0])))
X1_embed = df['q1_embed'].values
X1_embed = np.concatenate(X1_embed).reshape((len(df), len(X1_embed[0])))
X2_embed = df['q2_embed'].values
X2_embed = np.concatenate(X2_embed).reshape((len(df), len(X2_embed[0])))
if isTest:
h5_file = P.DATA.H5DATA_TEST_FN
y = np.zeros(len(df))
qid = df['test_id'].values
else:
h5_file = P.DATA.H5DATA_FN
y = df['is_duplicate'].values
qid = df['id'].values
    if isSample:
        h5_file = "{}.s".format(h5_file)
        limit = int(0.1 * len(X1_embed))
    else:
        limit = len(X1_embed)
    # remove the file we are actually about to write (after the optional '.s' rename)
    if os.path.exists(h5_file):
        os.remove(h5_file)
    f = h5py.File(h5_file, 'w')  # 'w' truncates any existing file
f['qid'] = qid[:limit]
if not embed_only:
        f['X_basic'] = df[basic_features].values[:limit]  # (?, 26)
f['X1_feats'] = X1_feats[:limit] # (?, 300)
f['X2_feats'] = X2_feats[:limit]
    f['X1_embed'] = X1_embed[:limit]  # (?, MAX_SEQUENCE_LENGTH)
f['X2_embed'] = X2_embed[:limit]
f['y'] = y[:limit]
f.close()
log.info("Saved", data=h5_file)
return
In [ ]:
P.DATA.H5DATA_FN = "./data/train.s.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS) # build name
create_h5(P, df, isTest=False, isSample=True, embed_only=False)
#create_h5(P, df, isTest=False, embed_only=True)
In [ ]:
P.DATA.H5DATA_TEST_FN = "./data/test.s.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS) # build name
create_h5(P, df, isTest=True, isSample=True, embed_only=False)
#create_h5(P, df, isTest=False, embed_only=True)
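To verify what was written, a quick read-back of the datasets; adjust the path to whichever file create_h5 actually wrote (with isSample=True a '.s' suffix is appended to the name):
In [ ]:
# Read an HDF5 file produced by create_h5 and print dataset shapes
h5_path = "{}.s".format(P.DATA.H5DATA_TEST_FN)  # sample test file from the cell above
with h5py.File(h5_path, 'r') as f:
    for k in f.keys():
        print(k, f[k].shape)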
In [ ]: