PrepData for Quora Kaggle Competition

Imports


In [ ]:
# imports
import sys, os, argparse, logging  # NOQA
from pprint import pprint
from twBase import *  # NOQA
from twQuoraRun import *  # NOQA
from tqdm import tqdm

import string
import spacy
#import gensim
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

# modules used below; imported explicitly in case the star imports above do not already provide them
import re
import pickle
import h5py
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer

# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

NLP = spacy.load('en1G')
#NLP = spacy.load('en')

# Allow relative imports from directories above the cwd
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# constants
#scriptPath = print(os.path.abspath(__file__))

In [ ]:
params = {
        "DATA": {
            "BASE_DIR": "./data",
            "GLOVE_DIR": './data/glove.6B',
            "DATA_FN": './data/train.csv',
            "DATA_TEST_FN": './data/test.csv',
            "H5DATA_FN": None,
            "H5DATA_TEST_FN": None,
            "EMBED_FN": './data/emb.300.200k.npy',
            "isSample": False,
            "embedOnly": False,
        },
        "SPACY": {
            "MODEL": "en1G",  # en1G
            #"MODEL": "en",  # en1G
        },
        "MODEL": {
            "MAX_SEQUENCE_LENGTH": 40,  # 1000
            "MAX_NB_WORDS": 200000,
            "EMBED_DIM": 300,
            "BASIC_FEAT_DIM": 26,
            #"NAME": "brnn.maxp.cos.1.60",
            "SUFFIX": "1.40",
            "NAME": None,
            #"CLASS": "xxx",
            #"CLASS": "CosModel",
            #"CLASS": "twEmbeddingModel",
            #"CLASS": "MultiModel",
            #"CLASS": "MultiSiameseModel",
            "CLASS": "MultiSiameseModelAll",
            "isLoad": None,
            "RNN_TOPOLOGY": [100],
            "TOPOLOGY": [512, 256, 1024],
            "DROPOUT": [0.4, 0.4],
            "REGULIZER": [0.0, 0.0],  # 0.005
            "OPTIMIZER": ['Adam', {"lr": 0.001}]
        },
        'SIMILARITY': {
            #'mode': 'cosine',
            'mode': 'euclidean',
            'gamma': 1,
            'c': 1,
            'd': 2,
            'dropout': 0.5,
        },
        "TRAINING": {
            "DEVICE": "/cpu:0",
            "VALIDATION_SPLIT": 0.2,
            "BATCH_SIZE": 2048,
            "EPOCHS": 50,
            "PATIENCE": 10,
            "DECAY_EPOCH": 25,
            "isShuffle": False,
            "VERBOSITY": 1
        },
        "OUTPUT": {
            "BASE_DIR": "./data/out",
            "MODEL_DIR": None
        }
    }

# params generated by jinja
###{{params}}###
P = Struct(params)
P.DATA.H5DATA_FN = "./data/train.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS)  # build name
P.DATA.H5DATA_TEST_FN = "./data/test.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS)  # build name
P.MODEL.NAME = "{}.{}".format(P.MODEL.CLASS, P.MODEL.SUFFIX)  # build name
P.OUTPUT.MODEL_DIR = os.path.join(P.OUTPUT.BASE_DIR,
                                  "{}.{}.{}".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS),
                                  P.MODEL.NAME)  # build output dir


P

Load data


In [ ]:
dname = 'test'
df = pd.read_csv('./data/{}.csv'.format(dname), nrows=None)
df.info()

In [ ]:
dname = 'test'
#path = os.path.join(P.DATA.BASE_DIR, '{}.feat2.pckl'.format(dname))  # default location
path = os.path.join('/mnt/data', '{}.feat2.pckl'.format(dname))  # override: load from the mounted data directory
df = pickle.load(open(path, 'rb'))
df.info()

Load Test Data, HDF5

Load the split data from pd.HDFStore and h5py back into a dataframe for processing with 'create_h5'


In [ ]:
path = os.path.join(P.DATA.BASE_DIR,'df.test.40.200k.store.h5')
s = pd.HDFStore(path, mode='r')
df = s['df']
df.info()

In [ ]:
%%time
path = os.path.join(P.DATA.BASE_DIR,'df.test.40.200k.hdf5')
f = h5py.File(path, "r")
df['q1_feats'] = list(f['q1_feats'])
df['q2_feats'] = list(f['q2_feats'])
df.info()

In [ ]:
df.info()

Clean data


In [ ]:
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text)
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)

    text = re.sub(r"\'s", " 's ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)

    # Remove punctuation from text
    text = ''.join([c for c in text if c not in SYMBOLS])

    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in STOPLIST]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text

In [ ]:
def clean_data(P, df, remove_stop_words):
    # fill missing questions before casting to str (astype(str) turns NaN into the string 'nan')
    df['question1'] = df['question1'].fillna('empty')
    df['question2'] = df['question2'].fillna('empty')
    df['question1'] = df['question1'].astype(str)
    df['question2'] = df['question2'].astype(str)

    log.info('Cleaning q1')
    cleanedTexts = []
    for text in tqdm(df['question1']):
        cleanedTexts.append(text_to_wordlist(text, remove_stop_words=remove_stop_words))
    df['q1_cleaned'] = cleanedTexts

    log.info('Cleaning q2')
    cleanedTexts = []
    for text in tqdm(df['question2']):
        cleanedTexts.append(text_to_wordlist(text, remove_stop_words=remove_stop_words))
    df['q2_cleaned'] = cleanedTexts
    path = os.path.join(P.DATA.BASE_DIR, '{}.clean.pckl'.format(dname))
    with open(path, 'wb') as f:
        pickle.dump(df, f)
    return df

In [ ]:
df = clean_data(P, df, remove_stop_words = False)

Feat1: Basic


In [ ]:
def create_feat1(P, df):
    with tqdm(total=150) as pbar:
        df['len_q1'] = df.question1.apply(lambda x: len(str(x)))
        pbar.update(10)
        df['len_q2'] = df.question2.apply(lambda x: len(str(x)))
        pbar.update(10)
        df['diff_len'] = df.len_q1 - df.len_q2
        pbar.update(10)
        df['len_char_q1'] = df.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
        pbar.update(10)
        df['len_char_q2'] = df.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
        pbar.update(10)
        df['len_word_q1'] = df.question1.apply(lambda x: len(str(x).split()))
        pbar.update(10)
        df['len_word_q2'] = df.question2.apply(lambda x: len(str(x).split()))
        pbar.update(10)
        df['common_words'] = df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
        pbar.update(10)
        df['fuzz_qratio'] = df.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
        pbar.update(10)
        df['fuzz_WRatio'] = df.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
        pbar.update(10)
        df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
        pbar.update(10)
        df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
        pbar.update(10)
        df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
        pbar.update(10)
        df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
        pbar.update(10)
        df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
        pbar.update(10)

    path = os.path.join(P.DATA.BASE_DIR, '{}.feat1.pckl'.format(dname))
    with open(path, 'wb') as f:
        pickle.dump(df, f)
    return df

In [ ]:
df = create_feat1(P, df)

Feat2: Sent2Vec


In [ ]:
def create_word2tfidf(P):
    nrows = 100 if P.DATA.isSample else None
    #sources = [P.DATA.DATA_FN, P.DATA.DATA_TEST_FN]  # test data adds too much noise due to machine-generated questions
    sources = [P.DATA.DATA_FN]
    questions = []
    for source in sources:
        df = pd.read_csv(source, nrows=nrows)
        df['question1'] = df['question1'].astype(str)
        df['question2'] = df['question2'].astype(str)
        # merge texts
        questions.extend(list(df['question1']))
        questions.extend(list(df['question2']))

    tfidf = TfidfVectorizer(lowercase=False)
    tfidf.fit_transform(questions)

    # dict mapping word -> idf score
    word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
    path = os.path.join(P.DATA.BASE_DIR, "word2tfidf.pckl")
    with open(path, 'wb') as f:
        pickle.dump(word2tfidf, f)
    log.info("Created and saved.", fn=path)

In [ ]:
create_word2tfidf(P)

In [ ]:
def sent2vec(sents, word2tfidf):
    vecs1 = []
    log.info("Creating vectors out of questions")
    for qu in tqdm(sents):
        doc = NLP(qu)
        mean_vec = np.zeros([len(doc), 300])
        for word in doc:
            # word2vec
            vec = word.vector
            # fetch df score
            try:
                idf = word2tfidf[str(word)]
            except KeyError:
                # word not seen by the TfidfVectorizer
                idf = 0
            # compute final vec
            mean_vec += vec * idf
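        # each row accumulated the same idf-weighted sum, so the row-mean
        # below equals the idf-weighted sum of the word vectors (not a true mean)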
        mean_vec = mean_vec.mean(axis=0)
        vecs1.append(mean_vec)
    return vecs1

In [ ]:
def create_feat2(P, df):
    with open(os.path.join(P.DATA.BASE_DIR, "word2tfidf.pckl"), 'rb') as f:
        word2tfidf = pickle.load(f)
    log.info("Word2tfidf loaded.")

    vecs = sent2vec(list(df['q1_cleaned']), word2tfidf)
    df['q1_feats'] = vecs
    vecs = sent2vec(list(df['q2_cleaned']), word2tfidf)
    df['q2_feats'] = vecs

    path = os.path.join(P.DATA.BASE_DIR, '{}.feat2.pckl'.format(dname))
    with open(path, 'wb') as f:
        pickle.dump(df, f)
    return df

In [ ]:
df = create_feat2(P, df)

Feat3: Distances


In [ ]:
def create_feat3(P, df):
    with tqdm(total=150) as pbar:
        # prep array: shape (?, 300), out of (?,) + (300,)
        q1 = np.concatenate(df['q1_feats']).reshape((len(df), P.MODEL.EMBED_DIM))
        q1 = np.nan_to_num(q1)
        pbar.update(20)
        q2 = np.concatenate(df['q2_feats']).reshape((len(df), P.MODEL.EMBED_DIM))
        q2 = np.nan_to_num(q2)
        pbar.update(20)

        df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(q1, q2)]
        pbar.update(10)

        df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(q1, q2)]
        pbar.update(10)

        df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(q1, q2)]
        pbar.update(10)

        df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(q1, q2)]
        pbar.update(10)

        df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(q1, q2)]
        pbar.update(10)

        df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(q1, q2)]
        pbar.update(10)

        df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(q1, q2)]
        pbar.update(10)

        df['skew_q1vec'] = [skew(x) for x in q1]
        pbar.update(10)

        df['skew_q2vec'] = [skew(x) for x in q2]
        pbar.update(10)

        df['kur_q1vec'] = [kurtosis(x) for x in q1]
        pbar.update(10)

        df['kur_q2vec'] = [kurtosis(x) for x in q2]
        pbar.update(10)

        #path = os.path.join(P.DATA.BASE_DIR, '{}.feat3.pckl'.format(dname))
        #with open(path, 'wb') as f:
        #    pickle.dump(df, f)
    return df

In [ ]:
df = create_feat3(P, df)

In [ ]:
path = '/mnt/data/test.feat3.pckl'
with open(path, 'wb') as f:
    pickle.dump(df, f)

In [ ]:
df.info()

Feat4: Sentence Embedding


In [ ]:
def get_featuresSpacy(docs, max_length):
    '''
    Create zero-padded sequences of token frequency ranks for lookup in the embedding layer.
    Returns the index matrix Xs and a word -> rank mapping.
    '''
    docs = list(docs)
    word2ix = dict()
    Xs = np.zeros((len(docs), max_length), dtype='int32')
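    # Xs is zero-padded; entry (i, j) stores the token's frequency rank (token.rank),
    # later used as a row index into the embedding matrix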
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            word2ix[token.orth_] = token.rank
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank  # frequency rank, 0: no vector existent?
                j += 1
                if j >= max_length:
                    break
    return Xs, word2ix

In [ ]:
def create_feat4(P, df):
    q1 = df['q1_cleaned'].values
    q2 = df['q2_cleaned'].values
    texts = np.concatenate([q1, q2], axis=0)
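    # run q1 and q2 through a single NLP.pipe pass below, then split the result back in half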

    log.info('Questions q1, q2 read', nq1=len(q1), nq2=len(q2), ntexts=len(texts))

    data, word2ix = get_featuresSpacy(list(NLP.pipe(texts, parse=False, tag=False, entity=False, n_threads=-1, batch_size=10000)), P.MODEL.MAX_SEQUENCE_LENGTH)
    #if isTest:  # Predictions
    #    path = "{}/word2ixSpacy_test.pckl".format(P.DATA.BASE_DIR)
    #else:  # Training
    #    path = "{}/word2ixSpacy.pckl".format(P.DATA.BASE_DIR)
    #with open(path, 'wb') as f:
    #    pickle.dump(word2ix, f)
    #log.info("Saved", data=path)

    q1_data = data[:len(q1)]
    q2_data = data[len(q1):]
    log.info('Shape of data tensors:', q1_data=q1_data.shape, q2_data=q2_data.shape)
    df['q1_embed'] = list(q1_data)
    df['q2_embed'] = list(q2_data)

    path = os.path.join(P.DATA.BASE_DIR, '{}.feat4.pckl'.format(dname))
    with open(path, 'wb') as f:
        pickle.dump(df, f)

    return df

In [ ]:
df = create_feat4(P, df)

In [ ]:
df.info()

Create H5 Datafile


In [ ]:
def create_h5(P, df, isTest=False, isSample=False, embed_only=False):
    #embed_only = P.DATA.embedOnly
    if not embed_only:
        cols = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
            'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
            'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_qratio',
            'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
            'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio',
            'fuzz_token_sort_ratio', 'q1_feats', 'q2_feats', 'cosine_distance',
            'cityblock_distance', 'jaccard_distance', 'canberra_distance',
            'euclidean_distance', 'minkowski_distance', 'braycurtis_distance',
            'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec', 'q1_embed',
            'q2_embed']

        basic_features = [ 'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
            'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_qratio',
            'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
            'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio',
            'fuzz_token_sort_ratio', 'cosine_distance',
            'cityblock_distance', 'jaccard_distance', 'canberra_distance',
            'euclidean_distance', 'minkowski_distance', 'braycurtis_distance',
            'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']

        #X_basic = df[basic_features].values  # (?, 27)
        X1_feats = df['q1_feats'].values
        X1_feats = np.concatenate(X1_feats).reshape((len(df), len(X1_feats[0])))
        X2_feats = df['q2_feats'].values
        X2_feats = np.concatenate(X2_feats).reshape((len(df), len(X2_feats[0])))
        
    X1_embed = df['q1_embed'].values
    X1_embed = np.concatenate(X1_embed).reshape((len(df), len(X1_embed[0])))
    X2_embed = df['q2_embed'].values
    X2_embed = np.concatenate(X2_embed).reshape((len(df), len(X2_embed[0])))

    if isTest:
        h5_file = P.DATA.H5DATA_TEST_FN
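        # the test set has no labels; store zeros as placeholders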
        y = np.zeros(len(df))
        qid = df['test_id'].values
    else:
        h5_file = P.DATA.H5DATA_FN
        y = df['is_duplicate'].values
        qid = df['id'].values

    if isSample:
        h5_file = "{}.s".format(h5_file)  # sample output gets a '.s' suffix
        limit = int(0.1 * len(X1_embed))
    else:
        limit = len(X1_embed)

    # remove any stale file once the final name is known, then open for writing
    if os.path.exists(h5_file):
        os.remove(h5_file)

    f = h5py.File(h5_file, 'w')
    f['qid'] = qid[:limit]
    if not embed_only:
        f['X_basic'] = df[basic_features].values[:limit]  # (?, 26) basic features
        f['X1_feats'] = X1_feats[:limit]  # (?, 300) sent2vec features
        f['X2_feats'] = X2_feats[:limit]
    f['X1_embed'] = X1_embed[:limit]  # (?, MAX_SEQUENCE_LENGTH) token-rank sequences
    f['X2_embed'] = X2_embed[:limit]
    f['y'] = y[:limit]
    f.close()
    log.info("Saved", data=h5_file)
    return

In [ ]:
P.DATA.H5DATA_FN = "./data/train.s.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS)  # build name

create_h5(P, df, isTest=False, isSample=True, embed_only=False)
#create_h5(P, df, isTest=False, embed_only=True)

In [ ]:
P.DATA.H5DATA_TEST_FN = "./data/test.s.{}.{}.{}.h5".format(P.MODEL.MAX_SEQUENCE_LENGTH, P.MODEL.EMBED_DIM, P.MODEL.MAX_NB_WORDS)  # build name


create_h5(P, df, isTest=True, isSample=True, embed_only=False)
#create_h5(P, df, isTest=False, embed_only=True)
