In [1]:
from utils import *
# os / numpy / pandas are used below (utils may also provide them)
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re
import pickle
from tqdm import tqdm

# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

BASE_DIR = '.'
GLOVE_DIR = BASE_DIR + '/data/glove.6B/'
RAW_DATA_FN = BASE_DIR + '/data/train.csv'
MAX_SEQUENCE_LENGTH = 40  # 1000
VALIDATION_SPLIT = 0.2

# Output
OUTPUT_DIR = BASE_DIR + '/data/output'
WORD_INDEX_FN = os.path.join(OUTPUT_DIR, 'word_index.pckl')
EMB_INDEX_FN = os.path.join(OUTPUT_DIR, 'emb_index.pckl')
H5DATA_FN = os.path.join(OUTPUT_DIR, 'data.300.h5')

# Model Output
MODEL_NAME = '/yyy'
OUTPUT_DIR_MODEL = OUTPUT_DIR + MODEL_NAME
WEIGHTS_FN = OUTPUT_DIR_MODEL + '/weights.h5'
MODEL_FN = OUTPUT_DIR_MODEL + '/model.json'
MODEL_INTERRUPTED_FN = OUTPUT_DIR_MODEL + '/model_interrupted.h5'
CSV_FN = OUTPUT_DIR_MODEL + '/log.csv'
PLOT_MODEL_FN = OUTPUT_DIR_MODEL + '/model.png'
PLOT_BASIS_MODEL_FN = OUTPUT_DIR_MODEL + '/basis_model.png'
MONITOR_DISTANCE_FN = OUTPUT_DIR_MODEL + '/monitor_distance.csv'


Using TensorFlow backend.
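
The pickle and model files defined above are written into OUTPUT_DIR and OUTPUT_DIR_MODEL, so creating the directories up front avoids write errors later; a small optional step that is not in the original notebook:

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR_MODEL, exist_ok=True)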

In [3]:
def restore_sentence(array, ind2word):
    """Map an array of word indices back to a space-joined sentence, skipping the 0 padding value."""
    s = []
    for i in array:
        if i != 0:
            s.append(ind2word[i])
    return " ".join(s)

In [7]:
df = pd.read_csv(RAW_DATA_FN, nrows=1000)
#df = pd.read_csv(RAW_DATA_FN, encoding="ISO-8859-1")

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)

# dict mapping word -> idf score (idf_ holds inverse document frequencies, not full tf-idf)
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))


Out[8]:
<2000x4224 sparse matrix of type '<class 'numpy.float64'>'
	with 19730 stored elements in Compressed Sparse Row format>

In [18]:
word2tfidf;

In [19]:
# Set up spaCy
#from spacy.en import English
#parser = English()
import spacy
nlp = spacy.load('en')

In [20]:
vecs1 = []
for qu in tqdm(list(df['question1'])):
    doc = nlp(qu)
    # idf-weighted sum of the word vectors (despite the name, nothing is divided by the length)
    mean_vec = np.zeros(300)
    for word in doc:
        # spaCy word vector
        vec = word.vector
        # fetch idf score; words unseen by the TfidfVectorizer get weight 0
        idf = word2tfidf.get(str(word), 0)
        mean_vec += vec * idf
    vecs1.append(mean_vec)
df['q1_feats'] = list(vecs1)


100%|██████████| 1000/1000 [00:01<00:00, 651.94it/s]
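
The same idf-weighted pass can be repeated for question2; a minimal sketch, reusing nlp and word2tfidf from above (fillna("") guards against the NaNs mentioned later in the notebook):

vecs2 = []
for qu in tqdm(list(df['question2'].fillna(""))):
    doc = nlp(qu)
    weighted = np.zeros(300)
    for word in doc:
        # idf-weighted sum of word vectors, exactly as for question1
        weighted += word.vector * word2tfidf.get(str(word), 0)
    vecs2.append(weighted)
df['q2_feats'] = list(vecs2)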

In [4]:
import argparse
parser = argparse.ArgumentParser(description="programpurpose")
parser.add_argument("-s", "--sample", help="run on sample", action="store_true")
args = parser.parse_args(["-s"])
limit = 10

df = pd.read_csv(RAW_DATA_FN, encoding="ISO-8859-1")
y_train = df['is_duplicate'].values

q1, q2 = [], []  # list of text samples

qid = df['id'].values
q1 = df['question1'].values
q2 = df['question2'].fillna("").values  # non duplicates can be nan
if args.sample:
    q1 = q1[:limit]
    q2 = q2[:limit]
    df = df[:limit]
texts = np.concatenate([q1, q2], axis=0)

texts.shape
df.head()


Out[4]:
_StoreTrueAction(option_strings=['-s', '--sample'], dest='sample', nargs=0, const=True, default=False, type=None, choices=None, help='run on sample', metavar=None)
Out[4]:
(20,)
Out[4]:
id qid1 qid2 question1 question2 is_duplicate
0 0 1 2 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh... 0
1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... What would happen if the Indian government sto... 0
2 2 5 6 How can I increase the speed of my internet co... How can Internet speed be increased by hacking... 0
3 3 7 8 Why am I mentally very lonely? How can I solve... Find the remainder when [math]23^{24}[/math] i... 0
4 4 9 10 Which one dissolve in water quikly sugar, salt... Which fish would survive in salt water? 0

SpaCy


In [5]:
# Set up spaCy
#from spacy.en import English
#parser = English()
import spacy
nlp = spacy.load('en')

In [6]:
# word <-> frequency-rank lookups over the whole spaCy vocab
w2f = {lex.orth_: lex.rank for lex in nlp.vocab}
f2w = {lex.rank: lex.orth_ for lex in nlp.vocab}

In [115]:
nlp.vocab[0].orth_, nlp.vocab[0].rank


Out[115]:
('', 0)

In [116]:
[f2w[i] for i in range(1,19)]


Out[116]:
['.',
 ',',
 'the',
 'to',
 'a',
 'I',
 'of',
 'and',
 'is',
 'that',
 '\n\n',
 'it',
 'you',
 'in',
 ' ',
 '"',
 "'s",
 "n't"]

In [7]:
def get_embeddings(vocab):
    """Build an embedding matrix indexed by spaCy's frequency rank."""
    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
    # max_rank+1 rows so every rank from 0..max_rank gets a row (rank 0 is the empty lexeme)
    vectors = np.zeros((max_rank+1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank] = lex.vector
    return len(vectors), vectors

#num_words, embedding_matrix = get_embMatrix(word_index, embeddings_index)
num_words, embedding_matrix = get_embeddings(nlp.vocab)
embedding_matrix.shape


Out[7]:
(742224, 300)
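
Since get_embeddings fills the matrix by lex.rank, any row can be cross-checked against spaCy directly; a quick sanity check:

lex = nlp.vocab['the']
np.allclose(embedding_matrix[lex.rank], lex.vector)  # expected: True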

In [56]:
%%time
# vectorize the questions
vecs1 = [doc.vector for doc in nlp.pipe(df['question1'], n_threads=50)]
vecs1 =  np.array(vecs1)
df['q1_feats'] = list(vecs1)

vecs2 = [doc.vector for doc in nlp.pipe(df['question2'], n_threads=50)]
vecs2 =  np.array(vecs2)
df['q2_feats'] = list(vecs2)

# save features
pd.to_pickle(df, 'data/1_df.pkl')


CPU times: user 94.1 ms, sys: 4.59 ms, total: 98.6 ms
Wall time: 105 ms

In [65]:
df = pd.read_pickle('data/1_df.pkl')
df.head()

df.loc[1, 'question2']


Out[65]:
id qid1 qid2 question1 question2 is_duplicate q1_feats q2_feats
0 0 1 2 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh... 0 [0.0195267, 0.203426, 0.00217022, 0.0389881, 0... [0.0156703, 0.214883, -0.0177128, 0.0472451, 0...
1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... What would happen if the Indian government sto... 0 [-0.0691538, 0.26211, -0.0911685, -0.0305812, ... [-0.0571951, 0.172125, -0.100987, -0.00079805,...
2 2 5 6 How can I increase the speed of my internet co... How can Internet speed be increased by hacking... 0 [-0.0334206, 0.227167, -0.219999, -0.0785076, ... [-0.218246, 0.241675, -0.101896, -0.10092, -0....
3 3 7 8 Why am I mentally very lonely? How can I solve... Find the remainder when [math]23^{24}[/math] i... 0 [-0.0741342, 0.358056, -0.271047, -0.13483, 0.... [-0.0235632, 0.0672538, -0.0241042, -0.0751297...
4 4 9 10 Which one dissolve in water quikly sugar, salt... Which fish would survive in salt water? 0 [-0.0576909, 0.197554, 0.000139066, -0.12711, ... [-0.0745629, 0.0984321, 0.0713399, -0.144573, ...
Out[65]:
'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'

In [158]:
def get_featuresSpacy(docs, max_length):
    docs = list(docs)
    word2ind = dict()
    Xs = np.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            # look up the lower-cased lexeme so casing does not split the vocabulary
            token = nlp.vocab[token.lower_]
            word2ind[token.orth_] = token.rank
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank  # frequency rank; 0 stays reserved for padding
                j += 1
                if j >= max_length:
                    break

    log.info('Found %s unique tokens.' % len(word2ind))

    with open(WORD_INDEX_FN, 'wb') as f:
        pickle.dump(word2ind, f)
    log.info("Saved", data=WORD_INDEX_FN, vocab=len(word2ind))

    return Xs, word2ind

In [159]:
data, word2ind = get_featuresSpacy(list(nlp.pipe(texts, n_threads=40, batch_size=60000)), MAX_SEQUENCE_LENGTH)
data.shape


event='Found 123 unique tokens.'
vocab=123 data='./data/output/word_index.pckl' event='Saved'
Out[159]:
(20, 40)

In [146]:
word2ind


Out[146]:
{"'m": 75,
 'a': 5,
 'about': 49,
 'active': 2612,
 'am': 180,
 'and': 8,
 'back': 171,
 'be': 23,
 'buy': 341,
 'by': 66,
 'can': 53,
 'cap': 5500,
 'carbon': 3257,
 'company': 452,
 'connection': 2160,
 'di': 2837,
 'diamond': 244532,
 'divided': 5798,
 'do': 28,
 'does': 85,
 'far': 274,
 'find': 206,
 'fish': 2291,
 'for': 19,
 'free': 251,
 'from': 59,
 'games': 590,
 'good': 110,
 'government': 179,
 'great': 262,
 'guide': 4417,
 'hack': 3458,
 'hacking': 6069,
 'happen': 423,
 'how': 94,
 'i': 129,
 'if': 50,
 'in': 14,
 'increase': 1015,
 'increased': 2246,
 'instead': 377,
 'internet': 526,
 'invest': 2569,
 'is': 9,
 'it': 12,
 'keeps': 1755,
 'lonely': 546906,
 'market': 286,
 'me': 71,
 'mentally': 3434,
 'moon': 3417,
 'my': 54,
 'of': 7,
 'one': 64,
 'phone': 963,
 'rising': 4043,
 'salt': 4057,
 'say': 120,
 'share': 997,
 'should': 97,
 'solve': 1968,
 'speed': 1435,
 'step': 1129,
 'stole': 3530,
 'story': 385,
 'sugar': 2895,
 'sun': 2188,
 'survive': 2163,
 'that': 10,
 'the': 3,
 'this': 33,
 'through': 246,
 'to': 4,
 'triple': 65763,
 'use': 147,
 'using': 282,
 'very': 137,
 'video': 394,
 'water': 567,
 'what': 58,
 'when': 87,
 'which': 111,
 'while': 219,
 'why': 145,
 'would': 42,
 'you': 13}

In [105]:
data[0]


Out[105]:
array([ 142,    9,    3, 1129,   66, 1129, 4417,    4, 2569,   14,  997,  286,   14,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)
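
Because the entries are frequency ranks, a row can be decoded back into words with restore_sentence from above and the rank-to-word map f2w (the zeros are padding and are skipped; this assumes every rank in the row appears in f2w):

restore_sentence(data[0], f2w)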

In [69]:
# tokenize the questions
q1_docs = list(nlp.pipe(df['question1'], batch_size=5000, n_threads=8))
#dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))

with open('q1_docs.pckl', 'wb') as f:
    pickle.dump(q1_docs, f)

In [70]:
# tokenize the questions
q2_docs = list(nlp.pipe(df['question2'], batch_size=5000, n_threads=8))
#dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))

with open('q2_docs.pckl', 'wb') as f:
    pickle.dump(q2_docs, f)

In [71]:
len(q1_docs), len(q2_docs)
assert len(q1_docs) == len(q2_docs)


Out[71]:
(10, 10)

In [74]:
q1_feats = get_features(q1_docs, 50)

q1_feats.shape
q1_feats[0]
q1_docs[0]


Out[74]:
(10, 50)
Out[74]:
array([ 143,   10,    4, 1130,   67, 1130, 4418,    5, 2570,   15,  998,
        287,   15,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)
Out[74]:
What is the step by step guide to invest in share market in india?

In [76]:
# the frequency rank is not the same as the orth id used to index the vocab
w = q1_docs[0][0]
w, w.rank
nlp.vocab['What'].orth


Out[76]:
(What, 142)
Out[76]:
727

In [77]:
# how the orth id and the frequency rank relate
nlp.vocab[727].orth_
nlp.vocab[727].rank


Out[77]:
'What'
Out[77]:
142

In [80]:
w2f['What'], f2w[142]


Out[80]:
(142, 'What')

Keras


In [93]:
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100

In [147]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def get_featuresKeras(texts, max_length):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word2ind = tokenizer.word_index
    #ind2word = {v: k for k, v in word2ind.items()}
    log.info('Found %s unique tokens.' % len(word2ind))

    with open(WORD_INDEX_FN, 'wb') as f:
        pickle.dump(word2ind, f)
    log.info("Saved", data=WORD_INDEX_FN, vocab=len(word2ind))

    # note: maxlen=None pads only to the longest sequence, so max_length is not enforced here
    #Xs = pad_sequences(sequences, maxlen=max_length)
    Xs = pad_sequences(sequences, maxlen=None)
    return Xs, word2ind

In [148]:
#data = get_featuresSpacy(list(nlp.pipe(texts, n_threads=40, batch_size=60000)), MAX_SEQUENCE_LENGTH)
data, word2indK = get_featuresKeras(texts, MAX_SEQUENCE_LENGTH)
data.shape


event='Found 113 unique tokens.'
vocab=113 data='./data/output/word_index.pckl' event='Saved'
Out[148]:
(20, 17)
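
The (20, 17) shape comes from maxlen=None, which pads only to the longest sequence in this sample; padding to the fixed MAX_SEQUENCE_LENGTH instead would match the (20, 40) shape on the spaCy side. A quick check on the already-padded array:

pad_sequences(data, maxlen=MAX_SEQUENCE_LENGTH).shape  # expected: (20, 40)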

In [149]:
word2indK


Out[149]:
{'23': 44,
 '24': 28,
 'a': 7,
 'about': 26,
 'active': 91,
 'am': 25,
 'and': 5,
 'ascendant': 89,
 'astrology': 58,
 'back': 74,
 'be': 16,
 'buy': 103,
 'by': 11,
 'can': 8,
 'cap': 47,
 'capricorn': 17,
 'carbon': 71,
 'charter': 72,
 'childern': 109,
 'company': 65,
 'connection': 54,
 'dcx3400': 30,
 'di': 97,
 'diamond': 21,
 'dissolve': 108,
 'divided': 67,
 'dns': 102,
 'do': 10,
 'does': 40,
 'far': 77,
 'find': 88,
 'fish': 63,
 'for': 85,
 'free': 99,
 'from': 60,
 'games': 112,
 'geologist': 49,
 'good': 57,
 'government': 79,
 'great': 110,
 'guide': 37,
 'hack': 42,
 'hacking': 68,
 'happen': 90,
 'how': 6,
 'i': 1,
 "i'm": 82,
 'if': 95,
 'in': 4,
 'increase': 83,
 'increased': 78,
 'india': 61,
 'indian': 93,
 'instead': 31,
 'internet': 18,
 'invest': 38,
 'is': 9,
 'it': 86,
 'keeps': 84,
 'koh': 48,
 'kohinoor': 46,
 'lonely': 105,
 'market': 33,
 'math': 36,
 'me': 39,
 'mentally': 52,
 'methane': 104,
 'moon': 23,
 'motorola': 22,
 'motorolla': 69,
 'my': 34,
 'noor': 29,
 'of': 13,
 'one': 53,
 'oxide': 96,
 'phone': 55,
 'quikly': 113,
 'remainder': 75,
 'rising': 92,
 'salt': 19,
 'say': 51,
 'share': 41,
 'should': 27,
 'solve': 64,
 'speed': 24,
 'step': 12,
 'stole': 81,
 'story': 80,
 'sugar': 66,
 'sun': 45,
 'survive': 70,
 'that': 59,
 'the': 3,
 'this': 100,
 'through': 106,
 'tiago': 73,
 'to': 14,
 'triple': 76,
 'use': 32,
 'using': 87,
 'very': 56,
 'video': 101,
 'vpn': 111,
 'water': 35,
 'what': 2,
 'when': 15,
 'which': 50,
 'while': 62,
 'why': 94,
 'would': 20,
 'you': 43,
 'ã\x81\x97': 98,
 'ã\x82·': 107}

In [160]:
# find the difference between Keras and SpaCy
set(word2ind.keys()) - set(word2indK.keys())
set(word2indK.keys()) - set(word2ind.keys())


Out[160]:
{'"',
 '&',
 "'m",
 '(',
 ')',
 ',',
 '-',
 '...',
 '24,23',
 ':',
 '?',
 '[',
 ']',
 'math]23^{24}[/math'}
Out[160]:
{'23', '24', "i'm", 'math'}
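
Most of the spaCy-only tokens are punctuation, and [math]23^{24}[/math] falls apart into math, 23 and 24 on the Keras side: Keras' Tokenizer strips its filter characters and splits on whitespace before indexing. The default filter set can be inspected directly (attribute as in Keras 2; an assumption about the version in use):

Tokenizer().filters  # punctuation characters removed before splitting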

In [157]:
# list all 20 sample questions (no question starts with 'oxidee', so the filter keeps everything)
[s for s in texts if s.find('oxidee') != 0]


Out[157]:
['What is the step by step guide to invest in share market in india?',
 'What is the story of Kohinoor (Koh-i-Noor) Diamond?',
 'How can I increase the speed of my internet connection while using a VPN?',
 'Why am I mentally very lonely? How can I solve it?',
 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?',
 'Should I buy tiago?',
 'How can I be a good geologist?',
 'When do you use ã\x82· instead of ã\x81\x97?',
 'Motorola (company): Can I hack my Charter Motorolla DCX3400?',
 'What is the step by step guide to invest in share market?',
 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?',
 'How can Internet speed be increased by hacking through DNS?',
 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?',
 'Which fish would survive in salt water?',
 "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",
 'What keeps childern active and far from phone and video games?',
 'What should I do to be a great geologist?',
 'When do you use "&" instead of "and"?',
 'How do I hack Motorola DCX3400 for free internet?']

In [110]:
data[0]


Out[110]:
array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  2,  9,  3, 12, 11, 12, 37, 14, 38,  4, 41, 33,  4, 61], dtype=int32)
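
To read a padded row back as text, the ind2word mapping that is commented out in get_featuresKeras can be rebuilt from word2indK; index 0 is the padding value and is skipped by restore_sentence:

ind2word = {v: k for k, v in word2indK.items()}
restore_sentence(data[0], ind2word)
# 'what is the step by step guide to invest in share market in india'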

In [81]:
def index_emb(args):
    """
    1. build index, mapping words in the embeddings set to their embedding vector
    """
    log.info('Indexing word vectors.')

    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    log.info('Found word vectors.', n=len(embeddings_index))

    with open(EMB_INDEX_FN, 'wb') as f:
        pickle.dump(embeddings_index, f)
    log.info("Saved", fn=EMB_INDEX_FN)
    return embeddings_index

In [91]:
# embedding_matrix[0] stays all-zero (the padding index); Keras' word_index is 1-based,
# so the matrix needs len(word_index) + 1 rows when the vocabulary is smaller than MAX_NB_WORDS

def get_embMatrix(word_index, embeddings_index):
    '''
    compute an index, mapping words to known embeddings, by parsing the data dump of pre-trained embeddings
    '''
    log.info('Preparing embedding matrix.')

    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index will be/stay all-zeros.
            embedding_matrix[i] = embedding_vector
    return num_words, embedding_matrix

In [83]:
embeddings_index = index_emb(None)


event='Indexing word vectors.'
n=400000 event='Found word vectors.'
fn='./data/output/emb_index.pckl' event='Saved'

In [87]:
with open(WORD_INDEX_FN, 'rb') as f:
    word2ind = pickle.load(f)
with open(EMB_INDEX_FN, 'rb') as f:
    embeddings_index = pickle.load(f)

len(word2ind), len(embeddings_index)


Out[87]:
(95603, 400000)
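
A quick coverage check (not in the original notebook) shows how much of the question vocabulary actually has a GloVe vector; words that are missing keep all-zero rows in the embedding matrix:

covered = sum(1 for w in word2ind if w in embeddings_index)
covered, len(word2ind) - covered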

In [85]:
i = 0
for k,v in word2ind.items():
    print(k, v)
    i += 1
    if i == 10:
        break

word2ind["paperback'"]


saves 14976
haar 54918
sympatric 36379
penicillin 27085
80kmph 46024
29y 69185
'fahrenheit 61859
karol 69106
illustrations 24273
paperback' 62662
Out[85]:
62662

In [42]:
embeddings_index['hallo']


Out[42]:
array([ 0.36537999, -0.10879   ,  0.17691   ,  0.22384   , -0.31248999,
       -0.77702999,  0.083026  , -0.19175   ,  0.25918001, -0.38552001,
        0.31325999,  0.091299  , -0.65684003,  0.50344002,  0.1322    ,
       -0.24023999,  0.19908001, -0.21407001,  0.027921  ,  0.41314   ,
        0.87686002,  0.1165    , -0.36769   , -0.82401001,  0.11035   ,
        0.11297   ,  0.087235  ,  0.37029999,  0.37641999, -0.31900001,
       -0.15839   ,  0.060404  ,  0.63230002,  0.88050997, -0.0071794 ,
        0.80598998,  0.20733   , -0.70696002, -0.2325    , -0.42739001,
        0.70345002,  0.15075999, -0.17053001,  0.92660999, -0.20591   ,
       -0.34477001,  0.13541999, -0.0033754 , -0.40729001,  0.17761   ,
        0.058826  ,  0.026633  , -0.19412   , -0.22645999, -0.37819999,
        0.71743   ,  0.75711   ,  0.039923  , -0.72781998, -0.52138001,
       -0.10117   , -0.84917003, -0.77096999, -0.59053999, -0.50967002,
        0.67018002, -0.56453001, -0.1392    , -0.17783999, -0.060695  ,
       -0.33917001,  0.46097001, -0.71060002,  0.047548  , -0.12077   ,
        0.685     ,  0.069214  ,  0.42223999,  0.18037   ,  0.15075   ,
        0.19465999,  0.045795  ,  0.15576001, -0.20928   ,  0.55207002,
        0.20095   , -0.35288   , -0.64231002,  0.28084999, -0.19318999,
        0.032665  , -0.64752001,  0.39320999,  0.41231   ,  0.12662999,
        0.046476  ,  0.24716   , -0.42932001, -0.93985999,  0.16095001], dtype=float32)

In [90]:
num_wordsK, embedding_matrixK = get_embMatrix(word2ind, embeddings_index)
embedding_matrixK.shape


event='Preparing embedding matrix.'
Out[90]:
(20000, 100)
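
A minimal sketch of how this matrix would typically be consumed: it becomes the frozen weights of a Keras Embedding layer that looks up the word2ind indices produced above. This is an assumption about the intended downstream model, not something built in this notebook:

from keras.layers import Embedding

embedding_layer = Embedding(num_wordsK, EMBEDDING_DIM,
                            weights=[embedding_matrixK],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)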