In [1]:
from utils import *  # assumed to provide pd, np, os and the log object used below
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re
import pickle
from tqdm import tqdm
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]
BASE_DIR = '.'
GLOVE_DIR = BASE_DIR + '/data/glove.6B/'
RAW_DATA_FN = BASE_DIR + '/data/train.csv'
MAX_SEQUENCE_LENGTH = 40 # 1000
VALIDATION_SPLIT = 0.2
# Output
OUTPUT_DIR = BASE_DIR + '/data/output'
WORD_INDEX_FN = os.path.join(OUTPUT_DIR, 'word_index.pckl')
EMB_INDEX_FN = os.path.join(OUTPUT_DIR, 'emb_index.pckl')
H5DATA_FN = os.path.join(OUTPUT_DIR, 'data.300.h5')
# Model Output
MODEL_NAME = '/yyy'
OUTPUT_DIR_MODEL = OUTPUT_DIR + MODEL_NAME
WEIGHTS_FN = OUTPUT_DIR_MODEL + '/weights.h5'
MODEL_FN = OUTPUT_DIR_MODEL + '/model.json'
MODEL_INTERRUPTED_FN = OUTPUT_DIR_MODEL + '/model_interrupted.h5'
CSV_FN = OUTPUT_DIR_MODEL + '/log.csv'
PLOT_MODEL_FN = OUTPUT_DIR_MODEL + '/model.png'
PLOT_BASIS_MODEL_FN = OUTPUT_DIR_MODEL + '/basis_model.png'
MONITOR_DISTANCE_FN = OUTPUT_DIR_MODEL + '/monitor_distance.csv'
In [3]:
def restore_sentence(array, ind2word):
    # rebuild a sentence from an index array, skipping the padding index 0
    s = []
    for i in array:
        if i != 0:
            s.append(ind2word[i])
    return " ".join(s)
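A minimal usage sketch (the toy mapping below is hypothetical; in this notebook an ind2word mapping is only obtained later by inverting a word index):
toy_ind2word = {1: 'what', 2: 'is', 3: 'this'}
restore_sentence([1, 2, 3, 0, 0], toy_ind2word)  # -> 'what is this'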
In [7]:
df = pd.read_csv(RAW_DATA_FN, nrows=1000)
#df = pd.read_csv(RAW_DATA_FN, encoding="ISO-8859-1")
In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(df['question1']) + list(df['question2'])
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)
# dict mapping each word to its IDF weight (tfidf.idf_, not a per-document tf-idf score)
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
Out[8]:
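The next cells look individual tokens up in word2tfidf, so a quick sanity check helps here (the words below are only illustrative; the values depend on the sampled rows):
# missing words fall back to 0.0, mirroring the try/except used in the weighting loop below
for w in ['What', 'the', 'India']:
    print(w, word2tfidf.get(w, 0.0))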
In [18]:
word2tfidf;
In [19]:
# Set up spaCy
#from spacy.en import English
#parser = English()
import spacy
nlp = spacy.load('en')
In [20]:
vecs1 = []
for qu in tqdm(list(df['question1'])):
    doc = nlp(qu)
    mean_vec = np.zeros([len(doc), 300])
    for word in doc:
        # word2vec
        vec = word.vector
        # fetch idf score; words unseen by the TfidfVectorizer get weight 0
        try:
            idf = word2tfidf[str(word)]
        except KeyError:
            idf = 0
        # accumulate the idf-weighted vector (broadcast over all rows of mean_vec)
        mean_vec += vec * idf
    # NOTE: because of the broadcasting above, this yields the idf-weighted *sum*, not a mean
    mean_vec = mean_vec.mean(axis=0)
    vecs1.append(mean_vec)
df['q1_feats'] = list(vecs1)
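As noted in the comment above, the loop effectively produces an idf-weighted sum. For comparison, a sketch of an explicitly weighted mean (idf_weighted_mean is a hypothetical helper, not the variant used elsewhere in this notebook):
def idf_weighted_mean(doc, word2tfidf, dim=300):
    # stack per-token vectors and idf weights, then average with explicit weights
    vecs = np.array([w.vector for w in doc]) if len(doc) else np.zeros((1, dim))
    idfs = np.array([word2tfidf.get(str(w), 0.0) for w in doc]) if len(doc) else np.zeros(1)
    total = idfs.sum()
    return (vecs * idfs[:, None]).sum(axis=0) / total if total > 0 else vecs.mean(axis=0)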
In [4]:
import argparse
parser = argparse.ArgumentParser(description="programpurpose")
parser.add_argument("-s", "--sample", help="run on sample", action="store_true")
args = parser.parse_args(["-s"])
limit = 10
df = pd.read_csv(RAW_DATA_FN, encoding="ISO-8859-1")
y_train = df['is_duplicate'].values
q1, q2 = [], [] # list of text samples
qid = df['id'].values
q1 = df['question1'].values
q2 = df['question2'].fillna("").values  # non-duplicates can be NaN
if args.sample:
    q1 = q1[:limit]
    q2 = q2[:limit]
    df = df[:limit]
texts = np.concatenate([q1, q2], axis=0)
texts.shape
df.head()
Out[4]:
Out[4]:
Out[4]:
In [5]:
# Set up spaCy
#from spacy.en import English
#parser = English()
import spacy
nlp = spacy.load('en')
In [6]:
w2f = {lex.orth_: lex.rank for lex in nlp.vocab}
f2w = {lex.rank: lex.orth_ for lex in nlp.vocab}
In [115]:
nlp.vocab[0].orth_, nlp.vocab[0].rank
Out[115]:
In [116]:
[f2w[i] for i in range(1,19)]
Out[116]:
In [7]:
def get_embeddings(vocab):
    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
    # rank+1 because index 0 is NULL
    vectors = np.zeros((max_rank + 1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank] = lex.vector
    return len(vectors), vectors
#num_words, embedding_matrix = get_embMatrix(word_index, embeddings_index)
num_words, embedding_matrix = get_embeddings(nlp.vocab)
embedding_matrix.shape
Out[7]:
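A quick check that the rows of embedding_matrix line up with spaCy's rank indexing (illustrative; 'what' is just an example word and is assumed to have a vector in the loaded model):
lex = nlp.vocab['what']
if lex.has_vector:
    print(np.allclose(embedding_matrix[lex.rank], lex.vector))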
In [56]:
%%time
# vectorize the questions
vecs1 = [doc.vector for doc in nlp.pipe(df['question1'], n_threads=50)]
vecs1 = np.array(vecs1)
df['q1_feats'] = list(vecs1)
vecs2 = [doc.vector for doc in nlp.pipe(df['question2'], n_threads=50)]
vecs2 = np.array(vecs2)
df['q2_feats'] = list(vecs2)
# save features
pd.to_pickle(df, 'data/1_df.pkl')
In [65]:
df = pd.read_pickle('data/1_df.pkl')
df.head()
df.loc[1, 'question2']
Out[65]:
Out[65]:
In [158]:
def get_featuresSpacy(docs, max_length):
    docs = list(docs)
    word2ind = dict()
    Xs = np.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            token = nlp.vocab[token.lower_]
            word2ind[token.orth_] = token.rank
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank  # frequency rank; 0 means padding or no vector
                j += 1
                if j >= max_length:
                    break
    log.info('Found %s unique tokens.' % len(word2ind))
    with open(WORD_INDEX_FN, 'wb') as f:
        pickle.dump(word2ind, f)
    log.info("Saved", data=WORD_INDEX_FN, vocab=len(word2ind))
    return Xs, word2ind
In [159]:
data, word2ind = get_featuresSpacy(list(nlp.pipe(texts, n_threads=40, batch_size=60000)), MAX_SEQUENCE_LENGTH)
data.shape
Out[159]:
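To eyeball the result, the rank rows can be mapped back to text with restore_sentence from above (a quick check; it assumes every rank stored in data also appears in word2ind, which holds by construction):
ind2word = {rank: word for word, rank in word2ind.items()}
restore_sentence(data[0], ind2word)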
In [146]:
word2ind
Out[146]:
In [105]:
data[0]
Out[105]:
In [69]:
# tokenize the questions
q1_docs = list(nlp.pipe(df['question1'], batch_size=5000, n_threads=8))
#dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
with open('q1_docs.pckl', 'wb') as f:
pickle.dump(q1_docs, f)
In [70]:
# tokenize the questions
q2_docs = list(nlp.pipe(df['question2'], batch_size=5000, n_threads=8))
#dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
with open('q2_docs.pckl', 'wb') as f:
pickle.dump(q2_docs, f)
In [71]:
len(q1_docs), len(q2_docs)
assert len(q1_docs) == len(q2_docs)
Out[71]:
In [74]:
# get_features: presumably an earlier/utils variant of get_featuresSpacy that returns only the matrix
q1_feats = get_features(q1_docs, 50)
q1_feats.shape
q1_feats[0]
q1_docs[0]
Out[74]:
Out[74]:
Out[74]:
In [76]:
# frequency == rank != index in vocab
w = q1_docs[0][0]
w, w.rank
nlp.vocab['What'].orth
Out[76]:
Out[76]:
In [77]:
# how the integer vocab index (orth id) and the frequency rank relate
nlp.vocab[727].orth_
nlp.vocab[727].rank
Out[77]:
Out[77]:
In [80]:
w2f['What'], f2w[142]
Out[80]:
In [93]:
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
In [147]:
from keras.preprocessing.text import Tokenizer          # may already be provided by utils
from keras.preprocessing.sequence import pad_sequences  # may already be provided by utils

def get_featuresKeras(texts, max_length):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word2ind = tokenizer.word_index
    #ind2word = {v: k for k, v in word2ind.items()}
    log.info('Found %s unique tokens.' % len(word2ind))
    with open(WORD_INDEX_FN, 'wb') as f:
        pickle.dump(word2ind, f)
    log.info("Saved", data=WORD_INDEX_FN, vocab=len(word2ind))
    # NOTE: maxlen=None pads to the longest sequence, so max_length is currently ignored
    #Xs = pad_sequences(sequences, maxlen=max_length)
    Xs = pad_sequences(sequences, maxlen=None)
    return Xs, word2ind
In [148]:
#data = get_featuresSpacy(list(nlp.pipe(texts, n_threads=40, batch_size=60000)), MAX_SEQUENCE_LENGTH)
data, word2indK = get_featuresKeras(texts, MAX_SEQUENCE_LENGTH)
data.shape
Out[148]:
In [149]:
word2indK
Out[149]:
In [160]:
# compare the vocabularies produced by the Keras and spaCy tokenizations
set(word2ind.keys()) - set(word2indK.keys())
set(word2indK.keys()) - set(word2ind.keys())
Out[160]:
Out[160]:
In [157]:
[s for s in texts if 'oxidee' in s]  # questions that actually contain 'oxidee'
Out[157]:
In [110]:
data[0]
Out[110]:
In [81]:
def index_emb(args):
    """
    1. build index, mapping words in the embeddings set to their embedding vector
    """
    log.info('Indexing word vectors.')
    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    log.info('Found word vectors.', n=len(embeddings_index))
    with open(EMB_INDEX_FN, 'wb') as f:
        pickle.dump(embeddings_index, f)
    log.info("Saved", fn=EMB_INDEX_FN)
    return embeddings_index
In [91]:
# embedding_matrix[0] == 0
# no offset of 1 needed as seen in some examples
def get_embMatrix(word_index, embeddings_index):
    '''
    build the embedding matrix for the given word index from the pre-trained embeddings index
    '''
    log.info('Preparing embedding matrix.')
    num_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= num_words:  # also guards against an IndexError when len(word_index) < MAX_NB_WORDS
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index will be/stay all-zeros
            embedding_matrix[i] = embedding_vector
    return num_words, embedding_matrix
In [83]:
embeddings_index = index_emb(None)
In [87]:
with open(WORD_INDEX_FN, 'rb') as f:
    word2ind = pickle.load(f)
with open(EMB_INDEX_FN, 'rb') as f:
    embeddings_index = pickle.load(f)
len(word2ind), len(embeddings_index)
Out[87]:
In [85]:
i = 0
for k, v in word2ind.items():
    print(k, v)
    i += 1
    if i == 10:
        break
word2ind["paperback'"]
Out[85]:
In [42]:
embeddings_index['hallo']
Out[42]:
In [90]:
num_wordsK, embedding_matrixK = get_embMatrix(word2ind, embeddings_index)
embedding_matrixK.shape
Out[90]:
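As a sketch of how num_wordsK and embedding_matrixK would typically be wired into a Keras model (illustrative only; the actual model built from these weights is defined elsewhere, and input_length follows the MAX_SEQUENCE_LENGTH constant above even though the data here was padded with maxlen=None):
from keras.layers import Embedding

# frozen embedding layer initialised with the pre-trained GloVe weights
embedding_layer = Embedding(num_wordsK,
                            EMBEDDING_DIM,
                            weights=[embedding_matrixK],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)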