In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# import re
# import gc
import codecs
# import matplotlib.pyplot as plt
# import seaborn as sns
# import tensorflow as tf
# from bs4 import BeautifulSoup
# from nltk.corpus import stopwords
# from keras.preprocessing.text import Tokenizer
# %matplotlib inline
%load_ext autotime
# pal = sns.color_palette()
In [4]:
# Paths
# Paths: prefer a local ./data directory; fall back to the shared dataset mounts.
# (Indentation of this if/else was lost in the notebook export — restored here.)
if os.path.isdir('data'):
    QUORA_DATA_DIR = "data/"
    GLOVE_DATA_DIR = "data/"
else:
    QUORA_DATA_DIR = "/opt/datasets/quora/"
    GLOVE_DATA_DIR = "/opt/datasets/glove/"

# Quora question-pairs CSVs and the pre-trained GloVe embeddings file.
TRAIN_CSV = QUORA_DATA_DIR + 'train.csv'
TEST_CSV = QUORA_DATA_DIR + 'test.csv'
glove_840B_300d = GLOVE_DATA_DIR + 'glove.840B.300d.txt'
GLOVE_DATA_FILE = glove_840B_300d
In [5]:
# Model / embedding hyper-parameters.
# (Removed a duplicate `EMBEDDING_DIM = 300` assignment that appeared twice.)
EMBEDDING_DIM = 300        # dimensionality of the GloVe vectors (840B.300d)
MAX_SEQUENCE_LENGTH = 45   # questions padded/truncated to this many tokens
MAX_NB_WORDS = 200000      # vocabulary size cap
VALIDATION_SPLIT = 0.01    # fraction of training data held out for validation
In [6]:
# Load the Quora question-pairs data.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)
# Train Data
train_feature_1_string = pd.Series(df_train['question1'].tolist()).astype(str)
train_feature_2_string = pd.Series(df_train['question2'].tolist()).astype(str)
target = pd.Series(df_train['is_duplicate'].tolist())
# Element-wise string concatenation: each element is "q1q2" (no separator)
# for one train pair.
all_train_qs = train_feature_1_string + train_feature_2_string
# Test Data
test_feature_1_string = pd.Series(df_test['question1'].tolist()).astype(str)
test_feature_2_string = pd.Series(df_test['question2'].tolist()).astype(str)
all_test_qs = test_feature_1_string + test_feature_2_string
# NOTE(review): this `+` is pandas Series addition ALIGNED BY INDEX, not
# concatenation of the two collections. With different train/test lengths it
# yields combined strings where indexes overlap and NaN elsewhere — likely not
# intended. Consider pd.concat([all_train_qs, all_test_qs], ignore_index=True)
# — TODO confirm against the "Total number of questions" print below.
all_qs = all_train_qs + all_test_qs
print(all_train_qs.tolist()[:10])
df_train.head()
Out[6]:
In [7]:
df_test.head()
Out[7]:
In [8]:
all_qids = df_train.qid1 + df_train.qid2
train_qs = df_train.question1 + df_train.question2
In [9]:
# Basic corpus statistics for the training set.
total_ques_pairs = len(df_train)
print('Total number of question pairs for training: {}'.format(total_ques_pairs))
# is_duplicate is 0/1, so the mean is the duplicate rate; report as a percent.
duplicate_ques_pairs = round(df_train['is_duplicate'].mean()*100, 2)
print('Duplicate pairs: {}%'.format(duplicate_ques_pairs))
# NOTE(review): train_qs holds question1+question2 concatenated per pair, so
# this counts unique concatenated PAIR strings, not unique questions, and the
# variable name says "qids" though no ids are involved — confirm intent.
unique_qids = len(np.unique(train_qs.fillna("")))
print('Total number of questions in the training data: {}'.format(unique_qids))
# all_qids = qid1 + qid2 (numeric sum per row) — counts summed-id collisions.
print('Number of questions that appear multiple times: {}'.format(np.sum(all_qids.value_counts() > 1)))
print("Total number of questions in Quora dataset: {}".format(len(all_qs)))
In [116]:
import string
import codecs

# Accepted character sets, stored as Unicode code points for O(1) membership.
ACC_ALPHA_NUM_CHARS = set(map(ord, string.ascii_lowercase + string.digits + ' ')) # adding space here
ACC_SPECIAL_CHARS = set(map(ord, '''~!@#$%^&*()_+`-=[]\{}|;':"<>?,./<\/>|?'"+=-*&^%$#@!_.;:{}()[]~!@#$%^&*()_+'''))

# TODO: Split char processor to char_selector & char_processor
def char_processor(text):
    """Process a string's characters based on the globally defined ACC_* sets.

    Args:
        text (str): input string.

    Features:
        * strips leading/trailing whitespace
        * converts text to lower case
        * pads every accepted special character with a space on each side
        * drops any character not in ACC_ALPHA_NUM_CHARS / ACC_SPECIAL_CHARS
    """
    new_text = []
    new_text_append = new_text.append  # hoist bound-method lookup out of the loop
    for char in text.strip().lower():
        if ord(char) in ACC_ALPHA_NUM_CHARS:
            new_text_append(char)
        elif ord(char) in ACC_SPECIAL_CHARS:
            new_text_append(" " + char + " ")
    return ''.join(new_text)

def word_selector(word):
    """Return True if the word should be kept, False otherwise.

    Args:
        word (str): a single token.

    Selection criteria:
        * len == 1: must be one of the accepted ACC_* characters.
        * len > 1: must be lower case and alpha-numeric (e.g. i3, m5, 1plus, 3g).
    """
    # Bug fixes vs. original: the second condition referenced an undefined
    # name `key` instead of `word`, and the False branch returned the
    # misspelled name `Flase` — both raised NameError at call time.
    if len(word) == 1:
        return ord(word) in ACC_ALPHA_NUM_CHARS or ord(word) in ACC_SPECIAL_CHARS
    return word.islower() and word.isalnum()
def glove_embeddings():
    """Build a {word: vector} index from the GloVe file on disk.

    Reads GLOVE_DATA_FILE (one "word v1 v2 ... vN" entry per line, space
    separated) and keeps only words accepted by word_selector().

    Returns:
        dict: word -> numpy array of dtype float32.
    """
    print('Indexing word vectors.')
    embeddings_index = dict()
    with codecs.open(GLOVE_DATA_FILE, encoding='utf-8') as f:
        for line in f:
            # First token is the word; the remaining tokens are the vector.
            key, *vector = line.split(' ')
            if word_selector(key):
                embeddings_index[key] = np.asarray(vector, dtype='float32')
    return embeddings_index
def text_processor(text):
    """Split on whitespace and keep only the tokens accepted by word_selector()."""
    return list(filter(word_selector, text.split()))
def micro_procesor(text):
    """Thin wrapper around char_processor (misspelled name kept for existing callers)."""
    processed = char_processor(text)
    return processed
In [156]:
## Testing
msg = r'''
How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as
'''
print(msg)
print('--' * 25)
msg = micro_procesor(msg)
print(msg)
print('--' * 25)
lows = text_processor(msg)
print(lows)
print('--' * 25)
In [125]:
# Build the GloVe index only once per kernel session — it is expensive.
# dir() lists the current namespace, so this skips the load on re-run.
if ('embeddings_index' not in dir()):
    print('Indexing word vectors.')
    embeddings_index = glove_embeddings()
    print('Found %s word vectors.' % len(embeddings_index))
else:
    print('Skipped to save some time!')
In [126]:
# Sentinel token whose embedding is the all-zero vector (used for padding).
PAD_EMB = '<PAD>'
# Use EMBEDDING_DIM instead of a magic 300, and float32 to match the dtype of
# the GloVe vectors produced by glove_embeddings().
embeddings_index[PAD_EMB] = np.zeros(EMBEDDING_DIM, dtype='float32')
In [87]:
msg = 'What is the best medication equation erectile dysfunction?How do I out get rid of Erectile Dysfunction?'
msg, text_processor(msg)
Out[87]:
In [131]:
l = ["just say a word","of some length"]
ll = list(map(lambda l: l.split(" "), l))
seq_length = 5
ll
e = {"just": [0,0,0,0,1],
"say" : [0,0,0,1,1],
"a" : [0,1,1,1,0],
"word" : [1,0,0,1,0],
"of" : [0,1,0,1,0],
"some" : [0,1,0,1,0],
"length" : [0,0,1,1,0],
'<PAD>': [0,0,0,0,0]
}
In [165]:
# list of words --> lows
from pprint import pprint as pp
from numpy import array
def lows_padding(list_of_words, seq_length=5, append=True, pad_token='<PAD>'):
    """Pad or truncate a list of words to exactly `seq_length` items.

    Args:
        list_of_words (list[str]): tokens to pad/slice.
        seq_length (int): target length (default 5).
        append (bool): if True pad on the right, otherwise pad on the left.
        pad_token (str): filler token; default '<PAD>' matches PAD_EMB. (New
            keyword with a backward-compatible default.)

    Returns:
        list[str]: exactly `seq_length` tokens.
    """
    if len(list_of_words) >= seq_length:
        # Exact fit or too long: keep the first seq_length tokens.
        return list_of_words[:seq_length]
    padding = [pad_token] * (seq_length - len(list_of_words))
    return list_of_words + padding if append else padding + list_of_words
def lows_embedding(list_of_words, serializer):
    """Map each word to its vector via `serializer`, falling back to PAD_EMB.

    Args:
        list_of_words (list[str]): tokens to embed.
        serializer (dict): word -> vector lookup table.

    Returns:
        numpy array of the looked-up vectors, in input order.
    """
    vectors = [serializer[word] if word in serializer else serializer[PAD_EMB]
               for word in list_of_words]
    return array(vectors)
def lows_transformer(list_of_words, serializer, seq_length, append):
    """Pad the list of words to `seq_length`, then serialize it to vectors."""
    padded = lows_padding(list_of_words, seq_length, append)
    return lows_embedding(padded, serializer)
bag_of_lows_transformer = lambda list_of_words: lows_transformer(list_of_words, serializer=embeddings_index,
seq_length=4, append=True)
game = list(map(bag_of_lows_transformer, ll))
In [171]:
## Testing
msg = r'''
How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as
'''
print(msg)
print('--' * 25)
msg = micro_procesor(msg)
print(msg)
print('--' * 25)
lows = text_processor(msg)
print(lows)
print('--' * 25)
lows = lows_padding(lows, append=True, seq_length=50)
print(lows)
print('--' * 25)
lows = lows_embedding(lows, embeddings_index)
print(lows)
print(lows.shape)
print('--' * 25)
def question_transformer(msg):
    """Full text -> embedded-matrix pipeline for a single question.

    Cleans the raw text (micro_procesor), tokenizes and filters it
    (text_processor), pads/truncates to 50 tokens (lows_padding), then looks
    up vectors in the global embeddings_index (lows_embedding).
    """
    # A def instead of a lambda bound to a name (PEP 8 E731): documentable
    # and gives a useful name in tracebacks. Interface is unchanged.
    cleaned = micro_procesor(msg)
    tokens = text_processor(cleaned)
    padded = lows_padding(tokens, append=True, seq_length=50)
    return lows_embedding(padded, embeddings_index)
In [172]:
all_qs[5], question_transformer(all_qs[5])
Out[172]:
In [173]:
sample = all_qs[:500]
In [174]:
sample.map(question_transformer)
Out[174]:
In [175]:
# df_train = pd.read_csv(TRAIN_CSV)
# df_test = pd.read_csv(TEST_CSV)
df_train.describe()
Out[175]:
In [181]:
np.sum(df_train.isnull(), axis=0), np.sum(df_test.isnull(), axis=0)
Out[181]:
In [197]:
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)
In [198]:
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
In [199]:
df_train.question1.apply?
In [213]:
def row_mapper(row):
    """Debug helper: dump the incoming row, then stub out both questions.

    NOTE(review): shadowed by a later re-definition of `row_mapper` in this
    notebook — only the most recently executed definition wins.
    """
    print(str(row))
    # Mutate in place and hand the same object back (applymap-style usage).
    row.question2 = 'b'
    row.question1 = 'a'
    return row
df_train[:5].applymap?
In [215]:
df_train.apply?
In [222]:
df_train['question1 question2'.split()][:2].applymap(question_transformer)
Out[222]:
In [223]:
import copy
In [306]:
# sample = copy.deepcopy(df_train['question1 question2'.split()][:50].applymap(question_transformer))
sample = copy.deepcopy(df_train['question1 question2'.split()][:10]) # .applymap(question_transformer)
In [307]:
sample.shape, question_transformer('0 What is the step by step guide to invest i').shape
Out[307]:
In [309]:
# Module-level holder so the last value seen by row_mapper can be inspected
# after an applymap call.
row = []
def row_mapper(input_row):
    """Capture `input_row` into the global `row` and return it unchanged.

    NOTE(review): re-defines the earlier `row_mapper` in this notebook; this
    version is a debugging aid for inspecting applymap inputs.
    """
    global row
    row = input_row
    return row
In [314]:
sample.applymap(row_mapper)
Out[314]:
In [318]:
features1 = np.array(sample.question1)
In [324]:
new_features1 = array([question_transformer(_) for _ in sample.question1])
new_features2 = array([question_transformer(_) for _ in sample.question2])
In [ ]:
df_train = array([array([question_transformer(_) for _ in df_train.question1]),
array([question_transformer(_) for _ in df_train.question2])])
In [ ]:
df_train.shape
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [263]:
new = sample.applymap(question_transformer)
Out[263]:
In [294]:
np.array(sample.question1.map(lambda x: array([1, 1]))).shape
Out[294]:
In [298]:
features1 = np.zeros_like(sample.question1)
for i, row in enumerate(sample.question1):
features1[i] = question_transformer(sample.question1[i])
In [305]:
type(features1[1])
Out[305]:
In [ ]: