In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# import re
# import gc
import codecs
# import matplotlib.pyplot as plt
# import seaborn as sns
# import tensorflow as tf
# from bs4 import BeautifulSoup
# from nltk.corpus import stopwords
# from keras.preprocessing.text import Tokenizer

# %matplotlib inline
%load_ext autotime

# pal = sns.color_palette()


The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 6.86 ms
%%sh [ -f autotime.py ] && echo "File exist" || wget https://raw.githubusercontent.com/msampathkumar/ipython-autotime/master/autotime.py # sample of how GLOVE_DATA_FILE # wc data/glove.840B.300d.txt # 2196017 661001572 5646236541 data/glove.840B.300d.txt head -n20 data/glove.840B.300d.txt | awk -F" " '{ print $1"\t"$2"\t"$3 }'

In [4]:
# Paths

if os.path.isdir('data'):
    QUORA_DATA_DIR = "data/"
    GLOVE_DATA_DIR = "data/"
else:
    QUORA_DATA_DIR = "/opt/datasets/quora/"
    GLOVE_DATA_DIR = "/opt/datasets/glove/"

TRAIN_CSV = QUORA_DATA_DIR + 'train.csv'
TEST_CSV = QUORA_DATA_DIR + 'test.csv'

glove_840B_300d = GLOVE_DATA_DIR + 'glove.840B.300d.txt'
GLOVE_DATA_FILE = glove_840B_300d


time: 5.55 ms

In [5]:
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 45
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.01


time: 2.61 ms

Data Analysis


In [6]:
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

# Train Data
train_feature_1_string = pd.Series(df_train['question1'].tolist()).astype(str)
train_feature_2_string = pd.Series(df_train['question2'].tolist()).astype(str)

target = pd.Series(df_train['is_duplicate'].tolist())

all_train_qs = train_feature_1_string + train_feature_2_string

# Test Data
test_feature_1_string = pd.Series(df_test['question1'].tolist()).astype(str)
test_feature_2_string = pd.Series(df_test['question2'].tolist()).astype(str)

all_test_qs = test_feature_1_string + test_feature_2_string

all_qs = all_train_qs + all_test_qs

print(all_train_qs.tolist()[:10])

df_train.head()


['What is the step by step guide to invest in share market in india?What is the step by step guide to invest in share market?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', 'How can I increase the speed of my internet connection while using a VPN?How can Internet speed be increased by hacking through DNS?', 'Why am I mentally very lonely? How can I solve it?Find the remainder when [math]23^{24}[/math] is divided by 24,23?', 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?Which fish would survive in salt water?', "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?", 'Should I buy tiago?What keeps childern active and far from phone and video games?', 'How can I be a good geologist?What should I do to be a great geologist?', 'When do you use シ instead of し?When do you use "&" instead of "and"?', 'Motorola (company): Can I hack my Charter Motorolla DCX3400?How do I hack Motorola DCX3400 for free internet?']
Out[6]:
id qid1 qid2 question1 question2 is_duplicate
0 0 1 2 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh... 0
1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... What would happen if the Indian government sto... 0
2 2 5 6 How can I increase the speed of my internet co... How can Internet speed be increased by hacking... 0
3 3 7 8 Why am I mentally very lonely? How can I solve... Find the remainder when [math]23^{24}[/math] i... 0
4 4 9 10 Which one dissolve in water quikly sugar, salt... Which fish would survive in salt water? 0
time: 20.1 s

In [7]:
df_test.head()


Out[7]:
test_id question1 question2
0 0 How does the Surface Pro himself 4 compare wit... Why did Microsoft choose core m3 and not core ...
1 1 Should I have a hair transplant at age 24? How... How much cost does hair transplant require?
2 2 What but is the best way to send money from Ch... What you send money to China?
3 3 Which food not emulsifiers? What foods fibre?
4 4 How "aberystwyth" start reading? How their can I start reading?
time: 11.6 ms

Text Analysis


In [8]:
all_qids = df_train.qid1 + df_train.qid2

train_qs = df_train.question1 + df_train.question2


time: 275 ms

In [9]:
total_ques_pairs = len(df_train)
print('Total number of question pairs for training: {}'.format(total_ques_pairs))

duplicate_ques_pairs = round(df_train['is_duplicate'].mean()*100, 2)
print('Duplicate pairs: {}%'.format(duplicate_ques_pairs))

unique_qids = len(np.unique(train_qs.fillna("")))
print('Total number of questions in the training data: {}'.format(unique_qids))

print('Number of questions that appear multiple times: {}'.format(np.sum(all_qids.value_counts() > 1)))

print("Total number of questions in Quora dataset: {}".format(len(all_qs)))


Total number of question pairs for training: 404290
Duplicate pairs: 36.92%
Total number of questions in the training data: 404289
Number of questions that appear multiple times: 50205
Total number of questions in Quora dataset: 2345796
time: 1.19 s

Text Processing

  • TODO: Check the AVG length of english words and filter them out in GLOVE DATA
  • TODO: Check the AVG length of english words and filter them out in INPUT DATA

In [116]:
import string
import codecs


ACC_ALPHA_NUM_CHARS = set(map(ord, string.ascii_lowercase +  string.digits + ' ')) # adding space here
ACC_SPECIAL_CHARS = set(map(ord, '''~!@#$%^&*()_+`-=[]\{}|;':"<>?,./<\/>|?'"+=-*&^%$#@!_.;:{}()[]~!@#$%^&*()_+''')) 


# TODO: Split char processor to char_selector & char_procssor

def char_processor(text):
    """String charecters are processed based on globally defined ACC_* variables.

    Args:
        text(string): input string

    Features:
        * clears trailing or heading space or tabs
        * converts text to lower case
        * adds appends and prepends spaces to all special charecters
    """
    new_text = []
    new_text_append = new_text.append
    for char in text.strip().lower():
        if ord(char) in ACC_ALPHA_NUM_CHARS:
            new_text_append(char)
        elif ord(char) in ACC_SPECIAL_CHARS:
            new_text_append(" " + char + " ")
    return ''.join(new_text)


def word_selector(word):
    """Return True or False, to tell if we should select the word.
    
    Args:
        word(string): an array of charecters
        
    Selection Criteria:
        * If len == 1, has to be one of accepted ACC_* variables.
        * If len > 1, has to be lower case & alpha-numeric (Eg: i3, m5, 1plus, 3g,..)
    """
    if (len(word) == 1 and (ord(word) in ACC_ALPHA_NUM_CHARS or ord(word) in ACC_SPECIAL_CHARS))\
       or (key.islower() and len(key) > 1 and key.isalnum()):
        return True
    else:
        return Flase


def glove_embeddings():
    print('Indexing word vectors.')
    embeddings_index = dict()
    with codecs.open(GLOVE_DATA_FILE, encoding='utf-8') as f:
        for line in f:
            key, *vector = line.split(' ')
            if word_selector(key):
                embeddings_index[key] = np.asarray(vector, dtype='float32')
    return embeddings_index


def text_processor(text):
    return [word for word in text.split() if word_selector(word)]


def micro_procesor(text):
    return char_processor(text)


time: 36.5 ms

In [156]:
## Testing

msg = r'''
How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as
'''

print(msg)
print('--' * 25)
msg = micro_procesor(msg)
print(msg)
print('--' * 25)
lows = text_processor(msg)
print(lows)
print('--' * 25)


How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as

--------------------------------------------------
how does the surface pro himself 4 compare with ipad pro ? why did microsoft choose corem3 and not core i3 home surface pro 4 ?  as / df \ asdf \ ta ? sd | asdf . as
--------------------------------------------------
['how', 'does', 'the', 'surface', 'pro', 'himself', '4', 'compare', 'with', 'ipad', 'pro', '?', 'why', 'did', 'microsoft', 'choose', 'corem3', 'and', 'not', 'core', 'i3', 'home', 'surface', 'pro', '4', '?', 'as', '/', 'df', '\\', 'asdf', '\\', 'ta', '?', 'sd', '|', 'asdf', '.', 'as']
--------------------------------------------------
time: 7.25 ms

In [125]:
if ('embeddings_index' not in dir()):
    print('Indexing word vectors.')
    embeddings_index = glove_embeddings()
    print('Found %s word vectors.' % len(embeddings_index))

else:
    print('Skipped to save some time!')


Skipped to save some time!
time: 4.31 ms

In [126]:
PAD_EMB = '<PAD>'
embeddings_index[PAD_EMB] = np.zeros(300, dtype=float)


time: 2.55 ms
words_corpus = '' for msg in all_test_qs: print(msg) print(text_processor(micro_procesor(msg))) break
msg

In [87]:
msg = 'What is the best medication equation erectile dysfunction?How do I out get rid of Erectile Dysfunction?'

msg, text_processor(msg)


Out[87]:
('What is the best medication equation erectile dysfunction?How do I out get rid of Erectile Dysfunction?',
 'what is the best medication equation erectile dysfunction ? how do i out get rid of erectile dysfunction ? ')
time: 4.31 ms

In [131]:
l = ["just say a word","of some length"]
ll = list(map(lambda l: l.split(" "), l))
seq_length = 5
ll
e = {"just": [0,0,0,0,1],
     "say" : [0,0,0,1,1],
     "a" : [0,1,1,1,0],
     "word" : [1,0,0,1,0],
     "of" : [0,1,0,1,0],
     "some" : [0,1,0,1,0],
     "length" : [0,0,1,1,0],
     '<PAD>': [0,0,0,0,0]
    }


time: 8.55 ms

In [165]:
# list of words --> lows

from pprint import pprint as pp

from numpy import array

def lows_padding(list_of_words, seq_length=5, append=True):
    """Pads/slices given bag of words for specified length."""
    if len(list_of_words) == seq_length:
        return list_of_words
    elif len(list_of_words) > seq_length:
        return list_of_words[:seq_length]
    # else:
    tmp = ['<PAD>' for i in range(seq_length - len(list_of_words))]
    if append:
        return list_of_words + tmp
    return  tmp + list_of_words

def lows_embedding(list_of_words, serializer):
    """To serializer/string Tokenize the list of words."""
    alist = []
    for word in list_of_words:
        try:
            alist.append(serializer[word])
        except KeyError:
            alist.append(serializer[PAD_EMB])
    return array(alist)

def lows_transformer(list_of_words, serializer, seq_length, append):
    """To pad the given list of words and serialiase them."""
    return lows_embedding(lows_padding(list_of_words, seq_length, append),
                          serializer)

bag_of_lows_transformer = lambda list_of_words: lows_transformer(list_of_words, serializer=embeddings_index,
                                                                 seq_length=4, append=True)

game = list(map(bag_of_lows_transformer, ll))


time: 22 ms

In [171]:
## Testing

msg = r'''
How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as
'''

print(msg)
print('--' * 25)
msg = micro_procesor(msg)
print(msg)
print('--' * 25)
lows = text_processor(msg)
print(lows)
print('--' * 25)
lows = lows_padding(lows, append=True, seq_length=50)
print(lows)
print('--' * 25)
lows = lows_embedding(lows, embeddings_index)
print(lows)
print(lows.shape)
print('--' * 25)



question_transformer = lambda msg: lows_embedding(
                                                lows_padding(
                                                    text_processor(
                                                        micro_procesor(msg)
                                                    ),
                                                    append=True,
                                                    seq_length=50),
                                                embeddings_index)


How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as

--------------------------------------------------
how does the surface pro himself 4 compare with ipad pro ? why did microsoft choose corem3 and not core i3 home surface pro 4 ?  as / df \ asdf \ ta ? sd | asdf . as
--------------------------------------------------
['how', 'does', 'the', 'surface', 'pro', 'himself', '4', 'compare', 'with', 'ipad', 'pro', '?', 'why', 'did', 'microsoft', 'choose', 'corem3', 'and', 'not', 'core', 'i3', 'home', 'surface', 'pro', '4', '?', 'as', '/', 'df', '\\', 'asdf', '\\', 'ta', '?', 'sd', '|', 'asdf', '.', 'as']
--------------------------------------------------
['how', 'does', 'the', 'surface', 'pro', 'himself', '4', 'compare', 'with', 'ipad', 'pro', '?', 'why', 'did', 'microsoft', 'choose', 'corem3', 'and', 'not', 'core', 'i3', 'home', 'surface', 'pro', '4', '?', 'as', '/', 'df', '\\', 'asdf', '\\', 'ta', '?', 'sd', '|', 'asdf', '.', 'as', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
--------------------------------------------------
[[-0.23205     0.47468001 -0.38264    ...,  0.33177999  0.31545001  0.37972   ]
 [-0.13563     0.33217001 -0.36019999 ..., -0.17296     0.21675999
   0.22205999]
 [ 0.27204001 -0.06203    -0.1884     ...,  0.13015001 -0.18317001  0.1323    ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
(50, 300)
--------------------------------------------------
time: 19.4 ms

In [172]:
all_qs[5], question_transformer(all_qs[5])


Out[172]:
("Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?How are the two wheeler insurance from Bharti Axa insurance?I admire I am considering of buying insurance from them",
 array([[ 0.21094   ,  0.59290999,  0.26559001, ..., -0.28240001,
          0.81332999, -0.38316   ],
        [ 0.008746  ,  0.33214   , -0.29175001, ..., -0.28676999,
         -0.22663   , -0.05087   ],
        [ 0.18732999,  0.40595001, -0.51174003, ...,  0.16495   ,
          0.18757001,  0.53873998],
        ..., 
        [ 0.19375999, -0.34272   , -0.37279999, ..., -0.51226002,
          0.28685999, -0.38719001],
        [ 0.37661999,  0.3118    , -0.25503999, ..., -0.030757  ,
         -0.13600001, -0.47569999],
        [-0.57073998,  0.42093   , -0.37584999, ..., -0.035666  ,
          0.46285   ,  0.019142  ]], dtype=float32))
time: 7.51 ms

In [173]:
sample = all_qs[:500]


time: 1.67 ms

In [174]:
sample.map(question_transformer)


Out[174]:
0      [[-0.038548, 0.54252, -0.21843, -0.18855, 0.07...
1      [[-0.0385480001569, 0.542519986629, -0.2184299...
2      [[-0.232050001621, 0.474680006504, -0.38264000...
3      [[-0.177389994264, 0.371549993753, -0.38040000...
4      [[0.229369997978, 0.121559999883, 0.0934149995...
5      [[0.21094, 0.59291, 0.26559, 0.37571, 0.51553,...
6      [[-0.0130479997024, 0.0663049966097, -0.186749...
7      [[-0.232050001621, 0.474680006504, -0.38264000...
8      [[0.204999998212, 0.300570011139, -0.302590012...
9      [[-0.0632240027189, 0.236330002546, 0.44685998...
10     [[-0.351429998875, 0.794250011444, -0.16877999...
11     [[-0.23205, 0.47468, -0.38264, 0.0022248, -0.1...
12     [[-0.0385480001569, 0.542519986629, -0.2184299...
13     [[-0.0385480001569, 0.542519986629, -0.2184299...
14     [[-0.038548, 0.54252, -0.21843, -0.18855, 0.07...
15     [[-0.0385480001569, 0.542519986629, -0.2184299...
16     [[-0.0385480001569, 0.542519986629, -0.2184299...
17     [[-0.17739, 0.37155, -0.3804, -0.22554, -0.066...
18     [[-0.177389994264, 0.371549993753, -0.38040000...
19     [[0.229369997978, 0.121559999883, 0.0934149995...
20     [[-0.177389994264, 0.371549993753, -0.38040000...
21     [[-0.0385480001569, 0.542519986629, -0.2184299...
22     [[-0.0385480001569, 0.542519986629, -0.2184299...
23     [[-0.232050001621, 0.474680006504, -0.38264000...
24     [[-0.0385480001569, 0.542519986629, -0.2184299...
25     [[-0.038548, 0.54252, -0.21843, -0.18855, 0.07...
26     [[-0.0385480001569, 0.542519986629, -0.2184299...
27     [[-0.135629996657, 0.332170009613, -0.36019998...
28     [[-0.0385480001569, 0.542519986629, -0.2184299...
29     [[-0.23205, 0.47468, -0.38264, 0.0022248, -0.1...
                             ...                        
470    [[0.08649, 0.14503, -0.4902, 0.34224, 0.36343,...
471    [[-0.0849609971046, 0.501999974251, 0.00238230...
472    [[-0.0849609971046, 0.501999974251, 0.00238230...
473    [[-0.23205, 0.47468, -0.38264, 0.0022248, -0.1...
474    [[0.10528, 0.44863, 0.55631, -0.15466, -0.0726...
475    [[0.0356700010598, 0.18559999764, -0.305519998...
476    [[-0.23857, 0.35457, -0.30219, 0.089559, 0.082...
477    [[-0.232050001621, 0.474680006504, -0.38264000...
478    [[-0.177389994264, 0.371549993753, -0.38040000...
479    [[-0.0688939988613, 0.387690007687, -0.2612000...
480    [[-0.238570004702, 0.354570001364, -0.30219000...
481    [[-0.0849609971046, 0.501999974251, 0.00238230...
482    [[-0.0385480001569, 0.542519986629, -0.2184299...
483    [[-0.177389994264, 0.371549993753, -0.38040000...
484    [[-0.232050001621, 0.474680006504, -0.38264000...
485    [[0.0746089965105, 0.269320011139, -0.02772399...
486    [[-0.232050001621, 0.474680006504, -0.38264000...
487    [[-0.038548, 0.54252, -0.21843, -0.18855, 0.07...
488    [[-0.232050001621, 0.474680006504, -0.38264000...
489    [[-0.13563, 0.33217, -0.3602, -0.058787, 0.064...
490    [[-0.0849609971046, 0.501999974251, 0.00238230...
491    [[-0.0385480001569, 0.542519986629, -0.2184299...
492    [[-0.0849609971046, 0.501999974251, 0.00238230...
493    [[-0.232050001621, 0.474680006504, -0.38264000...
494    [[0.229369997978, 0.121559999883, 0.0934149995...
495    [[-0.0385480001569, 0.542519986629, -0.2184299...
496    [[0.22937, 0.12156, 0.093415, -0.31117, 0.1419...
497    [[-0.0385480001569, 0.542519986629, -0.2184299...
498    [[-0.0849609971046, 0.501999974251, 0.00238230...
499    [[-0.0385480001569, 0.542519986629, -0.2184299...
dtype: object
time: 3.59 s

Text Transformation


In [175]:
# df_train = pd.read_csv(TRAIN_CSV)
# df_test = pd.read_csv(TEST_CSV)

df_train.describe()


Out[175]:
id qid1 qid2 is_duplicate
count 404290.000000 404290.000000 404290.000000 404290.000000
mean 202144.500000 217243.942418 220955.655337 0.369198
std 116708.614503 157751.700002 159903.182629 0.482588
min 0.000000 1.000000 2.000000 0.000000
25% 101072.250000 74437.500000 74727.000000 0.000000
50% 202144.500000 192182.000000 197052.000000 0.000000
75% 303216.750000 346573.500000 354692.500000 1.000000
max 404289.000000 537932.000000 537933.000000 1.000000
time: 514 ms

In [181]:
np.sum(df_train.isnull(), axis=0), np.sum(df_test.isnull(), axis=0)


Out[181]:
(id              0
 qid1            0
 qid2            0
 question1       0
 question2       2
 is_duplicate    0
 dtype: int64, test_id      0
 question1    2
 question2    4
 dtype: int64)
time: 829 ms

In [197]:
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)


time: 12.5 s

In [198]:
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)


time: 977 ms

In [199]:
df_train.question1.apply?


time: 144 ms

In [213]:
def row_mapper(row):
    print(str(row))
    row.question1 = "a"
    row.question2 = 'b'
    return row

df_train[:5].applymap?


Object `applymap` not found.
time: 4.19 ms

In [215]:
df_train.apply?


time: 21.9 ms

In [222]:
df_train['question1 question2'.split()][:2].applymap(question_transformer)


Out[222]:
question1 question2
0 [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299...
1 [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299...
time: 516 ms

In [223]:
import copy


time: 1.15 ms

In [306]:
# sample = copy.deepcopy(df_train['question1 question2'.split()][:50].applymap(question_transformer))
sample = copy.deepcopy(df_train['question1 question2'.split()][:10]) # .applymap(question_transformer)


time: 29.8 ms

In [307]:
sample.shape, question_transformer('0	What is the step by step guide to invest i').shape


Out[307]:
((10, 2), (50, 300))
time: 5.31 ms

In [309]:
row = []

def row_mapper(input_row):
    global row 
    row = input_row
    return row


time: 2.69 ms

In [314]:
sample.applymap(row_mapper)


Out[314]:
question1 question2
0 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh...
1 What is the story of Kohinoor (Koh-i-Noor) Dia... What would happen if the Indian government sto...
2 How can I increase the speed of my internet co... How can Internet speed be increased by hacking...
3 Why am I mentally very lonely? How can I solve... Find the remainder when [math]23^{24}[/math] i...
4 Which one dissolve in water quikly sugar, salt... Which fish would survive in salt water?
5 Astrology: I am a Capricorn Sun Cap moon and c... I'm a triple Capricorn (Sun, Moon and ascendan...
6 Should I buy tiago? What keeps childern active and far from phone ...
7 How can I be a good geologist? What should I do to be a great geologist?
8 When do you use シ instead of し? When do you use "&" instead of "and"?
9 Motorola (company): Can I hack my Charter Moto... How do I hack Motorola DCX3400 for free internet?
time: 12.9 ms

In [318]:
features1 = np.array(sample.question1)


time: 1.55 ms

In [324]:
new_features1 = array([question_transformer(_) for _ in sample.question1])
new_features2 = array([question_transformer(_) for _ in sample.question2])


time: 9.12 ms

In [ ]:
df_train = array([array([question_transformer(_) for _ in df_train.question1]),
           array([question_transformer(_) for _ in df_train.question2])])

In [ ]:
df_train.shape

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [263]:
new = sample.applymap(question_transformer)


Out[263]:
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
question1 [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.232050001621, 0.474680006504, -0.38264000... [[-0.177389994264, 0.371549993753, -0.38040000... [[0.229369997978, 0.121559999883, 0.0934149995... [[0.210940003395, 0.592909991741, 0.2655900120... [[-0.0130479997024, 0.0663049966097, -0.186749... [[-0.232050001621, 0.474680006504, -0.38264000... [[0.204999998212, 0.300570011139, -0.302590012... [[-0.0632240027189, 0.236330002546, 0.44685998... ... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0279389992356, 0.356279999018, 0.65255999... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.232050001621, 0.474680006504, -0.38264000... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.177389994264, 0.371549993753, -0.38040000... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.177389994264, 0.371549993753, -0.38040000...
question2 [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.232050001621, 0.474680006504, -0.38264000... [[0.0850000008941, 0.322129994631, -0.56233000... [[0.229369997978, 0.121559999883, 0.0934149995... [[0.187329992652, 0.405950009823, -0.511740028... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[0.204999998212, 0.300570011139, -0.302590012... [[-0.232050001621, 0.474680006504, -0.38264000... ... [[0.229369997978, 0.121559999883, 0.0934149995... [[-0.0279389992356, 0.356279999018, 0.65255999... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.232050001621, 0.474680006504, -0.38264000... [[0.0746089965105, 0.269320011139, -0.02772399... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.0385480001569, 0.542519986629, -0.2184299... [[-0.232050001621, 0.474680006504, -0.38264000...

2 rows × 100 columns

time: 4.67 s

In [294]:
np.array(sample.question1.map(lambda x: array([1, 1]))).shape


Out[294]:
(100,)
time: 5.22 ms

In [298]:
features1 = np.zeros_like(sample.question1)

for i, row in enumerate(sample.question1):
    features1[i] = question_transformer(sample.question1[i])


time: 23 ms

In [305]:
type(features1[1])


Out[305]:
numpy.ndarray
time: 4.02 ms

In [ ]: