In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# import re
# import gc
import codecs
# import matplotlib.pyplot as plt
# import seaborn as sns
# import tensorflow as tf
# from bs4 import BeautifulSoup
# from nltk.corpus import stopwords
# from keras.preprocessing.text import Tokenizer
# %matplotlib inline
%load_ext autotime
# pal = sns.color_palette()
In [4]:
# Paths
# Paths: prefer a local ./data directory; fall back to the shared dataset mounts.
# (Indentation of this if/else was lost in the notebook export — restored here.)
if os.path.isdir('data'):
    QUORA_DATA_DIR = "data/"
    GLOVE_DATA_DIR = "data/"
else:
    QUORA_DATA_DIR = "/opt/datasets/quora/"
    GLOVE_DATA_DIR = "/opt/datasets/glove/"

# Quora question-pairs CSVs and the pre-trained GloVe embeddings file.
TRAIN_CSV = QUORA_DATA_DIR + 'train.csv'
TEST_CSV = QUORA_DATA_DIR + 'test.csv'
glove_840B_300d = GLOVE_DATA_DIR + 'glove.840B.300d.txt'
GLOVE_DATA_FILE = glove_840B_300d
In [5]:
# Model / embedding hyper-parameters.
# (Removed a duplicate `EMBEDDING_DIM = 300` assignment that appeared twice.)
EMBEDDING_DIM = 300        # dimensionality of the GloVe vectors (840B.300d)
MAX_SEQUENCE_LENGTH = 45   # questions padded/truncated to this many tokens
MAX_NB_WORDS = 200000      # vocabulary size cap
VALIDATION_SPLIT = 0.01    # fraction of training data held out for validation
In [6]:
# Load the Quora question-pairs data.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)
# Train Data
train_feature_1_string = pd.Series(df_train['question1'].tolist()).astype(str)
train_feature_2_string = pd.Series(df_train['question2'].tolist()).astype(str)
target = pd.Series(df_train['is_duplicate'].tolist())
# Element-wise string concatenation: each element is "q1q2" (no separator)
# for one train pair.
all_train_qs = train_feature_1_string + train_feature_2_string
# Test Data
test_feature_1_string = pd.Series(df_test['question1'].tolist()).astype(str)
test_feature_2_string = pd.Series(df_test['question2'].tolist()).astype(str)
all_test_qs = test_feature_1_string + test_feature_2_string
# NOTE(review): this `+` is pandas Series addition ALIGNED BY INDEX, not
# concatenation of the two collections. With different train/test lengths it
# yields combined strings where indexes overlap and NaN elsewhere — likely not
# intended. Consider pd.concat([all_train_qs, all_test_qs], ignore_index=True)
# — TODO confirm against the "Total number of questions" print below.
all_qs = all_train_qs + all_test_qs
print(all_train_qs.tolist()[:10])
df_train.head()
Out[6]:
In [7]:
df_test.head()
Out[7]:
In [8]:
all_qids = df_train.qid1 + df_train.qid2
train_qs = df_train.question1 + df_train.question2
In [9]:
# Basic corpus statistics for the training set.
total_ques_pairs = len(df_train)
print('Total number of question pairs for training: {}'.format(total_ques_pairs))
# is_duplicate is 0/1, so the mean is the duplicate rate; report as a percent.
duplicate_ques_pairs = round(df_train['is_duplicate'].mean()*100, 2)
print('Duplicate pairs: {}%'.format(duplicate_ques_pairs))
# NOTE(review): train_qs holds question1+question2 concatenated per pair, so
# this counts unique concatenated PAIR strings, not unique questions, and the
# variable name says "qids" though no ids are involved — confirm intent.
unique_qids = len(np.unique(train_qs.fillna("")))
print('Total number of questions in the training data: {}'.format(unique_qids))
# all_qids = qid1 + qid2 (numeric sum per row) — counts summed-id collisions.
print('Number of questions that appear multiple times: {}'.format(np.sum(all_qids.value_counts() > 1)))
print("Total number of questions in Quora dataset: {}".format(len(all_qs)))
In [116]:
import string
import codecs

# Accepted character sets, stored as Unicode code points for O(1) membership.
ACC_ALPHA_NUM_CHARS = set(map(ord, string.ascii_lowercase + string.digits + ' ')) # adding space here
ACC_SPECIAL_CHARS = set(map(ord, '''~!@#$%^&*()_+`-=[]\{}|;':"<>?,./<\/>|?'"+=-*&^%$#@!_.;:{}()[]~!@#$%^&*()_+'''))

# TODO: Split char processor to char_selector & char_processor
def char_processor(text):
    """Process a string's characters based on the globally defined ACC_* sets.

    Args:
        text (str): input string.

    Features:
        * strips leading/trailing whitespace
        * converts text to lower case
        * pads every accepted special character with a space on each side
        * drops any character not in ACC_ALPHA_NUM_CHARS / ACC_SPECIAL_CHARS
    """
    new_text = []
    new_text_append = new_text.append  # hoist bound-method lookup out of the loop
    for char in text.strip().lower():
        if ord(char) in ACC_ALPHA_NUM_CHARS:
            new_text_append(char)
        elif ord(char) in ACC_SPECIAL_CHARS:
            new_text_append(" " + char + " ")
    return ''.join(new_text)

def word_selector(word):
    """Return True if the word should be kept, False otherwise.

    Args:
        word (str): a single token.

    Selection criteria:
        * len == 1: must be one of the accepted ACC_* characters.
        * len > 1: must be lower case and alpha-numeric (e.g. i3, m5, 1plus, 3g).
    """
    # Bug fixes vs. original: the second condition referenced an undefined
    # name `key` instead of `word`, and the False branch returned the
    # misspelled name `Flase` — both raised NameError at call time.
    if len(word) == 1:
        return ord(word) in ACC_ALPHA_NUM_CHARS or ord(word) in ACC_SPECIAL_CHARS
    return word.islower() and word.isalnum()
def glove_embeddings():
    """Build a {word: vector} index from the GloVe file on disk.

    Reads GLOVE_DATA_FILE (one "word v1 v2 ... vN" entry per line, space
    separated) and keeps only words accepted by word_selector().

    Returns:
        dict: word -> numpy array of dtype float32.
    """
    print('Indexing word vectors.')
    embeddings_index = dict()
    with codecs.open(GLOVE_DATA_FILE, encoding='utf-8') as f:
        for line in f:
            # First token is the word; the remaining tokens are the vector.
            key, *vector = line.split(' ')
            if word_selector(key):
                embeddings_index[key] = np.asarray(vector, dtype='float32')
    return embeddings_index
def text_processor(text):
    """Split on whitespace and keep only the tokens accepted by word_selector()."""
    return list(filter(word_selector, text.split()))
def micro_procesor(text):
    """Thin wrapper around char_processor (misspelled name kept for existing callers)."""
    processed = char_processor(text)
    return processed
In [156]:
## Testing
msg = r'''
How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as
'''
print(msg)
print('--' * 25)
msg = micro_procesor(msg)
print(msg)
print('--' * 25)
lows = text_processor(msg)
print(lows)
print('--' * 25)
In [125]:
# Build the GloVe index only once per kernel session — it is expensive.
# dir() lists the current namespace, so this skips the load on re-run.
if ('embeddings_index' not in dir()):
    print('Indexing word vectors.')
    embeddings_index = glove_embeddings()
    print('Found %s word vectors.' % len(embeddings_index))
else:
    print('Skipped to save some time!')
In [126]:
# Sentinel token whose embedding is the all-zero vector (used for padding).
PAD_EMB = '<PAD>'
# Use EMBEDDING_DIM instead of a magic 300, and float32 to match the dtype of
# the GloVe vectors produced by glove_embeddings().
embeddings_index[PAD_EMB] = np.zeros(EMBEDDING_DIM, dtype='float32')
In [87]:
msg = 'What is the best medication equation erectile dysfunction?How do I out get rid of Erectile Dysfunction?'
msg, text_processor(msg)
Out[87]:
In [131]:
l = ["just say a word","of some length"]
ll = list(map(lambda l: l.split(" "), l))
seq_length = 5
ll
e = {"just": [0,0,0,0,1],
"say" : [0,0,0,1,1],
"a" : [0,1,1,1,0],
"word" : [1,0,0,1,0],
"of" : [0,1,0,1,0],
"some" : [0,1,0,1,0],
"length" : [0,0,1,1,0],
'<PAD>': [0,0,0,0,0]
}
In [165]:
# list of words --> lows
from pprint import pprint as pp
from numpy import array
def lows_padding(list_of_words, seq_length=5, append=True, pad_token='<PAD>'):
    """Pad or truncate a list of words to exactly `seq_length` items.

    Args:
        list_of_words (list[str]): tokens to pad/slice.
        seq_length (int): target length (default 5).
        append (bool): if True pad on the right, otherwise pad on the left.
        pad_token (str): filler token; default '<PAD>' matches PAD_EMB. (New
            keyword with a backward-compatible default.)

    Returns:
        list[str]: exactly `seq_length` tokens.
    """
    if len(list_of_words) >= seq_length:
        # Exact fit or too long: keep the first seq_length tokens.
        return list_of_words[:seq_length]
    padding = [pad_token] * (seq_length - len(list_of_words))
    return list_of_words + padding if append else padding + list_of_words
def lows_embedding(list_of_words, serializer):
    """Map each word to its vector via `serializer`, falling back to PAD_EMB.

    Args:
        list_of_words (list[str]): tokens to embed.
        serializer (dict): word -> vector lookup table.

    Returns:
        numpy array of the looked-up vectors, in input order.
    """
    vectors = [serializer[word] if word in serializer else serializer[PAD_EMB]
               for word in list_of_words]
    return array(vectors)
def lows_transformer(list_of_words, serializer, seq_length, append):
    """Pad the list of words to `seq_length`, then serialize it to vectors."""
    padded = lows_padding(list_of_words, seq_length, append)
    return lows_embedding(padded, serializer)
bag_of_lows_transformer = lambda list_of_words: lows_transformer(list_of_words, serializer=embeddings_index,
seq_length=4, append=True)
game = list(map(bag_of_lows_transformer, ll))
In [171]:
## Testing
msg = r'''
How does the Surface Pro himself 4 compare with iPad Pro?Why did Microsoft choose core
m3 and not core i3 home Surface Pro 4? as/df\asdf\ta?sd|asdf.as
'''
print(msg)
print('--' * 25)
msg = micro_procesor(msg)
print(msg)
print('--' * 25)
lows = text_processor(msg)
print(lows)
print('--' * 25)
lows = lows_padding(lows, append=True, seq_length=50)
print(lows)
print('--' * 25)
lows = lows_embedding(lows, embeddings_index)
print(lows)
print(lows.shape)
print('--' * 25)
def question_transformer(msg):
    """Full text -> embedded-matrix pipeline for a single question.

    Cleans the raw text (micro_procesor), tokenizes and filters it
    (text_processor), pads/truncates to 50 tokens (lows_padding), then looks
    up vectors in the global embeddings_index (lows_embedding).
    """
    # A def instead of a lambda bound to a name (PEP 8 E731): documentable
    # and gives a useful name in tracebacks. Interface is unchanged.
    cleaned = micro_procesor(msg)
    tokens = text_processor(cleaned)
    padded = lows_padding(tokens, append=True, seq_length=50)
    return lows_embedding(padded, embeddings_index)
In [172]:
all_qs[5], question_transformer(all_qs[5])
Out[172]:
In [173]:
sample = all_qs[:500]
In [174]:
sample.map(question_transformer)
Out[174]:
In [175]:
# df_train = pd.read_csv(TRAIN_CSV)
# df_test = pd.read_csv(TEST_CSV)
df_train.describe()
Out[175]:
In [181]:
np.sum(df_train.isnull(), axis=0), np.sum(df_test.isnull(), axis=0)
Out[181]:
In [197]:
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)
In [198]:
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
In [199]:
df_train.question1.apply?
In [213]:
def row_mapper(row):
    """Debug helper: dump the incoming row, then stub out both questions.

    NOTE(review): shadowed by a later re-definition of `row_mapper` in this
    notebook — only the most recently executed definition wins.
    """
    print(str(row))
    # Mutate in place and hand the same object back (applymap-style usage).
    row.question2 = 'b'
    row.question1 = 'a'
    return row
df_train[:5].applymap?
In [215]:
df_train.apply?
In [222]:
df_train['question1 question2'.split()][:2].applymap(question_transformer)
Out[222]:
In [223]:
import copy
In [306]:
# sample = copy.deepcopy(df_train['question1 question2'.split()][:50].applymap(question_transformer))
sample = copy.deepcopy(df_train['question1 question2'.split()][:10]) # .applymap(question_transformer)
In [307]:
sample.shape, question_transformer('0 What is the step by step guide to invest i').shape
Out[307]:
In [309]:
# Module-level holder so the last value seen by row_mapper can be inspected
# after an applymap call.
row = []
def row_mapper(input_row):
    """Capture `input_row` into the global `row` and return it unchanged.

    NOTE(review): re-defines the earlier `row_mapper` in this notebook; this
    version is a debugging aid for inspecting applymap inputs.
    """
    global row
    row = input_row
    return row
In [314]:
sample.applymap(row_mapper)
Out[314]:
In [318]:
features1 = np.array(sample.question1)
In [324]:
new_features1 = array([question_transformer(_) for _ in sample.question1])
new_features2 = array([question_transformer(_) for _ in sample.question2])
In [ ]:
df_train = array([array([question_transformer(_) for _ in df_train.question1]),
array([question_transformer(_) for _ in df_train.question2])])
In [ ]:
df_train.shape
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [263]:
new = sample.applymap(question_transformer)
Out[263]:
In [294]:
np.array(sample.question1.map(lambda x: array([1, 1]))).shape
Out[294]:
In [298]:
features1 = np.zeros_like(sample.question1)
for i, row in enumerate(sample.question1):
features1[i] = question_transformer(sample.question1[i])
In [305]:
type(features1[1])
Out[305]:
In [ ]: