In [1]:
from __future__ import division, print_function, absolute_import
from IPython.display import display # Allows the use of display() for DataFrames
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
def tokenize_str(string):
    '''
    Tokenization/string cleaning
    '''
    # Normalise curly apostrophes so the contraction rules below catch both forms
    string = re.sub(r"’", "'", string)
    # Expand common English contractions into separate tokens
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"can\'t", " cannot", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " had", string)
    string = re.sub(r"\'ll", " will", string)
    # Collapse ellipses, spell out slashes and break hyphenated words
    # before the catch-all filter below strips these characters
    string = re.sub(r"\.{3,}", " ", string)
    string = re.sub(r"/", " or ", string)
    string = re.sub(r"-", " ", string)
    # Drop everything that is not alphanumeric, bracket, comma, question mark or quote
    string = re.sub(r"[^A-Za-z0-9()\[\],?'`]", " ", string)
    # Pad the remaining punctuation with spaces so each mark becomes its own token
    string = re.sub(r",", " , ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse repeated whitespace
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def load_data_and_preprocess(filename):
    '''
    Read a CSV file into a DataFrame and tokenize the question1/question2 strings
    '''
    df = pd.read_csv(filename)
    print(filename + " loaded. Preprocessing...")
    # str() guards against NaN entries in the question columns
    df["q1"] = df["question1"].apply(lambda row: tokenize_str(str(row)))
    df["q2"] = df["question2"].apply(lambda row: tokenize_str(str(row)))
    print("Preprocess done!")
    return df
df_train = load_data_and_preprocess("train.csv")
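A quick sanity check of the cleaning rules on a made-up sentence (the sample question and the expected output are illustrative only, assuming the substitutions as written above):
# Illustrative only: exercises the contraction, bracket and "?" rules
print(tokenize_str("What's the best way to learn Python (in 2017)?"))
# expected: what 's the best way to learn python ( in 2017 ) ?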
In [3]:
# Inspect several samples of preprocessed text
a = 80
for i in range(a, a+10):
    print(i, df_train["q1"][i])
    print(i, df_train["q2"][i])
    print("")
In [4]:
# Merge Q1 and Q2 as one feature
df_train["merged"] = df_train["q1"] + " " + df_train["q2"]
print(df_train["merged"][0])
In [5]:
# Split into stratified training and validation set
from sklearn.model_selection import train_test_split
X = df_train
y = df_train["is_duplicate"]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y,
    random_state=42)
# Keep only the "merged" column for X_val, since that is the model input
# Keep all columns in X_train because they are needed for the oversampling step later
X_val = X_val["merged"]
# Convert the validation set labels to a list of either 0 (not duplicate) or 1 (duplicates)
y_val = y_val.values.tolist()
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(len(y_val))
In [6]:
from collections import Counter
print(y_train.value_counts(normalize=True))
c = Counter(y_val)
print(c, float(c[1])/(c[0] + c[1]))
In [7]:
# Target ratio: pos / (pos + neg) = 16.5%  =>  pos = pct * neg / (100 - pct)
pct = 16.5
pos_class = pct * c[0] / (100 - pct)
pos_class_amt = int(np.round(pos_class, 0))
print("Pos class needs to be: " +
str(pos_class_amt) + " to balance to " + str(pct) + " % duplicates.")
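The target count comes from solving pos / (pos + neg) = pct / 100 for pos with the negative count held fixed. A small numeric check with made-up counts (not the actual validation-set counts):
# Illustrative numbers only
neg_example, pct_example = 12000, 16.5
pos_example = pct_example * neg_example / (100 - pct_example)   # ~2371.3
print(round(pos_example) / (round(pos_example) + neg_example))  # ~0.165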
In [8]:
# Undersample the positive class in the validation set
def undersample(features, labels, pos_act, pos_needed):
    '''
    Drop surplus positive examples until only pos_needed of them remain
    '''
    cnt = 0
    ind = []
    # Collect the positional indices of the first (pos_act - pos_needed) positives
    for i, (feat, label) in enumerate(zip(features, labels)):
        if label == 1 and cnt < (pos_act - pos_needed):
            ind.append(i)
            cnt += 1
    features = features.drop(features.index[ind])
    # Delete the same positions from the label list, in reverse to keep indices valid
    for i in sorted(ind, reverse=True):
        del labels[i]
    print("Dropped positive examples: ", len(ind))
    print(len(features), len(labels))
    r = Counter(labels)
    print(r, float(r[1])/(r[0] + r[1]))
    return features, labels
X_val, y_val = undersample(X_val, y_val, c[1], pos_class_amt)
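An alternative sketch of the same reduction with numpy index selection, run instead of the call above (it removes a random subset of positives rather than the first ones encountered, and assumes X_val/y_val are still the full validation split):
# Sketch only: random undersampling of the positive class with numpy
y_arr = np.array(y_val)
pos_idx = np.where(y_arr == 1)[0]
drop_idx = np.random.choice(pos_idx, size=len(pos_idx) - pos_class_amt, replace=False)
keep_mask = np.ones(len(y_arr), dtype=bool)
keep_mask[drop_idx] = False
X_val_alt, y_val_alt = X_val[keep_mask], y_arr[keep_mask].tolist()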
In [9]:
# Oversample the negative class by generating synthetic non-duplicate pairs
df_pos = X_train[X_train["is_duplicate"] == 1]
df_neg = X_train[X_train["is_duplicate"] == 0]
print("Total positive pair examples: ", len(df_pos))
print("Total negative pair examples: ", len(df_neg))
def pos_neg_stats(df_pos, df_neg, pct):
    '''
    Calculates stats on class imbalance
    '''
    # pos / (pos + neg) * 100
    pos_neg_ratio = float(len(df_pos)) / (len(df_pos) + len(df_neg)) * 100
    print("Percentage duplicates in dataset: " + str(np.round(pos_neg_ratio, 2)) + " %")
    # If pos / (pos + neg) = pct / 100, then neg = pos * 100 / pct - pos
    neg_class = (len(df_pos) * 100 / pct) - len(df_pos)
    neg_class_amt = int(np.round(neg_class, 0))
    print("Neg class needs to be: " +
          str(neg_class_amt) + " to balance to " + str(pct) + " % duplicates.")
    return neg_class_amt
def random_oversample(pos, neg, pct):
    '''
    Oversamples the majority (non-duplicate) class by pairing questions from
    different rows, then samples enough of these synthetic pairs to reach the
    target duplicate percentage
    '''
    # frac=1 draws a random sample of all rows, i.e. shuffles the entire dataset
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    # Pair q1 from positive rows with q2 from negative rows
    df = pd.DataFrame()
    df["q1"] = pos["q1"]
    df["q2"] = neg["q2"]
    # Pair q2 from positive rows with q1 from negative rows
    df2 = pd.DataFrame()
    df2["q1"] = pos["q2"]
    df2["q2"] = neg["q1"]
    df = df.append(df2)
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    # Negative pairs with the question order swapped
    df3 = pd.DataFrame()
    df3["q1"] = neg["q2"]
    df3["q2"] = neg["q1"]
    df = df.append(df3)
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    # Pair q2 from positive rows with q2 from negative rows
    df4 = pd.DataFrame()
    df4["q1"] = pos["q2"]
    df4["q2"] = neg["q2"]
    df = df.append(df4)
    pos = pos.sample(frac=1).reset_index(drop=True)
    neg = neg.sample(frac=1).reset_index(drop=True)
    # Pair q1 from positive rows with q1 from negative rows
    df5 = pd.DataFrame()
    df5["q1"] = pos["q1"]
    df5["q2"] = neg["q1"]
    df = df.append(df5)
    df = df.reset_index(drop=True)
    print("Total oversampled examples to choose from: ", len(df))
    # Randomly select a subset of the oversampled examples
    # to fit the required percentage of duplicates
    df_sample_ind = np.random.choice(
        df.index.values,
        size=pos_neg_stats(pos, neg, pct),
        replace=False)
    df_sample = df.loc[df_sample_ind]
    df_sample = df_sample.reset_index(drop=True)
    df_sample["is_duplicate"] = 0
    # Retain all examples in the positive class
    df_pos_all = pos[["q1", "q2", "is_duplicate"]].copy()
    df_sample = pd.concat([df_sample, df_pos_all])
    # Reshuffle the dataset one last time
    df_sample = df_sample.sample(frac=1).reset_index(drop=True)
    print("Total examples in rebalanced dataset: ", len(df_sample))
    # pos / (pos + neg) * 100
    pos_neg_ratio = float(len(pos)) / (len(df_sample)) * 100
    print("Percentage duplicates in rebalanced dataset: " + str(np.round(pos_neg_ratio, 2)) + " %")
    display(df_sample.head())
    display(df_sample.tail())
    display(df_sample[["q1", "q2"]].describe())
    return df_sample
df_sample = random_oversample(df_pos, df_neg, 16.5)
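For comparison, plain random oversampling (duplicating existing negative rows instead of synthesizing new pairs) could be done with sklearn.utils.resample. A minimal sketch under the same 16.5% target; this is an alternative, not what this notebook uses:
from sklearn.utils import resample
# Duplicate negative rows with replacement until duplicates make up ~16.5% of the total
n_neg_needed = int(round(len(df_pos) * 100 / 16.5 - len(df_pos)))
df_neg_over = resample(df_neg, replace=True, n_samples=n_neg_needed, random_state=42)
df_balanced = pd.concat([df_pos, df_neg_over]).sample(frac=1).reset_index(drop=True)
print(df_balanced["is_duplicate"].value_counts(normalize=True))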
In [10]:
print(df_sample['is_duplicate'].value_counts(normalize=True))
In [11]:
# Merge Q1 and Q2 as one feature
df_sample["merged"] = df_sample["q1"] + " " + df_sample["q2"]
# Set the feature (merged question text)
X_train_sample = df_sample["merged"]
# Convert training set labels to 0 or 1
y_train_sample = df_sample["is_duplicate"].values.tolist()
print(len(X_train_sample))
print(len(y_train_sample))
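A couple of lightweight consistency checks before encoding (a minimal sketch):
# Features and labels should line up, and merged questions should not be empty
assert len(X_train_sample) == len(y_train_sample)
print("Empty merged strings:", (X_train_sample.str.len() == 0).sum())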
In [12]:
# Concatenate training and validation set for sequence length analysis
# and token id encoding later
X_all = pd.concat([X_train_sample, X_val], axis=0)
# Plot sequence length of questions, and determine outliers
sns.set_style('whitegrid')
seq_length = []
for row in X_all:
    seq_length.append(len(row.split(" ")))
sns.distplot(seq_length, kde=False)
plt.xlabel("Tokenized sentence length")
plt.ylabel("Occurrences")
print("seq length stats: ")
print("Max: ", max(seq_length))
print("Min: ", min(seq_length))
print("")
Q1 = np.percentile(seq_length, 25)
Q3 = np.percentile(seq_length, 75)
IQR = Q3 - Q1
print("Q1: ", Q1)
print("Mean: ", np.mean(seq_length))
print("Q3: ", Q3)
print("IQR: ", IQR)
print("Outlier range: < " + str(Q1 - 1.5*IQR) + " and > " + str(Q3 + 1.5*IQR))
In [13]:
# Build vocabulary
from tensorflow.contrib import learn
#max_document_length = max([len(x.split(" ")) for x in X_all])
# Cap sequences at 45 tokens, based on the sequence-length analysis above
max_document_length = 45
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
#x1 = np.array(list(vocab_processor.fit_transform(X_train_sample)))
#x2 = np.array(list(vocab_processor.fit_transform(X_val)))
# Fit the vocabulary once on the combined training + validation text
vocab_processor.fit(X_all)
x1 = np.array(list(vocab_processor.transform(X_train_sample)))
x2 = np.array(list(vocab_processor.transform(X_val)))
print("Vocabulary size: {:d}".format(len(vocab_processor.vocabulary_)))
In [21]:
print(X_train_sample.iloc[0])
print(X_val.iloc[0])
In [22]:
print(x1[0])
print(x2[0])
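To sanity-check the encoding, the ids can be mapped back to tokens; a small sketch using vocab_processor.reverse (assuming the installed contrib version exposes it; id 0 is the padding/unknown slot):
# Decode the first encoded training row back into tokens
print(list(vocab_processor.reverse([x1[0]]))[0])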
In [23]:
# Check memory usage of x1
str(x1.nbytes / 1e6) + " MB"
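The figure is easy to estimate by hand: the matrix is rows × max_document_length integers at 8 bytes each, assuming the default int64 output (the dtype assumption is mine):
# Rough estimate: rows * 45 tokens * 8 bytes per int64 value
print("{:.1f} MB estimated vs {:.1f} MB reported".format(
    x1.shape[0] * x1.shape[1] * 8 / 1e6, x1.nbytes / 1e6))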
In [ ]:
'''def build_vocabulary(df):
    # Assigns an id to each unique word in the dataset
    # Split the dataset into tokens and count the unique ones
    unique = [word for row in df for word in row.split()]
    token_count = set(unique)
    print("Vocabulary size: ", len(token_count))
    # Build vocabulary
    vocab = {}
    for i, u in enumerate(token_count):
        vocab[u] = i
    return vocab

vocab = build_vocabulary(X_all)'''
In [ ]:
'''def convert_token_to_token_id(df, vocabulary):
    # Converts the tokens in a dataframe into ids, looked up from a vocabulary
    token_id = []
    for row in df:
        iid = []
        for word in row.split():
            iid.append(vocabulary.get(word))
        token_id.append(iid)
    return token_id

X_train_sample = convert_token_to_token_id(X_train_sample, vocab)
X_val = convert_token_to_token_id(X_val, vocab)
print(X_train_sample[:2])
print(X_val[:2])'''
In [ ]:
'''import joblib
joblib.dump(x1, 'preprocess_train')
print("File saved.")'''
In [24]:
# Saves preprocessed data into a pickle file
import pickle
with open('preprocess_3.pickle', 'wb') as f:
    #pickle.dump(X_train_sample, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(x1, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(x2, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(y_train_sample, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(y_val, f, protocol=pickle.HIGHEST_PROTOCOL)
print("File saved.")
In [ ]: