In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import os.path as osp
import re
from bs4 import BeautifulSoup
In [2]:
train_data_dir = '../data/train'
test_data_dir = '../data/test/test.csv'
topics = ['biology', 'cooking', 'crypto', 'diy', 'robotics', 'travel']
In [3]:
# combine the per-topic csv files into a single dataframe
df_list = []
for i, t in enumerate(topics):
    df_local = pd.read_csv(osp.join(train_data_dir, t + '.csv'), encoding='utf-8')
    df_local['topic'] = i
    df_list.append(df_local)
df = pd.concat(df_list)
# build a question id that is unique across topics, e.g. "3-1234"
df["Qid"] = df["topic"].map(str) + "-" + df["id"].map(str)
df.reset_index(inplace=True)
df.drop(["index", "id", "topic"], axis=1, inplace=True)
del df_list
In [4]:
df.tail()
Out[4]:
In [5]:
# randomly look into one row
def peek():
    # sample a single row and print every column value
    df_slice = df.sample(1)
    for col in df_slice.columns.values:
        print list(df_slice[col])[0]
In [6]:
peek()
In [7]:
def pre_process(data=df):
    # 1: remove html tags
    removeHtmlTags = lambda x: BeautifulSoup(x, "html.parser").text
    data["content_cleaned"] = data['content'].apply(removeHtmlTags)
    print "Removed html tags."
    # 2: combine title and content
    tmp = data['title'] + " " + data['content_cleaned']
    print "Combined question title and content."
    # 3: collapse new lines and extra spaces into single spaces
    clean = lambda x: re.sub(r"\s+", " ", x).strip()
    data["combined_text"] = tmp.apply(clean)
    print "Removed spaces and new lines."
    keeps = ['Qid', 'combined_text', 'tags']
    return data[keeps]
In [8]:
df = pre_process()
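To make the cleaning steps concrete, here is a small illustration on a made-up snippet (the string below is hypothetical, not from the dataset); it applies the same BeautifulSoup and regex calls used in pre_process above.
In [ ]:
# Tiny illustration of the cleaning steps on a made-up snippet (hypothetical text,
# not from the dataset); same calls as in pre_process().
raw = u"<p>How do I sharpen a chef's knife?\n   It feels   dull.</p>"
no_html = BeautifulSoup(raw, "html.parser").text
print re.sub(r"\s+", " ", no_html).strip()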
In [9]:
peek()
In [18]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import FreqDist
import numpy as np
import cPickle as pkl
from collections import deque
In [151]:
def build_word_dict(df, limit=None):
    corpus = df["combined_text"].str.cat(sep=" ")
    corpus = re.sub("[ ]+", " ", corpus)  # one gigantic text
    words = word_tokenize(corpus, 'english')
    if limit is None:
        limit = len(set(words))
    vc = {}
    vc['UNKNOWN'] = 0  # id 0 is reserved for out-of-vocabulary words
    for word, _ in FreqDist(words).most_common(limit):
        if word in ('UNKNOWN', 'NULL'):
            continue  # do not let corpus tokens overwrite the reserved ids
        vc[word] = len(vc)
    vc['NULL'] = len(vc)  # padding token
    rv_vc = {v: k for k, v in vc.iteritems()}  # reverse vocabulary
    return vc, rv_vc
In [152]:
word2id, id2word = build_word_dict(df=df)
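As a quick sanity check on the vocabulary, a sentence can be encoded with word2id (unknown tokens fall back to id 0, 'UNKNOWN') and decoded back with id2word; the example text below is made up, not taken from the data.
In [ ]:
# Round-trip check on a made-up sentence: encode with word2id, decode with id2word.
example = "How do I bake sourdough bread at home ?"
ids = [word2id.get(w, 0) for w in word_tokenize(example, 'english')]
print ids
print " ".join(id2word[i] for i in ids)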
In [153]:
len(word2id)
Out[153]:
In [154]:
word_dict_file = "../data/tmp/word2id.pkl"
reversed_word_dict_file = "../data/tmp/id2word.pkl"
with open(word_dict_file, "wb") as f:
pkl.dump(word2id, f)
with open(reversed_word_dict_file, "wb") as f:
pkl.dump(id2word, f)
In [19]:
word_dict_file = "../data/tmp/word2id.pkl"
reversed_word_dict_file = "../data/tmp/id2word.pkl"
if osp.exists(word_dict_file):
with open(word_dict_file, "rb") as f:
word2id = pkl.load(f)
if osp.exists(reversed_word_dict_file):
with open(reversed_word_dict_file, "rb") as f:
id2word = pkl.load(f)
In [233]:
def build_dataset(df, vocab):
    # split each question into sentences and keep one row per sentence
    ssplit = lambda row: sent_tokenize(row["combined_text"], "english")
    gendf = lambda sents, row: pd.DataFrame(zip([row["Qid"]] * len(sents), sents), columns=["Qid", "Sentence"])
    first = df.loc[0, :]
    df_sent = gendf(ssplit(first), first)
    nrow = len(df.index)
    disp = max(nrow / 10, 1)  # progress interval; avoid modulo-by-zero on tiny frames
    for idx in xrange(1, nrow):
        row = df.loc[idx, :]
        df_sent = df_sent.append(gendf(ssplit(row), row), ignore_index=True)
        if (idx + 1) % disp == 0:
            print "Processed: %d" % (idx + 1)
    # encode each sentence as word ids; unseen words map to 0 ('UNKNOWN')
    text2numeric = lambda s: [vocab[w] if w in vocab else 0 for w in word_tokenize(s, "english")]
    df_sent["Encoding"] = df_sent["Sentence"].apply(text2numeric)
    df_sent["Length"] = df_sent["Encoding"].apply(len)
    # keep only sentences with 4 to 49 tokens
    df_sent = df_sent.loc[df_sent["Length"] > 3, :]
    df_sent = df_sent.loc[df_sent["Length"] < 50, :]
    return df_sent
In [234]:
df_sent = build_dataset(df, word2id)
In [2]:
sents_file = "../data/tmp/sentences.csv"
In [11]:
df_sent.to_csv(sents_file, encoding='utf-8', index=False)
In [12]:
df_sent = pd.read_csv(sents_file, encoding='utf-8')
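One caveat when reloading from CSV: pandas writes the Encoding column as the string representation of a Python list, so it has to be parsed back into actual lists before get_batch can slice it. A minimal fix-up, assuming the default pandas/CSV formatting, is below.
In [ ]:
# After reloading from CSV, "Encoding" holds strings like "[12, 7, 0]";
# parse them back into lists of ints before batching.
import ast
df_sent["Encoding"] = df_sent["Encoding"].apply(ast.literal_eval)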
In [13]:
print "Number of questions: %d" %len(set(df_sent['Qid']))
In [238]:
def get_batch(df, batch_size, window=5, null_id=word2id["NULL"]):
    # use the paragraph plus the preceding `window` words to predict the next word
    batch = np.ndarray(shape=(batch_size, window), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    sample = df.sample(n=batch_size // 10).reset_index()
    assert sample["Length"].sum() >= batch_size, "Not enough data: %d (< %d)" % (sample["Length"].sum(), batch_size)
    cnt = 0  # number of examples filled so far
    row = 0
    while cnt < batch_size:
        buffer_ = [null_id] * window + sample.loc[row, "Encoding"]  # left-pad with NULL
        for i in xrange(len(buffer_) - window):
            if cnt < batch_size:
                batch[cnt, :] = buffer_[i: i + window]
                labels[cnt, 0] = buffer_[i + window]
                cnt += 1
            else:
                break
        row += 1
    return batch, labels
In [239]:
batch, labels = get_batch(df_sent, 128, window=5)
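To see what a training example looks like, one row of the batch can be decoded back to words; this is a quick check added here, using only id2word and the batch/labels just produced.
In [ ]:
# Decode the first training example back to words: `window` context words -> target.
context = " ".join(id2word[i] for i in batch[0])
target = id2word[labels[0, 0]]
print "context: [%s]  ->  target: %s" % (context, target)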
In [236]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import threading
import time
from six.moves import xrange # pylint: disable=redefined-builtin
import numpy as np
import tensorflow as tf
In [240]:
# Pre-requisites:
# 1. word2id
# 2. id2word
# 3. df_sent: ["Qid", "Sentence", "Encoding", "Length"]
flags = tf.app.flags
flags.DEFINE_string("train_data", None, "Doc2Vec training data")
flags.DEFINE_integer("doc_embedding_size", 200, "embedding size for paragraph")
flags.DEFINE_integer("word_embedding_size", 100, "embedding size for word")
flags.DEFINE_integer("vocab_size", len(word2id), "vocabulary size")
flags.DEFINE_integer("num_paragraphs", len(set(df_sent['Qid'])), "number of paragraphs in the training set")
flags.DEFINE_boolean("concat", True, "whether to concatenate paragraph embedding with word embeddings")
flags.DEFINE_boolean("sum_", False, "whether to sum all embeddings (both paragraph and word)")
flags.DEFINE_boolean("average", False, "whether to average all embeddings (both paragraph and word)")
flags.DEFINE_integer("window", 4, "number of preceding words used to predict next word")
FLAGS = flags.FLAGS
In [ ]:
class Options(object):
    """
    Options used by the doc2vec model.
    """
    def __init__(self):
        self.train_data = FLAGS.train_data
        self.wd_emb_dim = FLAGS.word_embedding_size
        self.ph_emb_dim = FLAGS.doc_embedding_size
        self.vocab_size = FLAGS.vocab_size
        self.num_paragraphs = FLAGS.num_paragraphs
        self.concat = FLAGS.concat
        self.sum_ = FLAGS.sum_
        self.average = FLAGS.average
        assert self.concat or self.sum_ or self.average, \
            "Input embeddings must be either concatenated or summed/averaged"
        if not self.concat:
            assert self.sum_ or self.average, \
                "If not concatenated, input embeddings must be either summed or averaged"
            assert self.wd_emb_dim == self.ph_emb_dim, \
                "If not concatenated, paragraph embeddings must have the same size as word embeddings"
        self.window = FLAGS.window
In [ ]:
class Doc2Vec(object):
    """
    Distributed Memory Model of Paragraph Vectors (PV-DM).
    """
    def __init__(self, options, session):
        self._options = options
        self._session = session
        self._wd_embed = None
        self._ph_embed = None
        self.global_step = None

    def forward(self, batch_data, batch_labels):
        opts = self._options
        # Word and paragraph embeddings
        wd_embed_init_width = 0.5 / opts.wd_emb_dim
        ph_embed_init_width = 0.5 / opts.ph_emb_dim
        wd_embed = tf.Variable(tf.random_uniform([opts.vocab_size, opts.wd_emb_dim], -wd_embed_init_width,
                                                 wd_embed_init_width), name="word_embedding")
        ph_embed = tf.Variable(tf.random_uniform([opts.num_paragraphs, opts.ph_emb_dim], -ph_embed_init_width,
                                                 ph_embed_init_width), name="paragraph_embedding")
        self._wd_embed = wd_embed
        self._ph_embed = ph_embed
        # Softmax weights & biases
        if opts.concat:
            self.sm_wgt = tf.Variable(tf.zeros([opts.vocab_size, opts.ph_emb_dim + opts.window * opts.wd_emb_dim]),
                                      name="softmax_weights")
        else:
            self.sm_wgt = tf.Variable(tf.zeros([opts.vocab_size, opts.wd_emb_dim]), name="softmax_weights")
        self.sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="softmax_biases")
        # Global step
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
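The forward pass stops at the variable definitions. A minimal sketch of how the PV-DM hidden layer and a sampled-softmax loss could be wired on top of these variables follows; the helper name pv_dm_loss, the placeholder-style arguments, and the use of tf.nn.sampled_softmax_loss with num_sampled=64 are assumptions, not part of the original notebook.
In [ ]:
# Sketch only (assumed, not the notebook's final code): combine the paragraph
# vector with the `window` preceding word vectors and score with sampled softmax.
def pv_dm_loss(model, batch_data, batch_ph_ids, batch_labels, num_sampled=64):
    opts = model._options
    wd_vecs = tf.nn.embedding_lookup(model._wd_embed, batch_data)    # [batch, window, wd_emb_dim]
    ph_vecs = tf.nn.embedding_lookup(model._ph_embed, batch_ph_ids)  # [batch, ph_emb_dim]
    if opts.concat:
        # flatten the context words and concatenate the paragraph vector
        wd_flat = tf.reshape(wd_vecs, [-1, opts.window * opts.wd_emb_dim])
        hidden = tf.concat([ph_vecs, wd_flat], 1)
    else:
        hidden = tf.reduce_sum(wd_vecs, 1) + ph_vecs
        if opts.average:
            hidden = hidden / (opts.window + 1)
    # sampled softmax over the vocabulary, using the weights built in forward()
    return tf.reduce_mean(tf.nn.sampled_softmax_loss(
        weights=model.sm_wgt, biases=model.sm_b, labels=batch_labels,
        inputs=hidden, num_sampled=num_sampled, num_classes=opts.vocab_size))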