Explore Data


In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import os.path as osp
import re
from bs4 import BeautifulSoup

In [2]:
train_data_dir = '../data/train'
test_data_dir = '../data/test/test.csv'
topics = ['biology', 'cooking', 'crypto', 'diy', 'robotics', 'travel']

In [3]:
# combine the per-topic CSVs into one dataframe
df_list = []
for i, t in enumerate(topics):
    df_local = pd.read_csv(osp.join(train_data_dir, t + '.csv'), encoding='utf-8')
    df_local['topic'] = i
    df_list.append(df_local)
df = pd.concat(df_list)
df["Qid"] = df["topic"].map(str) + "-" + df["id"].map(str)  # globally unique question id
df.reset_index(inplace=True)
df.drop(["index", "id", "topic"], axis=1, inplace=True)
del df_list

In [4]:
df.tail()


Out[4]:
title content tags Qid
86995 Tipping in USA.California.SF <p>Being a <a href="https://gsamaras.wordpress... usa food-and-drink california tipping san-fran... 5-78013
86996 As a Canadian, what are appropriate gifts to b... <p>I live in Vancouver, Canada and will be tra... uk canada france culture gifts 5-78016
86997 Does Macedonian police issue visitors a regist... <p>In Serbia and Macedonia, you have to Regist... customs-and-immigration officials registration... 5-78018
86998 Can my Austrian Visa D be renewed? <p>I am in Austria on a Visa D multiple entry,... visas austria 5-78019
86999 Shortest wide-body route from Frankfurt <p>What is the shortest passenger route served... untagged 5-78022

In [5]:
# peek at one randomly sampled row
def peek():
    df_slice = df.sample(1)
    for col in df_slice.columns.values:
        print list(df_slice[col])[0]

In [6]:
peek()


Cat6/Low Voltage Termination in a wooden enclosure
<p>I have been bouncing around the idea of running CAT6 thru my older (1970s) house.  Single story with an unfinished attic</p>

<p>I have identified a bedroom closet as the potential location for the central wiring location.  The closet is on the outer wall of the house.</p>

<p>Rather than install a patch panel in the closet itself and have to deal with tidying up all of the resulting cables, I was toying with the idea of a wooden enclosure directly above the closet</p>

<p>Something along the lines of this
<a href="http://www.ikea.com/us/en/catalog/products/S89046815/" rel="nofollow">http://www.ikea.com/us/en/catalog/products/S89046815/</a>
With a pair of 6U rails to mount the patch panels.  Rails would be mounted about 6" in on the client cable side and 9" in on the termination side</p>

<p>Nail some attic decking onto the joists and secure the enclosure onto the decking</p>

<p>Run all cables up into the attic
Drill a 4" hole on the side of the enclosure to feed cables from wall jacks (or maybe drill a series of holes and just run smurf tube directly into the enclosure)
Drill a 3" hole on the bottom and run a short length of PVC to feed the switch (24 port for now but possibly 48 later) or pass patch cables from router and VOIP up to the panel.  </p>

<p>Would this raise any code violations?<br>
Potential fire hazard maybe from terminating inside a wooden enclosure in the unfinished attic?
Any structural code issues from putting this near the outer wall/studs.</p>

<p>Any other problems this might cause that I'm overlooking?</p>

electrical code-compliance attic data-wiring
3-77029

Pre-processing


In [7]:
def pre_process(data=df):
    #1: remove html tags
    removeHtmlTags = lambda x: BeautifulSoup(x, "html.parser").text
    data["content_cleaned"] = data['content'].apply(removeHtmlTags)
    print "Removed html tags."
    #2: combine title and content
    tmp = data['title'] + " " + data['content_cleaned']
    print "Combined question title and content."
    #3: collapse repeated spaces and strip leading/trailing newlines
    clean = lambda x: re.sub("[ ]+", " ", x.strip("\n"))
    data["combined_text"] = tmp.apply(clean)
    print "Removed spaces and new lines."
    keeps = ['Qid', 'combined_text', 'tags']
    return data[keeps]

In [8]:
df = pre_process()


Removed html tags.
Combined question title and content.
Removed spaces and new lines.

In [9]:
peek()


5-57585
Non-EU citizen Schengen visa via NYC Swiss embassy I am applying for Schengen visa via Switzerland embassy in NYC.
One of their requirements is :

A copy of your confirmed flight reservation.

How do I purchase temporary tickets to meet this requirement?
visas schengen

Generate batch


In [18]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import FreqDist
import numpy as np
import cPickle as pkl
from collections import deque

In [151]:
def build_word_dict(df, limit=None):
    corpus = df["combined_text"].str.cat(sep=" ")
    corpus = re.sub("[ ]+", " ", corpus)  # one gigantic text
    words = word_tokenize(corpus, 'english')
    if limit is None:
        limit = len(set(words))
    vc = {}
    vc['UNKNOWN'] = 0  # reserve id 0 for out-of-vocabulary words
    for word, _ in FreqDist(words).most_common(limit):
        vc[word] = len(vc)  # may overwrite 'UNKNOWN' if it occurs in the corpus
    vc['NULL'] = len(vc)  # padding token
    vc['UNKNOWN'] = 0  # restore in case it was overwritten above
    rv_vc = {v: k for k, v in vc.iteritems()}  # reverse vocabulary: id -> word
    return vc, rv_vc

In [152]:
word2id, id2word = build_word_dict(df=df)

In [153]:
len(word2id)


Out[153]:
212590

In [154]:
word_dict_file = "../data/tmp/word2id.pkl"
reversed_word_dict_file = "../data/tmp/id2word.pkl"
with open(word_dict_file, "wb") as f:
    pkl.dump(word2id, f)
with open(reversed_word_dict_file, "wb") as f:
    pkl.dump(id2word, f)

In [19]:
word_dict_file = "../data/tmp/word2id.pkl"
reversed_word_dict_file = "../data/tmp/id2word.pkl"
if osp.exists(word_dict_file):
    with open(word_dict_file, "rb") as f:
        word2id = pkl.load(f)
if osp.exists(reversed_word_dict_file):
    with open(reversed_word_dict_file, "rb") as f:
        id2word = pkl.load(f)
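
As a quick sanity check on the two dictionaries, a sentence can be encoded and decoded back. This is only a sketch: the example sentence is made up, and any token missing from the vocabulary maps to id 0, which decodes to 'UNKNOWN'.

In [ ]:
# sanity check (sketch): encode a sentence to ids and decode it back
sentence = "How do I renew my Schengen visa ?"
encoded = [word2id.get(w, 0) for w in word_tokenize(sentence, "english")]
decoded = [id2word[i] for i in encoded]
print encoded
print decoded  # out-of-vocabulary tokens come back as 'UNKNOWN'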

In [233]:
def build_dataset(df, vocab):
    # split each question into sentences, keeping its Qid with every sentence
    ssplit = lambda row: sent_tokenize(row["combined_text"], "english")
    gendf = lambda sents, row: pd.DataFrame(zip([row["Qid"]] * len(sents), sents), columns=["Qid", "Sentence"])
    first = df.loc[0, :]
    sents = ssplit(first)
    df_sent = gendf(sents, first)
    nrow = len(df.index)
    disp = nrow / 10
    for idx in xrange(1, nrow):
        row = df.loc[idx, :]
        sents = ssplit(row)
        df_sent = df_sent.append(gendf(sents, row), ignore_index=True)
        if (idx + 1) % disp == 0:
            print "Processed: %d" % (idx + 1)
    # encode each sentence as a list of word ids; unknown words map to 0
    text2numeric = lambda s: [vocab[w] if w in vocab else 0 for w in word_tokenize(s, "english")]
    df_sent["Encoding"] = df_sent["Sentence"].apply(text2numeric)
    df_sent["Length"] = df_sent["Encoding"].apply(len)
    # keep only sentences with 4 to 49 tokens
    df_sent = df_sent.loc[df_sent["Length"] > 3, :]
    df_sent = df_sent.loc[df_sent["Length"] < 50, :]
    return df_sent

In [234]:
df_sent = build_dataset(df, word2id)


Processed: 8700
Processed: 17400
Processed: 26100
Processed: 34800
Processed: 43500
Processed: 52200
Processed: 60900
Processed: 69600
Processed: 78300
Processed: 87000

In [2]:
sents_file = "../data/tmp/sentences.csv"

In [11]:
df_sent.to_csv(sents_file, encoding='utf-8', index=False)

In [12]:
df_sent = pd.read_csv(sents_file, encoding='utf-8')
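
Note that after the CSV round trip the "Encoding" column comes back as strings rather than lists of ints, so it may need to be parsed again before batching. A minimal sketch using the standard library's ast.literal_eval:

In [ ]:
import ast
# "Encoding" is read back as a string such as "[12, 7, 0]"; parse it back into a list of ints
df_sent["Encoding"] = df_sent["Encoding"].apply(ast.literal_eval)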

In [13]:
print "Number of questions: %d" %len(set(df_sent['Qid']))


Number of questions: 86632
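
The paragraph embedding matrix defined later is indexed by integers, so each question also needs a dense integer index alongside its string Qid. One way to build that mapping (a sketch; the qid2idx and Pid names are only illustrative):

In [ ]:
# map each string Qid to a dense integer index for the paragraph embedding lookup (sketch)
qid2idx = {qid: i for i, qid in enumerate(sorted(set(df_sent['Qid'])))}
df_sent['Pid'] = df_sent['Qid'].map(qid2idx)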

In [238]:
def get_batch(df, batch_size, window=5, null_id=word2id["NULL"]):
    # use the paragraph plus the preceding `window` words to predict the next word
    batch = np.ndarray(shape=(batch_size, window), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    sample = df.sample(n=batch_size//10).reset_index()
    assert sample["Length"].sum() >= batch_size, "Not enough data: %d (< %d)" % (sample["Length"].sum(), batch_size)
    cnt = 0  # number of examples filled so far
    row = 0
    while cnt < batch_size:
        buffer_ = [null_id] * window + sample.loc[row, "Encoding"]  # left-pad with NULL ids
        for i in xrange(len(buffer_) - window):
            if cnt < batch_size:
                batch[cnt, :] = buffer_[i: i + window]
                labels[cnt, 0] = buffer_[i + window]
                cnt += 1
            else:
                break
        row += 1
    return batch, labels

In [239]:
batch, labels = get_batch(df_sent, 128, window=5)
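
To see what one training example contains, the first context window and its target can be decoded back into words. A quick sketch (NULL ids are the left padding added by get_batch):

In [ ]:
# decode the first context/target pair back into words (sketch)
print [id2word[i] for i in batch[0]]  # the preceding `window` words, left-padded with NULL
print id2word[labels[0, 0]]           # the target word to predict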

Build TensorFlow Model


In [236]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import threading
import time

from six.moves import xrange  # pylint: disable=redefined-builtin

import numpy as np
import tensorflow as tf

In [240]:
# Pre-requisites:
# 1. word2id
# 2. id2word
# 3. df_sent: ["Qid", "Sentence", "Encoding", "Length"]
flags = tf.app.flags
flags.DEFINE_string("train_data", None, "Doc2Vec training data")
flags.DEFINE_integer("doc_embedding_size", 200, "embedding size for paragraph")
flags.DEFINE_integer("word_embedding_size", 100, "embedding size for word")
flags.DEFINE_integer("vocab_size", len(word2id), "vocabulary size")
flags.DEFINE_integer("num_paragraphs", len(set(df_sent['Qid'])), "number of paragraphs in the training set")
flags.DEFINE_boolean("concat", True, "whether to concatenate paragraph embedding with word embeddings")
flags.DEFINE_boolean("sum_", False, "whether to sum all embeddings (both paragraph and word)")
flags.DEFINE_boolean("average", False, "whether to average all embeddings (both paragraph and word)")
flags.DEFINE_integer("window", 4, "number of preceding words used to predict next word")

FLAGS = flags.FLAGS

In [ ]:
class Options(object):
    """
    Options used by doc2vec model
    """
    
    def __init__(self):
        self.train_data = FLAGS.train_data
        self.wd_emb_dim = FLAGS.word_embedding_size
        self.ph_emb_dim = FLAGS.doc_embedding_size
        self.vocab_size = FLAGS.vocab_size
        self.num_paragraphs = FLAGS.num_paragraphs
        self.concat = FLAGS.concat
        self.sum_ = FLAGS.sum_
        self.average = FLAGS.average
        assert self.concat or self.sum_ or self.average, \
            "Must either concatenate or sum/average the input embeddings"
        if not self.concat:
            assert self.sum_ or self.average, "Must either sum or average the input embeddings"
            assert self.wd_emb_dim == self.ph_emb_dim, \
                "If not concatenated, paragraph embeddings must have the same size as word embeddings"
        self.window = FLAGS.window

In [ ]:
class Doc2Vec(object):
    """
    Distributed Memory Model of Paragraph Vectors (PV-DM).
    """
    
    def __init__(self, options, session):
        self._options = options
        self._session = session
        self._wd_embed = None
        self._ph_embed = None
        self.global_step = None
    
    def forward(self, batch_data, batch_labels):
        opts = self._options
        
        # Word and paragraph embeddings
        wd_embed_init_width = 0.5 / opts.wd_emb_dim
        ph_embed_init_width = 0.5 / opts.ph_emb_dim
        wd_embed = tf.Variable(tf.random_uniform([opts.vocab_size, opts.wd_emb_dim], -wd_embed_init_width,
                                                  wd_embed_init_width), name="word_embedding")
        ph_embed = tf.Variable(tf.random_uniform([opts.num_paragraphs, opts.ph_emb_dim], -ph_embed_init_width,
                                                  ph_embed_init_width), name="paragraph_embedding")
        self._wd_embed = wd_embed
        self._ph_embed = ph_embed
        
        # Softmax weight & biases
        if opts.concat:
            tf.sm_wgt = tf.Variable(tf.zeros([opts.vocab_size, opts.ph_emb_dim + opts.window * opts.wd_emb_dim]), 
                                   name="Softmax weights")
        else:
            tf.sm_wgt = tf.Variable(tf.zeros([opts.vocab_size, opts.wd_emb_dim]), name="Softmax weights")
        tf.sm_b = tf.Variable(tf.zeros([opts.vocab_size]), name="Softmax biases")
        
        # Global step
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
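
A rough sketch of how the paragraph and context-word embeddings could be combined and scored in PV-DM, assuming the TensorFlow 1.x API and a hypothetical batch_paragraphs tensor holding one paragraph index per example (get_batch above does not produce paragraph indices):

In [ ]:
# Sketch only: combining paragraph and context-word embeddings for PV-DM.
# `batch_paragraphs` is a hypothetical extra input, not produced by the code above.
def pv_dm_logits(opts, wd_embed, ph_embed, sm_wgt, sm_b, batch_data, batch_paragraphs):
    wd_vecs = tf.nn.embedding_lookup(wd_embed, batch_data)        # [batch, window, wd_emb_dim]
    ph_vecs = tf.nn.embedding_lookup(ph_embed, batch_paragraphs)  # [batch, ph_emb_dim]
    if opts.concat:
        # concatenate the paragraph vector with the flattened context-word vectors
        flat = tf.reshape(wd_vecs, [-1, opts.window * opts.wd_emb_dim])
        hidden = tf.concat([ph_vecs, flat], axis=1)
    else:
        # sum (or average) the paragraph vector with the context-word vectors
        hidden = ph_vecs + tf.reduce_sum(wd_vecs, axis=1)
        if opts.average:
            hidden = hidden / (opts.window + 1)
    # score against the softmax weights; in practice a sampled-softmax or NCE loss
    # would typically replace a full softmax over the vocabulary
    return tf.matmul(hidden, sm_wgt, transpose_b=True) + sm_b     # [batch, vocab_size]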