In [1]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
Unsurprisingly, the 20 Newsgroups data contains newsgroup text from 20 topics. The topics are as follows:
In [2]:
classes = list(newsgroups_train.target_names)
classes
Out[2]:
We'll limit ourselves to two classes for the sake of simplicity:
In [3]:
classes = ['alt.atheism', 'soc.religion.christian']
newsgroups_train = fetch_20newsgroups(subset='train', categories=classes)
from collections import Counter
Counter([classes[t] for t in newsgroups_train.target])
Out[3]:
Here's an example from the dataset:
In [4]:
newsgroups_train.data[0]
Out[4]:
In [5]:
newsgroups_train.target[0]
Out[5]:
Notice that the target is already converted into a class index; in this case, the text belongs to the class:
In [6]:
classes[newsgroups_train.target[0]]
Out[6]:
If you do not have CoreNLP, download it from here:
http://stanfordnlp.github.io/CoreNLP/index.html#download
We are going to use the Java server feature of CoreNLP to annotate data from Python. In the CoreNLP directory, run the server:
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
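The server listens on port 9000 by default. If you'd like to check that it is reachable before continuing, here is a minimal sketch using the requests library (assuming the default host and port, and that requests is installed):
In [ ]:
import requests
response = requests.get('http://localhost:9000')
print response.status_code  # 200 means the server is up and serving its landing page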
Next, we'll annotate an example to see how the server works.
In [7]:
from stanza.corenlp.client import Client
client = Client()
annotation = client.annotate(newsgroups_train.data[0], properties={'annotators': 'tokenize,ssplit,pos'})
annotation['sentences'][0]
Out[7]:
That was rather long, but the gist is that the annotation is organized into sentences, which are in turn organized into tokens. Each token carries a number of annotations (here we've only asked for the POS tags).
In [8]:
for token in annotation['sentences'][0]['tokens']:
    print token['word'], token['pos']
For our purposes, we're going to treat each document as one long sequence of words rather than a sequence of sequences (e.g. a list of sentences, each a list of words). We'll do this by passing in the ssplit.isOneSentence flag.
In [9]:
docs = []
labels = []
for doc, label in zip(newsgroups_train.data, newsgroups_train.target):
    try:
        annotation = client.annotate(doc, properties={'annotators': 'tokenize,ssplit', 'ssplit.isOneSentence': True})
        docs.append([t['word'] for t in annotation['sentences'][0]['tokens']])
        labels.append(label)
    except Exception as e:
        pass  # we're going to punt and ignore unicode errors...
print len(docs), len(labels)
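As a quick sanity check, let's look at the start of the first tokenized document and its label:
In [ ]:
print docs[0][:10]
print classes[labels[0]]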
We'll create a lightweight dataset object out of this. A Dataset is really a glorified dictionary of fields, where each field corresponds to an attribute of the examples in the dataset.
In [10]:
from stanza.text.dataset import Dataset
from pprint import pprint
dataset = Dataset({'X': docs, 'Y': labels})
# dataset supports, amongst other functionalities, shuffling:
print dataset.shuffle()
# indexing of a single element
pprint(dataset[0])
# indexing of multiple elements
pprint(dataset[:2])
n_train = int(0.7 * len(dataset))
train = Dataset(dataset[:n_train])
test = Dataset(dataset[n_train:])
print 'train: {}, test: {}'.format(len(train), len(test))
In [11]:
from stanza.text.vocab import Vocab
vocab = Vocab('***UNK***')
vocab
Out[11]:
We'll try our hand at some conversions:
In [12]:
sents = ['I like cats and dogs', 'I like nothing', 'I like cats and nothing else']
inds = []
for s in sents[:2]:
    inds.append(vocab.update(s.split()))
inds.append(vocab.words2indices(sents[2].split()))
for s, ind in zip(sents, inds):
    print 'read {}, which got mapped to indices {}\nrecovered: {}'.format(s, ind, vocab.indices2words(ind))
A common operation on vocabulary objects is to replace rare words with the UNKNOWN token. We'll replace words that occurred fewer than 2 times.
In [13]:
# this is actually a copy operation, because indices change when words are removed from the vocabulary
vocab = vocab.prune_rares(cutoff=2)
for s in sents:
    inds = vocab.words2indices(s.split())
    print vocab.indices2words(inds)
Now, we'll convert the entire dataset. The convert function applies a transform to the specified field of the dataset. We'll apply a transform using the vocabulary.
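Before converting the real data, here is a toy sketch to make the semantics of convert concrete (assuming, as in the cells above and below, that convert applies the given function to each example of the named field, and that in_place=False returns a new Dataset):
In [ ]:
toy = Dataset({'X': [['Hello', 'World'], ['Foo', 'Bar']], 'Y': [0, 1]})
lowered = toy.convert({'X': lambda words: [w.lower() for w in words]}, in_place=False)
print lowered.fields['X']  # lowercased copies
print toy.fields['X']      # the original is untouched because in_place=False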
In [14]:
from stanza.text.vocab import SennaVocab
vocab = SennaVocab()
# we'll actually just use the first 200 tokens of the document
max_len = 200
train = train.convert({'X': lambda x: x[:max_len]}, in_place=True)
test = test.convert({'X': lambda x: x[:max_len]}, in_place=True)
# make a backup
train_orig = train
test_orig = test
# first pass over the training set to collect word counts (the converted result is discarded below)
train = train_orig.convert({'X': vocab.update}, in_place=False)
vocab = vocab.prune_rares(cutoff=3)
# second pass: map words to indices with the pruned vocabulary
train = train_orig.convert({'X': vocab.words2indices}, in_place=False)
test = test_orig.convert({'X': vocab.words2indices}, in_place=False)
pad_index = vocab.add('***PAD***', count=100)
max_len = max([len(x) for x in train.fields['X'] + test.fields['X']])
print 'train: {}, test: {}'.format(len(train), len(test))
print 'vocab size: {}'.format(len(vocab))
print 'sequence max len: {}'.format(max_len)
print
print test[:2]
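The training loop later on will use Dataset.pad to bring every index sequence up to a fixed length with the ***PAD*** index we just added. Here is a rough sketch of that call (assuming Dataset.pad(sequences, index, length) pads each sequence to the given length with the given index, mirroring how it is called below):
In [ ]:
padded = Dataset.pad([[1, 2, 3], [4]], pad_index, 5)
print padded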
In [15]:
import tensorflow as tf
from tensorflow.models.rnn import rnn
from tensorflow.models.rnn.rnn_cell import LSTMCell
from stanza.ml.tensorflow_utils import labels_to_onehots
import numpy as np
np.random.seed(42)
embedding_size = 50
hidden_size = 100
seq_len = max_len
vocab_size = len(vocab)
class_size = len(classes)
# symbolic variable for word indices
indices = tf.placeholder(tf.int32, [None, seq_len])
# symbolic variable for labels
labels = tf.placeholder(tf.float32, [None, class_size])
In [16]:
# lookup table
with tf.device('/cpu:0'), tf.name_scope("embedding"):
    E = tf.Variable(
        tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
        name="emb")
    embeddings = tf.nn.embedding_lookup(E, indices)
    # rnn.rnn expects a list of per-timestep tensors, so split along the time dimension
    embeddings_list = [tf.squeeze(t, [1]) for t in tf.split(1, seq_len, embeddings)]
In [17]:
# rnn
cell = LSTMCell(hidden_size, embedding_size)
outputs, states = rnn.rnn(cell, embeddings_list, dtype=tf.float32)
final_output = outputs[-1]  # use the output at the final time step as the document representation
In [18]:
# classifier
def weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))
scores = tf.matmul(final_output, weights((hidden_size, class_size)))
In [19]:
# objective
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(scores, labels))
We'll optimize the network via Adam:
In [20]:
# operations
train_op = tf.train.AdamOptimizer(0.001, 0.9).minimize(cost)
predict_op = tf.argmax(scores, 1)
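The training loop below converts the integer labels into one-hot vectors with a bit of numpy fancy indexing. Here's that idiom in isolation:
In [ ]:
y = np.array([1, 0, 1])            # integer class labels for a batch of 3
onehot = np.zeros((3, class_size))
onehot[np.arange(3), y] = 1        # set one entry per row
print onehot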
In [27]:
from sklearn.metrics import accuracy_score
from time import time
batch_size = 128
num_epochs = 10
def run_epoch(split, train=False):
    epoch_cost = 0
    epoch_pred = []
    for i in xrange(0, len(split), batch_size):
        batch = split[i: i+batch_size]
        n = len(batch['Y'])
        # pad the index sequences to a fixed length and one-hot encode the labels
        X = Dataset.pad(batch['X'], pad_index, seq_len)
        Y = np.zeros((n, class_size))
        Y[np.arange(n), np.array(batch['Y'])] = 1
        if train:
            batch_cost, batch_pred, _ = session.run([cost, predict_op, train_op], {indices: X, labels: Y})
        else:
            batch_cost, batch_pred = session.run([cost, predict_op], {indices: X, labels: Y})
        epoch_cost += batch_cost * n
        epoch_pred += batch_pred.flatten().tolist()
    return epoch_cost, epoch_pred
def train_eval(session):
    for epoch in xrange(num_epochs):
        start = time()
        print 'epoch: {}'.format(epoch)
        epoch_cost, epoch_pred = run_epoch(train, True)
        print 'train cost: {}, acc: {}'.format(epoch_cost/len(train), accuracy_score(train.fields['Y'], epoch_pred))
        print 'time elapsed: {}'.format(time() - start)
    test_cost, test_pred = run_epoch(test, False)
    print '-' * 20
    print 'test cost: {}, acc: {}'.format(test_cost/len(test), accuracy_score(test.fields['Y'], test_pred))
with tf.Session() as session:
    tf.set_random_seed(123)
    session.run(tf.initialize_all_variables())
    train_eval(session)
Remember how we used SennaVocab? Let's see what happens if we preinitialize our embeddings:
In [28]:
preinit_op = E.assign(vocab.get_embeddings())
with tf.Session() as session:
    tf.set_random_seed(123)
    session.run(tf.initialize_all_variables())
    session.run(preinit_op)
    train_eval(session)