In [18]:
import keras 
print(keras.__version__)
import scipy
print(scipy.__version__)
import theano
print(theano.__version__)
import sklearn
print(sklearn.__version__)


1.1.1
0.18.1
0.7.0
0.18.1

In [ ]:
from __future__ import print_function
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Flatten, Activation, Merge

In [9]:
np.random.seed(1337)

EmbeddingDim    = 50
MaxWords        = 30000
SequenceLength  = 50
Epochs          = 5
SamplesPerEpoch = 1000
BatchSize       = 64
Labels          = 3
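# Collapse the five-star ratings into three classes:
# 1-2 stars -> 0, 3 stars -> 1, 4-5 stars -> 2 (i.e. negative / neutral / positive).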
LabelMapping    = {
  1: 0,
  2: 0,
  3: 1,
  4: 2,
  5: 2
}

def oneHot(dictionarySize, wordIndex):
	vect = np.zeros(dictionarySize)
	if wordIndex > 0: vect[wordIndex] = 1
	return vect
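
# Quick sanity check (illustrative sizes, not from the original notebook):
# index 0 is reserved for padding and maps to the all-zero vector, while any
# other index sets exactly one component to 1.
assert not oneHot(5, 0).any()
assert np.array_equal(oneHot(5, 2), np.array([0., 0., 1., 0., 0.]))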

# From https://primes.utm.edu/lists/small/100ktwins.txt
Prime1 = 15327749
Prime2 = 18409199

# `sequence` must be a zero-padded sequence.
# From http://www.fit.vutbr.cz/~imikolov/rnnlm/thesis.pdf, equation 6.6
def biGramHash(sequence, t, buckets):
	t1 = sequence[t - 1] if t - 1 >= 0 else 0
	return (t1 * Prime1) % buckets

def triGramHash(sequence, t, buckets):
	t1 = sequence[t - 1] if t - 1 >= 0 else 0
	t2 = sequence[t - 2] if t - 2 >= 0 else 0

	return (t2 * Prime1 * Prime2 + t1 * Prime1) % buckets
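
# Worked example with made-up token indices: for the zero-padded sequence below
# and 1000 hash buckets, the bigram hash at t = 3 depends only on the previous
# token (7), while the trigram hash at t = 4 mixes the two previous tokens (7, 3).
exampleSequence = [0, 0, 7, 3, 12]
assert biGramHash(exampleSequence, 3, 1000) == (7 * Prime1) % 1000
assert triGramHash(exampleSequence, 4, 1000) == (7 * Prime1 * Prime2 + 3 * Prime1) % 1000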

def sentenceVector(tokeniser, dictionarySize, sentence, oneHotVectors, oneHotAveraged, contextHashes):
	result    = np.array([])
	sequences = tokeniser.texts_to_sequences([sentence])
	# Zero-pad every string
	padded    = pad_sequences(sequences, maxlen=SequenceLength)[0]

	if oneHotVectors:
		iptOneHot = [oneHot(dictionarySize, i) for i in padded]
		result = np.append(
			result,
			np.mean(iptOneHot, axis=0) if oneHotAveraged else np.concatenate(iptOneHot)
		)

	if contextHashes:
		buckets = np.zeros(dictionarySize * 2)
		for t in range(SequenceLength): buckets[biGramHash(padded, t, dictionarySize)] = 1
		for t in range(SequenceLength): buckets[dictionarySize + triGramHash(padded, t, dictionarySize)] = 1
		result = np.append(result, buckets)

	return result
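
# Shape check on a toy corpus (illustrative, not part of the original training
# run): with averaged one-hot vectors and context hashes both enabled, the
# feature vector has dictionarySize + 2 * dictionarySize components, matching
# oneHotDimension + contextHashesDimension in train() below.
toyTokeniser = Tokenizer(nb_words=MaxWords)
toyTokeniser.fit_on_texts(['great food', 'terrible service'])
toyDictionarySize = len(toyTokeniser.word_index) + 1
toyVector = sentenceVector(toyTokeniser, toyDictionarySize, 'great service', True, True, True)
assert toyVector.shape == (3 * toyDictionarySize,)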


def mapGenerator(generator, tokeniser, dictionarySize, oneHot, oneHotAveraged, contextHashes):
	for row in generator:
		sentence = row[0]
		label    = row[1]

		x = sentenceVector(tokeniser, dictionarySize, sentence, oneHot, oneHotAveraged, contextHashes)
		y = np.zeros(Labels)
		y[LabelMapping[label]] = 1
		yield (x[np.newaxis], y[np.newaxis])

def train(oneHot, oneHotAveraged, contextHashes):
	n = (Epochs + 1) * SamplesPerEpoch  # TODO + 1 should not be needed

	tokeniser = Tokenizer(nb_words=MaxWords)
	tokeniser.fit_on_texts((row[0] for row in trainingData(n)))

	# `word_index` maps each word to its unique index
	dictionarySize = len(tokeniser.word_index) + 1

	oneHotDimension        = (1 if oneHotAveraged else SequenceLength) * dictionarySize if oneHot else 0
	contextHashesDimension = dictionarySize * 2 if contextHashes else 0

	model = Sequential()
	model.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension)))
	model.add(Dense(Labels, activation='softmax'))
	model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

	trainingGenerator   = mapGenerator(trainingData(n),   tokeniser, dictionarySize, oneHot, oneHotAveraged, contextHashes)
	validationGenerator = mapGenerator(validationData(n), tokeniser, dictionarySize, oneHot, oneHotAveraged, contextHashes)

	model.fit_generator(trainingGenerator,
		nb_epoch=Epochs,
		samples_per_epoch=SamplesPerEpoch,
		validation_data=validationGenerator,
		nb_val_samples=SamplesPerEpoch)

	# Second model that exposes only the first layer, reusing its trained
	# weights, so EmbeddingDim-dimensional sentence embeddings can be read off.
	model2 = Sequential()
	model2.add(Dense(EmbeddingDim, input_dim=(oneHotDimension + contextHashesDimension), weights=model.layers[0].get_weights()))

	return model, model2, tokeniser, dictionarySize

# Builds the feature vector with the same flags that were passed to `train`
# (here they default to all-enabled, matching the training call below) and
# returns the model's class probabilities for `sentence`.
def query(model, tokeniser, dictionarySize, sentence, oneHotVectors=True, oneHotAveraged=True, contextHashes=True):
	concat = sentenceVector(tokeniser, dictionarySize, sentence, oneHotVectors, oneHotAveraged, contextHashes)
	return model.predict(np.asarray(concat)[np.newaxis])

In [10]:
import json
import codecs

DataSetPath = '/home/data/sentiment-analysis-and-text-classification/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'


def processFile(n, validation):
  with codecs.open(DataSetPath, encoding='iso-8859-1') as f:
    # For the validation split, skip the first n lines so that training and
    # validation draw from different parts of the file.
    if validation:
      for _ in range(n): next(f)

    for _ in range(n):
      line   = next(f).strip()
      review = json.loads(line)

      # Skip reviews longer than 50 words so they fit into SequenceLength tokens.
      while len(review['text'].split()) > 50:
        line   = next(f).strip()
        review = json.loads(line)

      yield (review['text'], int(review['stars']))

def trainingData(n):
  return processFile(n, validation = False)

def validationData(n):
  return processFile(n, validation = True)
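
# Both generators yield (reviewText, stars) tuples, e.g. a record of the form
# ('Great tacos, friendly staff.', 5) (made-up example); the star rating is
# later collapsed to a class index via LabelMapping.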

In [11]:
import time
import six.moves.cPickle
from sklearn.manifold import TSNE
import numpy as np

import csv

In [12]:
model, model2, tokeniser, dictionarySize = train(oneHot=True, oneHotAveraged=True, contextHashes=True)


Epoch 1/5
1000/1000 [==============================] - 24s - loss: 0.6913 - acc: 0.7510 - val_loss: 0.8134 - val_acc: 0.7270
Epoch 2/5
1000/1000 [==============================] - 24s - loss: 0.6052 - acc: 0.7840 - val_loss: 0.5645 - val_acc: 0.7980
Epoch 3/5
1000/1000 [==============================] - 25s - loss: 0.6514 - acc: 0.7590 - val_loss: 0.7004 - val_acc: 0.7820
Epoch 4/5
1000/1000 [==============================] - 25s - loss: 0.5726 - acc: 0.8090 - val_loss: 0.6256 - val_acc: 0.7840
Epoch 5/5
1000/1000 [==============================] - 26s - loss: 0.6181 - acc: 0.7660 - val_loss: 0.5753 - val_acc: 0.7740
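
In [ ]:
# Hypothetical follow-up cell (not part of the original run): score a made-up
# review with the trained model. `query` returns a 1 x 3 array of class
# probabilities in the order negative, neutral, positive (see LabelMapping).
query(model, tokeniser, dictionarySize, 'The food was fantastic and the staff were friendly!')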

In [ ]: