In [3]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.datasets import imdb
from keras import backend as K
from theano import function
In [6]:
print("Loading data...")
max_features = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words = max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
In [8]:
print("Pad sequences(samples x time)")
maxlen = 500
X_train = sequence.pad_sequences(X_train, maxlen = maxlen)
X_test = sequence.pad_sequences(X_test, maxlen = maxlen)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
In [9]:
print("Build model..")
model = Sequential()
model.add(Embedding(max_features, 128, input_length = maxlen))
In [10]:
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
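With the stack assembled, `model.summary()` is a quick sanity check of the layer output shapes and parameter counts before compiling (the exact counts depend on `max_features` and the embedding size chosen above).
In [ ]:
# Sanity check: layer output shapes and parameter counts for the stack above.
model.summary()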
In [13]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=["accuracy"])
print("Train..")
batch_size = 30
score = model.fit(X_train, y_train, batch_size=batch_size,
                  nb_epoch=4, validation_data=(X_test, y_test))
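After training, the test split can also be scored explicitly with `model.evaluate`; a minimal sketch (the returned pair matches the loss and metric passed to `compile`, and the numbers will vary between runs):
In [ ]:
# Hedged sketch: score the held-out test set with the compiled loss and accuracy metric.
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print("Test loss: %.4f, test accuracy: %.4f" % (test_loss, test_acc))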
In [16]:
import matplotlib.pyplot as plt
plt.plot(score.history['acc'])
plt.plot(score.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(score.history['loss'])
plt.plot(score.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
In [17]:
X_train[0]
Out[17]:
In [18]:
word_index = imdb.get_word_index()
index_word = {v:k for k,v in word_index.items()}
In [25]:
type(list(index_word.keys())[0])
Out[25]:
In [32]:
index_word[0] = '0'
' '.join(index_word[w] for w in X_train[0])
Out[32]:
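One caveat on the decode above: by default `imdb.load_data` reserves index 0 for padding, 1 for the start marker, and 2 for out-of-vocabulary words, and shifts real word ids by 3 (`index_from=3`), so a straight `word_index` lookup gives slightly shifted words. A hedged sketch of an offset-aware decode, assuming those defaults:
In [ ]:
# Hedged sketch: decode with the default index_from=3 offset of imdb.load_data.
# Assumes the default reserved indices: 0 = padding, 1 = start, 2 = out-of-vocabulary.
shifted_index_word = {v + 3: k for k, v in word_index.items()}
shifted_index_word.update({0: '<pad>', 1: '<start>', 2: '<oov>'})
' '.join(shifted_index_word.get(w, '<oov>') for w in X_train[0])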
In [91]:
X_train_words = []
for sentence in X_train:
    # Drop the padding index (the integer 0) before building the word lists
    X_train_words += [[index_word[w] for w in sentence if w != 0]]
In [67]:
import gensim
import logging
In [92]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
w2v_model = gensim.models.Word2Vec(X_train_words, min_count=1)
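A quick check that the model trained on something sensible: the vocabulary size and membership test below use the gensim 3.x `wv.vocab` attribute (renamed `key_to_index` in gensim 4).
In [ ]:
# Sanity check: vocabulary size of the freshly trained model (gensim 3.x API).
print(len(w2v_model.wv.vocab))
print('good' in w2v_model.wv.vocab)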
In [95]:
X_train_words[0]
# w2v_model.wv[u'wonderful']
Out[95]:
In [107]:
# w2v_model.wv['lucas']
# [index_word[w] for w in X_train[0]]
# w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
print(w2v_model.wv.similarity('good', 'spielberg'))
print(w2v_model.wv.similarity('good', 'tarantino'))
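Beyond pairwise similarity, `most_similar` lists the nearest neighbours of a word in the learned space; a minimal sketch (the neighbours depend on the training data and random seed):
In [ ]:
# Hedged sketch: nearest neighbours of an in-vocabulary word; results vary run to run.
w2v_model.wv.most_similar('wonderful', topn=5)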
In [47]:
from tempfile import mkstemp
In [48]:
fs, temp_path = mkstemp('word2vec_model_may9')
w2v_model.save(temp_path)
In [51]:
w2v_model.accuracy('questions-words.txt')
Out[51]:
In [70]:
# Reload the model from the temp path it was saved to above
w2v_model = gensim.models.Word2Vec.load(temp_path)
In [52]:
from gensim.corpora import WikiCorpus
In [53]:
import pandas as pd
In [55]:
train = pd.read_csv("labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0,
                              delimiter="\t", quoting=3)
In [58]:
# Verify the number of reviews that were read (100,000 in total)
print(train["review"].size, test["review"].size, unlabeled_train["review"].size)
In [59]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
In [60]:
def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return words
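A quick illustrative check of the helper on the first labelled review (the output depends on that review's text; the stop-word variant assumes the NLTK stopwords corpus has been downloaded):
In [ ]:
# Illustrative check of the cleaner on one review.
print(review_to_wordlist(train["review"][0])[:10])
# Requires the NLTK stopwords corpus (nltk.download('stopwords')).
print(review_to_wordlist(train["review"][0], remove_stopwords=True)[:10])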
In [61]:
# Download the punkt tokenizer for sentence splitting
import nltk.data
nltk.download('punkt')
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(review_to_wordlist(raw_sentence,
                                                remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
In [64]:
sentences = []  # Initialize an empty list of sentences
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)
print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)
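The parsed sentences can now feed a larger Word2Vec model than the one trained on the padded Keras sequences above. A minimal sketch with illustrative, untuned hyperparameters and a made-up variable name `review_w2v`, using the gensim 3.x `size` argument (renamed `vector_size` in gensim 4):
In [ ]:
# Hedged sketch: train Word2Vec on the parsed review sentences.
# Hyperparameter values are illustrative, not tuned here.
num_features = 300     # word vector dimensionality
min_word_count = 40    # ignore words occurring fewer times than this
context = 10           # context window size
review_w2v = gensim.models.Word2Vec(sentences, size=num_features,
                                    min_count=min_word_count,
                                    window=context, workers=4)
review_w2v.wv.most_similar('man', topn=5)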