In [1]:
import glob
import pickle
import re
import numpy as np
import pandas as pd
from random import shuffle
from tensorflow.contrib import learn
In [24]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # Strip HTML line breaks first; the character filter below would otherwise
    # split "<br />" into stray tokens before this rule ever matched.
    string = re.sub(r"<br />", " ", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def get_reviews(path, clean=True):
    # Each aclImdb review file holds a single line of text
    files = glob.glob(path + '/*.txt')
    reviews = []
    for rev in files:
        with open(rev) as f:
            reviews.append(f.readline().strip())
    if clean:
        reviews = [clean_str(rev) for rev in reviews]
    return reviews
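A quick sanity check of clean_str (the input string below is made up for illustration):
In [ ]:
# Hypothetical example review, just to show the cleaning rules
print(clean_str("I loved it!<br />It's GREAT."))
# -> i loved it ! it 's great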
In [22]:
# Gets all the reviews
train_positive_reviews = get_reviews("data/aclImdb/train/pos")
train_negative_reviews = get_reviews("data/aclImdb/train/neg")
test_positive_reviews = get_reviews("data/aclImdb/test/pos")
test_negative_reviews = get_reviews("data/aclImdb/test/neg")
# Concatenate all train reviews and write them to a file
# (the train/validation split happens in later cells)
train_reviews = train_positive_reviews + train_negative_reviews
with open('data/all_train.txt', 'w') as output_train:
    for rev in train_reviews:
        output_train.write(rev + '\n')
In [23]:
# Saves the Train/Test lists into pickle objects
with open("data/train_pos.p", "wb") as f:
    pickle.dump(train_positive_reviews, f)
with open("data/train_neg.p", "wb") as f:
    pickle.dump(train_negative_reviews, f)
with open("data/test_pos.p", "wb") as f:
    pickle.dump(test_positive_reviews, f)
with open("data/test_neg.p", "wb") as f:
    pickle.dump(test_negative_reviews, f)
In [2]:
# Loads the Train/Test objects
train_positive_reviews = pickle.load(open("data/train_pos.p","rb"))
train_negative_reviews = pickle.load(open("data/train_neg.p","rb"))
test_positive_reviews = pickle.load(open("data/test_pos.p","rb"))
test_negative_reviews = pickle.load(open("data/test_neg.p","rb"))
In [3]:
def get_train_sets():
    train_positive_reviews = pickle.load(open("data/train_pos.p", "rb"))
    train_negative_reviews = pickle.load(open("data/train_neg.p", "rb"))
    return train_positive_reviews, train_negative_reviews

def get_test_sets():
    test_positive_reviews = pickle.load(open("data/test_pos.p", "rb"))
    test_negative_reviews = pickle.load(open("data/test_neg.p", "rb"))
    return test_positive_reviews, test_negative_reviews

def label_data(positive_revs, negative_revs):
    # Generate the labels: positive -> [0, 1], negative -> [1, 0]
    positive_labels = [[0, 1] for _ in positive_revs]
    negative_labels = [[1, 0] for _ in negative_revs]
    # Concatenate the positive and negative labels
    y_labels = np.concatenate([positive_labels, negative_labels], 0)
    x_data = positive_revs + negative_revs
    return [x_data, y_labels]

def __split_train_validation(x_train, y_train, amount_val=.25):
    # Array-based variant; a list-based split_train_validation is defined in a later cell
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    print(shuffle_indices)
    x_shuffled = x_train[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    total_reviews = len(x_shuffled)
    training_num = total_reviews - int(total_reviews * amount_val)
    x_t = x_shuffled[:training_num]
    y_t = y_shuffled[:training_num]
    x_dev = x_shuffled[training_num:]
    y_dev = y_shuffled[training_num:]
    return [x_t, y_t], [x_dev, y_dev]

def get_train_validation(train_pos, train_neg, amount_val=.25):
    # Divide the sets
    total_reviews = len(train_pos)
    print("Num Total Reviews in set:", total_reviews)
    training_num = total_reviews - int(total_reviews * amount_val)
    print("Num Training Reviews:", training_num)
    train_pos_reviews_t = train_pos[:training_num]
    train_neg_reviews_t = train_neg[:training_num]
    train_pos_reviews_v = train_pos[training_num:]
    train_neg_reviews_v = train_neg[training_num:]
    # Generate the labels
    train_positive_labels = [[0, 1] for _ in train_pos_reviews_t]
    val_positive_labels = [[0, 1] for _ in train_pos_reviews_v]
    train_negative_labels = [[1, 0] for _ in train_neg_reviews_t]
    val_negative_labels = [[1, 0] for _ in train_neg_reviews_v]
    # Concatenate the positive and negative labels for train and val
    y_train = np.concatenate([train_positive_labels, train_negative_labels], 0)
    y_val = np.concatenate([val_positive_labels, val_negative_labels], 0)
    # Create one list for positive and negative reviews
    x_train = train_pos_reviews_t + train_neg_reviews_t
    x_val = train_pos_reviews_v + train_neg_reviews_v
    print("x_train:", len(x_train))
    print("y_train:", len(y_train))
    print("x_val:", len(x_val))
    print("y_val:", len(y_val))
    return [x_train, y_train], [x_val, y_val]

def get_test_labeled(test_pos, test_neg):
    # Generate the labels
    test_positive_labels = [[0, 1] for _ in test_pos]
    test_negative_labels = [[1, 0] for _ in test_neg]
    y = np.concatenate([test_positive_labels, test_negative_labels], 0)
    x_test = test_pos + test_neg
    return [x_test, y]

#train, validation = get_train_validation(train_positive_reviews, train_negative_reviews)
x_t, y_t = label_data(train_positive_reviews, train_negative_reviews)
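As a sanity check, the aclImdb train split contains 12,500 positive and 12,500 negative reviews, so the labeled set should have 25,000 entries:
In [ ]:
print(len(x_t))    # expect 25000
print(y_t.shape)   # expect (25000, 2)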
In [4]:
# Label the data
x_train, y_train = label_data(train_positive_reviews, train_negative_reviews)
# Separate into Train and Dev (split_train_validation is defined in the next cell)
x_train_list, x_dev_list = split_train_validation(x_train, y_train)
In [7]:
# Shuffle the data, then split into train/dev
def split_train_validation(x_train, y_train, amount_val=.25):
    x_train_shuffled = []
    y_train_shuffled = []
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    for i in shuffle_indices:
        x_train_shuffled.append(x_train[i])
        y_train_shuffled.append(y_train[i])
    total_reviews = len(x_train_shuffled)
    training_num = total_reviews - int(total_reviews * amount_val)
    x_t = x_train_shuffled[:training_num]
    y_t = y_train_shuffled[:training_num]
    x_dev = x_train_shuffled[training_num:]
    y_dev = y_train_shuffled[training_num:]
    return [x_t, y_t], [x_dev, y_dev]

# Separate into Train and Dev
x_train_list, x_dev_list = split_train_validation(x_t, y_t)
print(len(x_train_list[0]))
print(len(x_train_list[1]))
In [13]:
print(x_dev_list[0][1])
print(x_dev_list[1][1])
In [32]:
# Loads the vocabulary
def load_vocabulary(file_path, num_words=10000):
    from itertools import islice  # stops cleanly if the file has fewer than num_words lines
    with open(file_path) as vocab_file:
        return [line.strip() for line in islice(vocab_file, num_words)]
#
#load_vocabulary("data/vocab_unigrams_no_counts/part-00000")
In [ ]:
# Spark Unigrams
# all_train.txt as written above (path relative to the Spark working directory)
text_file = sc.textFile('all_train.txt')
counts = (text_file.flatMap(lambda line: line.split(" "))
          .map(lambda word: (word, 1))
          .reduceByKey(lambda a, b: a + b)
          .sortBy(lambda a: -a[1]))
# Comment this line if you want (word, count) tuples instead
just_words = counts.map(lambda pair: pair[0])
just_words.saveAsTextFile("vocab_unigrams_no_counts")
# Spark Bi-grams
bigrams = (text_file.map(lambda x: x.split())
           .flatMap(lambda x: [((x[i], x[i + 1]), 1) for i in range(0, len(x) - 1)]))
count_bigrams = bigrams.reduceByKey(lambda x, y: x + y).sortBy(lambda a: -a[1])
just_bigrams = count_bigrams.map(lambda pair: pair[0][0] + ' ' + pair[0][1])
just_bigrams.saveAsTextFile("vocab_bigrams_no_counts")
# Note: this saves the same RDD under the OOV name; presumably the job was
# rerun on the OOV-tagged text (reviews_oov.txt) to produce the real OOV bigrams
just_bigrams.saveAsTextFile("vocab_oov_bigrams_no_counts")
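For reference, a minimal pure-Python sketch of what the Spark unigram job computes (per-word counts sorted by descending frequency); sample_lines is a hypothetical stand-in for the contents of all_train.txt:
In [ ]:
from collections import Counter

# Hypothetical two-line sample of the training text
sample_lines = ["the movie was great", "the plot was thin"]
unigram_counts = Counter(word for line in sample_lines for word in line.split(" "))
# most_common() mirrors the Spark sortBy(lambda a: -a[1])
print(unigram_counts.most_common(3))  # [('the', 2), ('was', 2), ('movie', 1)]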
In [10]:
# This is a test for the vocabulary
vocabulary = load_vocabulary("data/vocab_unigrams_no_counts/part-00000")
vocabulary[:5]
max_len_vocabulary = len(vocabulary)
print(max_len_vocabulary)
In [8]:
train_reviews = train_positive_reviews + train_negative_reviews
print(len(train_reviews))
In [16]:
def set_oov(reviews, vocabulary):
    # Use a set for O(1) membership tests; scanning a list per word is very slow
    vocab_set = set(vocabulary)
    updated_reviews = []
    for review in reviews:
        splitted_review = review.split(" ")
        for i, word in enumerate(splitted_review):
            if word not in vocab_set:
                splitted_review[i] = 'oov'
        updated_reviews.append(' '.join(splitted_review))
    return updated_reviews

def set_oov_tag(reviews, vocabulary):
    updated_reviews = []
    set_vocabulary = set(vocabulary)
    for review in reviews:
        set_review = set(review.split(" "))
        oov_words = set_review - set_vocabulary
        #print(list(oov_words))
        dic_oov_words = {k: 'oov' for k in oov_words}
        #print(dic_oov_words)
        if len(dic_oov_words) >= 1:
            rep = dict((re.escape(k), v) for k, v in dic_oov_words.items())
            # \b anchors keep the substitution from matching inside longer words
            # (handles normal word tokens; punctuation-only tokens are left alone)
            pattern = re.compile(r"\b(" + "|".join(rep.keys()) + r")\b")
            oov_review = pattern.sub(lambda m: rep[re.escape(m.group(0))], review)
            updated_reviews.append(oov_review)
        else:
            updated_reviews.append(review)
    return updated_reviews

oov_reviews = set_oov(train_reviews, vocabulary)
#print(len(new_reviews))
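A small illustration of the OOV tagging with a toy vocabulary (both inputs are made up):
In [ ]:
toy_vocab = ["the", "movie", "was"]  # hypothetical tiny vocabulary
print(set_oov(["the movie was amazing"], toy_vocab))
# -> ['the movie was oov']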
In [24]:
print(len(oov_reviews))
super_review = ' '.join(oov_reviews)
In [26]:
# Prepares Train/Dev for FaceBook FastText
# Loads the Data
train_positive_reviews = pickle.load(open("data/train_pos.p","rb"))
train_negative_reviews = pickle.load(open("data/train_neg.p","rb"))
# For each review append the label
train_pos_reviews_labeled = [x + ' __label__1' for x in train_positive_reviews]
train_neg_reviews_labeled = [x + ' __label__0' for x in train_negative_reviews]
In [29]:
fb_reviews = train_pos_reviews_labeled + train_neg_reviews_labeled
shuffle(fb_reviews)
print(fb_reviews[0])
with open('fastText/fb_train_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_train:
    output_fb_train.write('\n'.join(fb_reviews))
In [30]:
# Prepares Test for Facebook FastText
test_positive_reviews = pickle.load(open("data/test_pos.p","rb"))
test_negative_reviews = pickle.load(open("data/test_neg.p","rb"))
# For each review append the label
test_pos_reviews_labeled = [x + ' __label__1' for x in test_positive_reviews]
test_neg_reviews_labeled = [x + ' __label__0' for x in test_negative_reviews]
fb_test_reviews = test_pos_reviews_labeled + test_neg_reviews_labeled
shuffle(fb_test_reviews)
with open('fastText/fb_test_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_test:
    output_fb_test.write('\n'.join(fb_test_reviews))
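The shuffled train/test files above are in fastText's supervised format (a `__label__` token per line). The prediction files read further down were presumably produced with the fastText CLI, along the lines of `./fasttext supervised -input fastText/fb_train_shuffled.txt -output model` followed by `./fasttext predict model.bin fastText/fb_test_shuffled.txt > fastText/fb_shuffled_preds.txt`.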
In [31]:
train_positive_reviews = get_reviews("data/aclImdb/train/pos", clean=False)
train_negative_reviews = get_reviews("data/aclImdb/train/neg", clean=False)
test_positive_reviews = get_reviews("data/aclImdb/test/pos", clean=False)
test_negative_reviews = get_reviews("data/aclImdb/test/neg", clean=False)
from random import shuffle
# For each review append the label
train_pos_reviews_labeled = [x + ' __label__1' for x in train_positive_reviews]
train_neg_reviews_labeled = [x + ' __label__0' for x in train_negative_reviews]
fb_reviews = train_pos_reviews_labeled + train_neg_reviews_labeled
shuffle(fb_reviews)
with open('fastText/fb_train_unclean_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_train:
    output_fb_train.write('\n'.join(fb_reviews))
#=============================
# For each review append the label
test_pos_reviews_labeled = [x + ' __label__1' for x in test_positive_reviews]
test_neg_reviews_labeled = [x + ' __label__0' for x in test_negative_reviews]
fb_test_reviews = test_pos_reviews_labeled + test_neg_reviews_labeled
shuffle(fb_test_reviews)
with open('fastText/fb_test_unclean_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_test:
    output_fb_test.write('\n'.join(fb_test_reviews))
fb_test_reviews[0]
Out[31]:
In [13]:
# Obtain the labels
# Open the files
# For Clean Shuffled
clean_shuffled = open('fastText/fb_test_shuffled.txt').readlines()
print(len(clean_shuffled))
clean_shuffled_labels = [str(rev.split(' ')[-1]).strip() for rev in clean_shuffled]
clean_preds = open('fastText/fb_shuffled_preds.txt').readlines()
clean_preds = [str(lab).strip() for lab in clean_preds]
count = 0
for i in range(len(clean_shuffled_labels)):
    if clean_shuffled_labels[i] == clean_preds[i]:
        count += 1
print("Accuracy:", count / len(clean_shuffled_labels))
In [16]:
# For Unclean Shuffled
unclean_shuffled = open('fastText/fb_test_unclean_shuffled.txt').readlines()
print(len(unclean_shuffled))
unclean_shuffled_labels = [str(rev.split(' ')[-1]).strip() for rev in unclean_shuffled]
unclean_preds = open('fastText/fb_unclean_shuffled_preds.txt').readlines()
unclean_preds = [str(lab).strip() for lab in unclean_preds]
count = 0
for i in range(len(unclean_shuffled_labels)):
    if unclean_shuffled_labels[i] == unclean_preds[i]:
        count += 1
print("Accuracy:", count / len(unclean_shuffled_labels))
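The two accuracy cells above repeat the same logic; a small helper that factors it out (a sketch, not part of the original runs):
In [ ]:
def label_accuracy(test_path, preds_path):
    # Gold label is the last token of each test line; one predicted label per line
    gold = [line.split(' ')[-1].strip() for line in open(test_path)]
    preds = [line.strip() for line in open(preds_path)]
    return sum(g == p for g, p in zip(gold, preds)) / len(gold)

#label_accuracy('fastText/fb_test_shuffled.txt', 'fastText/fb_shuffled_preds.txt')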
BIGRAMS
In [34]:
x_train_reviews_oov = pickle.load(open("data/reviews_oov.p", "rb"))
# Write the OOV-tagged reviews to a file
with open('data/reviews_oov.txt', mode='wt', encoding='utf-8') as output_reviews_oov:
    output_reviews_oov.write('\n'.join(x_train_reviews_oov))
In [42]:
rev_test = x_train_reviews_oov[0]
# Load vocab
bi_vocabulary = load_vocabulary("data/vocab_oov_bigrams_no_counts/part-00000")

def find_bigrams(review, vocabulary):
    # Use a set for fast membership tests; vocabulary entries are "word1 word2"
    vocab_set = set(vocabulary)
    split_review = review.split(' ')
    zipped = zip(split_review, split_review[1:])
    bigrams = [x[0] + '_' + x[1] if x[0] + ' ' + x[1] in vocab_set else 'oov' for x in zipped]
    print(len(bigrams))
    return ' '.join(bigrams)

#[find_bigrams(rev, bi_vocabulary) for rev in x_train_reviews_oov]
find_bigrams(rev_test, bi_vocabulary)
Out[42]:
In [39]:
# Each input string is treated as one document; fit_transform yields a
# length-10 padded id vector per document
vp = learn.preprocessing.VocabularyProcessor(10)
list(vp.fit_transform(["a", "dog", "ran", "in", "the", "park"]))
Out[39]:
In [35]:
# Quick check: zip two lists and concatenate pairwise
x = [[1, 2, 3], [4, 5, 6]]
y = [['a', 'b', 'c'], ['d', 'e', 'f']]
zipped = zip(x, y)
final_revs = [a + b for a, b in zipped]
final_revs
Out[35]:
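The zip test above suggests how the unigram and bigram views of each review could be merged into a single token sequence; a sketch under that assumption (find_bigrams and the loaded data are as defined above; the concatenation itself is an assumed next step, not shown in the original):
In [ ]:
# Assumed combination step: join each OOV-tagged review with its bigram string
bigram_revs = [find_bigrams(rev, bi_vocabulary) for rev in x_train_reviews_oov]
combined_revs = [u + ' ' + b for u, b in zip(x_train_reviews_oov, bigram_revs)]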