In [1]:
import glob
import pickle
import re
import numpy as np
import pandas as pd
from random import shuffle
from tensorflow.contrib import learn

In [24]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"<br />", " ", string)
    return string.strip().lower()

def get_reviews(path, clean = True):
    complete_path = path + '/*.txt'
    files = glob.glob(complete_path)    
    reviews = [str(open(rev).readlines()[0]).strip() for rev in files]
    if clean:
        reviews = [clean_str(rev) for rev in reviews]
    return reviews

In [22]:
# Gets all the reviews
train_positive_reviews = get_reviews("data/aclImdb/train/pos")
train_negative_reviews = get_reviews("data/aclImdb/train/neg")
test_positive_reviews = get_reviews("data/aclImdb/test/pos")
test_negative_reviews = get_reviews("data/aclImdb/test/neg")

# Divide The train set into train and validation

# Concat all train reviews and write it on a file
train_reviews = train_positive_reviews + train_negative_reviews
output_train = open('data/all_train.txt', 'w')
for rev in train_reviews:
    print>>output_train, rev
output_train.close()

In [23]:
# Saves the Train/Test lists into pickle objects
pickle.dump(train_positive_reviews, open( "data/train_pos.p", "wb" ))
pickle.dump(train_negative_reviews, open( "data/train_neg.p", "wb" ))
pickle.dump(test_positive_reviews, open( "data/test_pos.p", "wb" ))
pickle.dump(test_negative_reviews, open( "data/test_neg.p", "wb" ))

In [2]:
# Loads the Train/Test objects
train_positive_reviews = pickle.load(open("data/train_pos.p","rb"))
train_negative_reviews = pickle.load(open("data/train_neg.p","rb"))
test_positive_reviews = pickle.load(open("data/test_pos.p","rb"))
test_negative_reviews = pickle.load(open("data/test_neg.p","rb"))

In [3]:
def get_train_sets():
    train_positive_reviews = pickle.load(open("data/train_pos.p","rb"))
    train_negative_reviews = pickle.load(open("data/train_neg.p","rb"))
    return train_positive_reviews, train_negative_reviews

def get_test_sets():
    test_positive_reviews = pickle.load(open("data/test_pos.p","rb"))
    test_negative_reviews = pickle.load(open("data/test_neg.p","rb"))
    return test_positive_reviews, test_negative_reviews

def label_data(positive_revs, negative_revs):
    # Generate the labels
    positive_labels = [[0, 1] for _ in positive_revs]
    negative_labels = [[1, 0] for _ in negative_revs]
    
    # Concatenates the positive and negative labels for train and val
    y_labels = np.concatenate([positive_labels, negative_labels], 0)
    
    x_train = positive_revs + negative_revs
     
    return [x_train, y_labels]
    
def __split_train_validation(x_train, y_train, amount_val=.25):
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    print (shuffle_indices)
    x_shuffled = x_train[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    
    total_reviews = len(x_shuffled)
    training_num = total_reviews - int(total_reviews * amount_val)
    
    x_t = x_shuffled[:training_num]
    y_t = y_shuffled[:training_num]
    
    x_dev = x_shuffled[training_num:]
    y_dev = y_shuffled[training_num:]
    
    return [x_t, y_t], [x_dev, y_dev]

def get_train_validation(train_pos, train_neg, amount_val=.25):
    # Divides the sets
    total_reviews = len(train_pos)
    print("Num Total Reviews in set:", total_reviews)
    training_num = total_reviews - int(total_reviews * amount_val)
    print("Num Training Reviews:", training_num)
    
    train_pos_reviews_t = train_pos[:training_num]
    train_neg_reviews_t = train_neg[:training_num]
    train_pos_reviews_v = train_pos[training_num:]
    train_neg_reviews_v = train_neg[training_num:]
    
    # Generate the labels
    train_positive_labels = [[0, 1] for _ in train_pos_reviews_t]
    val_positive_labels = [[0, 1] for _ in train_pos_reviews_v]
    
    train_negative_labels = [[1, 0] for _ in train_neg_reviews_t]
    val_negative_labels = [[1, 0] for _ in train_neg_reviews_v]
    
    # Concatenates the positive and negative labels for train and val
    y_train = np.concatenate([train_positive_labels, train_negative_labels], 0)
    y_val = np.concatenate([val_positive_labels, val_negative_labels], 0)
    
    # Creates one list for positive and negative reviews
    x_train = train_pos_reviews_t + train_neg_reviews_t
    x_val = train_pos_reviews_v + train_neg_reviews_v
    
    print("x_train:", len(x_train))
    print("y_train:", len(y_train))
    print("x_val:", len(x_val))
    print("y_val:", len(y_val))
    
    return [x_train, y_train],[x_val, y_val]

def get_test_labeled(test_pos, test_neg):
    # Generate the labels
    test_positive_labels = [[0, 1] for _ in test_pos]
    test_negative_labels = [[1, 0] for _ in test_neg]
    
    y = np.concatenate([test_positive_labels, test_negative_labels], 0)
    x_test = test_pos + test_neg
    
    return [x_test, y]
    
#train, validation = get_train_validation(train_positive_reviews, train_negative_reviews)
x_t, y_t = label_data(train_positive_reviews, train_negative_reviews)

In [4]:
# Label the data
x_train, y_train = label_data(train_positive_reviews, train_negative_reviews)
# Separates in Train and Dev
x_train_list, x_dev_list = split_train_validation(x_train, y_train)


[18634  1333 20315 ..., 17728  7293 17673]

In [7]:
# Shuffle the data
def split_train_validation(x_train, y_train, amount_val=.25):
    x_train_shuffled = []
    y_train_shuffled = []
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    for i in shuffle_indices:
        x_train_shuffled.append(x_train[i])
        y_train_shuffled.append(y_train[i])
    
    total_reviews = len(x_train_shuffled)
    training_num = total_reviews - int(total_reviews * amount_val)

    x_t = x_train_shuffled[:training_num]
    y_t = y_train_shuffled[:training_num]

    x_dev = x_train_shuffled[training_num:]
    y_dev = y_train_shuffled[training_num:]

    return [x_t, y_t], [x_dev, y_dev]

# Separates in Train and Dev
x_train_list, x_dev_list = split_train_validation(x_t, y_t)
print(len(x_train_list[0]))
print(len(x_train_list[1]))


18750
18750

In [13]:
print(x_dev_list[0][1])
print(x_dev_list[1][1])


another of many nearly forgotten movies cranked out by poverty row in the 1930 's , resurrected by the magic of dvd starring stock universal player lionel atwill \( often a supporting actor in numerous frankenstein movies \) as a pair of twins involved in a murder racket one kills the victims \( stockbrokers involved in a scam \) and asks witnesses for the exact time , while the other is deaf and is proved innocent because he could not have spoken to witnesses of course , where it falls apart is if it was a congenital deafness , would n't they both be deaf \? oh , well atwill does a pretty good job here , faking being deaf and mute unfortunately , no one else here can really act worth a darn
[1 0]

In [32]:
# Loads the vocabulary
def load_vocabulary(file_path, num_words=10000):
    with open(file_path) as vocab:
        vocab_list = [next(vocab) for x in range(num_words)]
    vocab_list = [str(vocab).strip() for vocab in vocab_list]
    return vocab_list
#
#load_vocabulary("data/vocab_unigrams_no_counts/part-00000")

In [ ]:
# Spark Unigrams
text_file = sc.textFile('all_train.txt')
counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word:(word, 1)).reduceByKey(lambda a, b: a+b).sortBy(lambda a: -a[1])
# Comment this line, if you want tuples
just_words = counts.map(lambda tuple: tuple[0])
just_words.saveAsTextFile("vocab_unigrams_no_counts")

# Spark Bi-grams
bigrams = text_file.map(lambda x:x.split()).flatMap(lambda x: [((x[i],x[i+1]),1) for i in range(0,len(x)-1)])
count_bigrams = bigrams.reduceByKey(lambda x, y: x+y).sortBy(lambda a: -a[1])
just_bigrams = count_bigrams.map(lambda tuple: tuple[0][0] + ' ' + tuple[0][1])
just_bigrams.saveAsTextFile("vocab_bigrams_no_counts")
just_bigrams.saveAsTextFile("vocab_oov_bigrams_no_counts")

In [10]:
# This is a test for the vocabulary

vocabulary = load_vocabulary("data/vocab_unigrams_no_counts/part-00000")
vocabulary = [str(vocab).strip() for vocab in vocabulary]
vocabulary[:5]
max_len_vocabulary = len(vocabulary)
print (max_len_vocabulary)


10000

In [8]:
train_reviews = train_positive_reviews + train_negative_reviews
print(len(train_reviews))


25000

In [16]:
def set_oov(reviews, vocabulary):
    updated_reviews = []
    for review in reviews:
        up_review = []
        splitted_review = review.split(" ")
        for i, word in enumerate(splitted_review):
            if word not in vocabulary:
                splitted_review[i] = 'oov'
            else:
                splitted_review[i] = word
        new_review = (' ').join(splitted_review)
        updated_reviews.append(new_review)
    return updated_reviews
            
def set_oov_tag(reviews, vocabulary):
    updated_reviews = []
    set_vocabulary = set(vocabulary)
    for review in reviews:
        set_review = set(review.split(" "))
        oov_words = set_review - set_vocabulary
        #print(list(oov_words))
        
        dic_oov_words = {k:'oov' for k in oov_words}
        #print(dic_oov_words)
        if len(dic_oov_words) >= 1:
            rep = dict((re.escape(k), v) for k, v in dic_oov_words.items())
            pattern = re.compile("|".join(rep.keys()))
            oov_review = pattern.sub(lambda m: rep[re.escape(m.group(0))], review)
            updated_reviews.append(oov_review)
        else:
            updated_reviews.append(review)
    return updated_reviews

oov_reviews = set_oov(train_reviews, vocabulary)
#print(len(new_reviews))

In [24]:
print(len(oov_reviews))
super_review = ' '.join(oov_reviews)


25000

In [26]:
# Prepares Train/Dev for FaceBook FastText
# Loads the Data
train_positive_reviews = pickle.load(open("data/train_pos.p","rb"))
train_negative_reviews = pickle.load(open("data/train_neg.p","rb"))

# For each review append the label
train_pos_reviews_labeled = [x + ' __label__1' for x in train_positive_reviews]
train_neg_reviews_labeled = [x + ' __label__0' for x in train_negative_reviews]

In [29]:
fb_reviews = train_pos_reviews_labeled + train_neg_reviews_labeled

shuffle(fb_reviews)
print(fb_reviews[0])

with open('fastText/fb_train_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_train:
    output_fb_train.write('\n'.join(fb_reviews))


this is one of the most daring and important of the so called pre code films made in hollywood during the 1930s unlike some pre code films that occasionally dabbled in subjects that would have never been allowed after 1934 5 , this film fully immersed itself in a very sordid yet entertaining plot from start to finish the conventional morality of the late 30s and 40s was definitely not evident in this film , as the film is essentially about a conniving woman who sleeps her way to the top and with no apologies along the way this broad both enjoyed sex and used it on every man who could help her get rich something you just never would have seen in films made just two or three years later the film begins with barbara stanwyck working in her father 's speakeasy in addition to being her boss , he is also her pimp and encourages her to sleep with a local government official so that he 'll allow the illegal bar to operate with impunity while not especially clear here , it appears as if daddy has been renting his daughter 's body out for a long time however , after nearly being raped and attacking this man by breaking a bottle of beer over his skull , barbara has had enough and heads to the big city it does n't hurt that the still blew up and killed her father , but her feeling that she was whoring herself out and had nothing to show for it appeared to be the impetus to move despite the depression , barbara uses sex to get a job at a huge mega bank she starts out at a pretty menial job as a file clerk , but in the space of what seems like just a few weeks , she sleeps her way from one job to another to yet another until she is sleeping with the head of the bank and his future son in law ! ! ! this all ends in tragedy , but babs does n't seem too shook up over the deaths of these two men in fact , some time later , she is able to insinuate herself into the life of the new ceo and once again she 's on top \( perhaps in more way than one \) now so far , this is a wonderful movie because it was so gritty and unrepentant barbara played a 100 sociopath a woman with no morality and no conscience just a desire to squeeze as much out of life as she could no matter who she hurt in the process however , the brave writers and producer chickened out and thought it important to tack on a redemptive ending considering that this woman was so evil and conniving , her change of heart at the end was a major disappointment and strongly detracted from the film in many ways , this reminded me of the ending of jezebel as once again , a wicked person somehow sees the light and changes at the not too convincing conclusion my advice is to try watching red headed woman and downstairs red headed woman is much like baby face but features no magical transformation at the end the leading lady really is a skunk down deep ! in downstairs , a film very much like red headed woman , the roles are reversed and a man \( john gilbert \) plays a similar conniving character both are classics and a bit better than this film this film is an amazing curio of a brief period of often ultra sleazy hollywood films and in this light it 's well worth seeing for cinephiles also , fans of the duke take note john wayne plays a very small part in the film and it 's very unusual to see a very young wayne playing such a conventional role __label__1

In [30]:
# Prepares Test for Facebook FastText
test_positive_reviews = pickle.load(open("data/test_pos.p","rb"))
test_negative_reviews = pickle.load(open("data/test_neg.p","rb"))

# For each review append the label
test_pos_reviews_labeled = [x + ' __label__1' for x in test_positive_reviews]
test_neg_reviews_labeled = [x + ' __label__0' for x in test_negative_reviews]

fb_test_reviews = test_pos_reviews_labeled + test_neg_reviews_labeled

shuffle(fb_test_reviews)

with open('fastText/fb_test_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_test:
    output_fb_test.write('\n'.join(fb_test_reviews))

In [31]:
train_positive_reviews = get_reviews("data/aclImdb/train/pos", clean=False)
train_negative_reviews = get_reviews("data/aclImdb/train/neg", clean=False)
test_positive_reviews = get_reviews("data/aclImdb/test/pos", clean=False)
test_negative_reviews = get_reviews("data/aclImdb/test/neg", clean=False)


from random import shuffle
# For each review append the label
train_pos_reviews_labeled = [x + ' __label__1' for x in train_positive_reviews]
train_neg_reviews_labeled = [x + ' __label__0' for x in train_negative_reviews]

fb_reviews = train_pos_reviews_labeled + train_neg_reviews_labeled

shuffle(fb_reviews)

with open('fastText/fb_train_unclean_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_train:
    output_fb_train.write('\n'.join(fb_reviews))
    
#=============================

# For each review append the label
test_pos_reviews_labeled = [x + ' __label__1' for x in test_positive_reviews]
test_neg_reviews_labeled = [x + ' __label__0' for x in test_negative_reviews]

fb_test_reviews = test_pos_reviews_labeled + test_neg_reviews_labeled

shuffle(fb_test_reviews)

with open('fastText/fb_test_unclean_shuffled.txt', mode='wt', encoding='utf-8') as output_fb_test:
    output_fb_test.write('\n'.join(fb_test_reviews))
    
fb_test_reviews[0]


Out[31]:
"Such a shame that this wonderful bright spot on the small screen had such talent in writers and actors, such wonderful scenery that was the ultimate escapism for those in a land locked, sun deprived state. Many of the actors went on to bigger things...another indicator that there was something wonderful sadly lost. I lived in Columbus, Ohio at the time, with all of my (now ex) husband's very large family in a chorus of 'NO NO!!' every time it would be yet again taken over by a baseball game broadcast. <br /><br />Someone who wrote here mentioned it was up against 'Rosanne'...all my ex and I noticed was that it was always always always preempted by BASEBALL!!!! Yes, FOX really wasted something wonderful that one and nothing will ever equal it~! Thank you for the memories of it. __label__1"

In [13]:
# Obtain the labels
 # Open the files
# For Clean Shuffled
clean_shuffled = open('fastText/fb_test_shuffled.txt').readlines()
print(len(clean_shuffled))
clean_shuffled_labels = [str(rev.split(' ')[-1]).strip() for rev in clean_shuffled]

clean_preds = open('fastText/fb_shuffled_preds.txt').readlines()
clean_preds = [str(lab).strip() for lab in clean_preds]

count = 0
for i in range(len(clean_shuffled_labels)):
    if clean_shuffled_labels[i] == clean_preds[i]:
        count += 1

print("Accuracy:", float(count)/float(len(clean_shuffled_labels)))


25000
Accuracy: 0.87636

In [16]:
# For Unclean Shuffled

unclean_shuffled = open('fastText/fb_test_unclean_shuffled.txt').readlines()
print(len(unclean_shuffled))
unclean_shuffled_labels = [str(rev.split(' ')[-1]).strip() for rev in unclean_shuffled]

unclean_preds = open('fastText/fb_unclean_shuffled_preds.txt').readlines()
unclean_preds = [str(lab).strip() for lab in unclean_preds]

count = 0
for i in range(len(unclean_shuffled_labels)):
    if unclean_shuffled_labels[i] == unclean_preds[i]:
        count += 1

print("Accuracy:", float(count)/float(len(unclean_shuffled_labels)))


25000
Accuracy: 0.85984

BIGRAMS


In [34]:
x_train_reviews_oov = pickle.load(open("data/reviews_oov.p", "rb"))
# Set this to file
with open('data/reviews_oov.txt', mode='wt', encoding='utf-8') as output_reviews_oov:
    output_reviews_oov.write('\n'.join(x_train_reviews_oov))

In [42]:
rev_test = x_train_reviews_oov[0]

# Loads vocab
bi_vocabulary = load_vocabulary("data/vocab_oov_bigrams_no_counts/part-00000")

def find_bigrams(review, vocabulary):
    split_review = review.split(' ')
    zipped = zip(split_review, split_review[1:])
    bigrams = [x[0] + '_' + x[1] if x[0] + ' ' + x[1] in vocabulary else 'oov' for x in zipped]
    print(len(bigrams))
    return ' '.join(bigrams)

#[find_bigrams(rev, bi_vocabulary) for rev in x_train_reviews_oov]

find_bigrams(rev_test, bi_vocabulary)


267
Out[42]:
"oov_is is_a a_young oov oov of_oov oov oov oov oov oov oov_, ,_he he_is is_very oov oov oov oov oov oov out_of of_the the_blue oov ,_his his_father father_oov oov_that that_oov oov_will will_be oov oov him_to to_the the_oov oov_\\( \\(_oov oov_\\) \\)_to to_oov oov_something something_that that_oov oov_has has_no oov interest_in oov oov oov oov oov out_of of_oov oov_as as_a a_result result_, ,_from from_the the_start start_, ,_oov oov_is oov oov oov being_a oov oov oov man_, ,_his his_father father_is is_difficult difficult_to to_talk talk_to oov oov oov his_oov oov_both oov father_and and_son oov oov oov oov and_oov oov_and and_it it_'s 's_very oov oov when_the oov oov oov his_son oov that_he he_should should_not not_be be_so oov oov when_i i_read read_the oov oov ,_it oov oov about_how how_much much_the the_characters oov oov oov began_to to_know oov each_other oov however_, ,_i i_really really_do do_n't n't_think think_they they_did oov and_that that_is is_the oov oov oov oov aspect_of of_the the_film oov sure_, ,_there there_were oov oov oov oov ,_but but_so so_often oov there_was was_an an_oov oov_of of_oov oov_and and_oov oov_i i_actually oov liked_this this_and oov oov that_there there_was was_n't oov oov oov of_this this_as as_it it_would would_have oov oov oov overall_, ,_the the_film film_is is_well well_acted acted_and oov oov oov oov_an oov oov insight_into oov oov and_the the_oov oov_it it_also oov provides_a a_fascinating oov oov_of oov oov oov and_the the_oov oov oov oov while_the the_slow oov pace_and oov lack_of oov oov about_the the_relationship oov throughout_the the_film oov oov oov oov ,_i i_think think_it oov gave_the the_film oov oov oov and_made made_it it_look look_like like_a a_film film_about oov oov oov oov oov a_nice nice_and oov oov"

In [39]:
vp = learn.preprocessing.VocabularyProcessor(10)
list(vp.fit_transform(["a", "dog" , "ran" ,"in" ,"the", "park"]))


Out[39]:
[array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([5, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([6, 0, 0, 0, 0, 0, 0, 0, 0, 0])]

In [35]:
x = [[1,2,3], [4,5,6]]
y = [['a', 'b', 'c'], ['d','e','f']]

zipped = zip(x,y)
final_revs = [x[0]+x[1] for x in zipped]
final_revs


Out[35]:
[[1, 2, 3, 'a', 'b', 'c'], [4, 5, 6, 'd', 'e', 'f']]