In [1]:
    
#Imports and paths
from __future__ import print_function
import numpy as np
data_path='/home/ubuntu/data/training/text/sentiment/aclImdb/'
    
In [2]:
    
# Generator that recursively lists the files in a folder and its subfolders
import os
import shutil
import fnmatch
def gen_find(filepattern, toppath):
    '''
    Generator that recursively yields the files under toppath that match filepattern
    Inputs:
        filepattern(str): Shell-style wildcard pattern (e.g. "*.txt")
        toppath(str): Root path
    '''
    for path, dirlist, filelist in os.walk(toppath):
        for name in fnmatch.filter(filelist, filepattern):
            yield os.path.join(path, name)
#Test
#print(next(gen_find("*.txt", data_path+'train/pos/')))
    
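A quick optional check (not part of the original run): the generator can be consumed lazily, e.g. to peek at the first few matched paths under the standard aclImdb layout.

In [ ]:

# Optional sanity check: print the first few .txt paths yielded by gen_find
from itertools import islice
for p in islice(gen_find("*.txt", data_path + 'train/pos/'), 3):
    print(p)
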
In [3]:
    
def read_sentences(path):
    '''
    Read the first line of every .txt file under path and return a list of raw review strings
    '''
    sentences = []
    sentences_list = gen_find("*.txt", path)
    for ff in sentences_list:
        with open(ff, 'r') as f:
            sentences.append(f.readline().strip())
    return sentences
#Test
print(read_sentences(data_path+'train/pos/')[0:2])
    
    
In [4]:
    
print(read_sentences(data_path+'train/neg/')[0:2])
    
    
In [5]:
    
def tokenize(sentences):
    '''
    Tokenize a list of sentences with NLTK word_tokenize
    '''
    from nltk import word_tokenize
    print('Tokenizing...')
    tokens = []
    for sentence in sentences:
        tokens += [word_tokenize(sentence)]
    print('Done!')
    return tokens
print(tokenize(read_sentences(data_path+'train/pos/')[0:2]))
    
    
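nltk.word_tokenize relies on the 'punkt' tokenizer models; if they are not already installed, a one-off download along the lines of the sketch below is usually enough (this cell is not part of the original notebook).

In [ ]:

# One-off download of the tokenizer models used by word_tokenize
import nltk
nltk.download('punkt')
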
In [6]:
    
from sklearn.utils import shuffle
    
In [7]:
    
sentences_trn_pos = tokenize(read_sentences(data_path+'train/pos/'))
sentences_trn_neg = tokenize(read_sentences(data_path+'train/neg/'))
sentences_trn = sentences_trn_pos + sentences_trn_neg
y_train_trn = [1] * len(sentences_trn_pos) + [0] * len(sentences_trn_neg)
X_trn, y_trn = shuffle(sentences_trn, y_train_trn)
len(X_trn)
    
    
    Out[7]:
In [8]:
    
sentences_pos = tokenize(read_sentences(data_path+'test/pos/'))
sentences_neg = tokenize(read_sentences(data_path+'test/neg/'))
sentences_tst = sentences_pos + sentences_neg
y_test = [1] * len(sentences_pos) + [0] * len(sentences_neg)
X_tst, y_tst = shuffle(sentences_tst, y_test)
len(X_tst)
    
    
    Out[8]:
In [9]:
    
# Save tokenized sentences with numpy
np.save('/tmp/sentiment_X_trn.npy', X_trn)
np.save('/tmp/sentiment_X_tst.npy', X_tst)
np.save('/tmp/sentiment_y_trn.npy', y_trn)
np.save('/tmp/sentiment_y_tst.npy', y_tst)
    
In [10]:
    
# Load tokenized sentences with numpy
X_trn = np.load('/tmp/sentiment_X_trn.npy')
X_tst = np.load('/tmp/sentiment_X_tst.npy')
y_trn = np.load('/tmp/sentiment_y_trn.npy')
y_tst = np.load('/tmp/sentiment_y_tst.npy')
    
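Note that the tokenized sentences are ragged Python lists, so numpy stores them as object arrays; on newer numpy versions (>= 1.16.3) np.load refuses pickled object arrays by default, in which case a variant like the sketch below may be needed.

In [ ]:

# Variant for newer numpy: object arrays require allow_pickle=True on load
X_trn = np.load('/tmp/sentiment_X_trn.npy', allow_pickle=True).tolist()
X_tst = np.load('/tmp/sentiment_X_tst.npy', allow_pickle=True).tolist()
y_trn = np.load('/tmp/sentiment_y_trn.npy', allow_pickle=True).tolist()
y_tst = np.load('/tmp/sentiment_y_tst.npy', allow_pickle=True).tolist()
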
In [11]:
    
# Create the dictionary to convert words to indices, ordered with the most frequent words first
def build_dict(sentences):
    '''
    Build dictionary of train words
    Outputs: 
     - Dictionary of word --> word index
     - Dictionary of word --> word count freq
    '''
    print('Building dictionary...')
    wordcount = dict()
    # For each word in each sentence, accumulate its frequency
    for ss in sentences:
        for w in ss:
            if w not in wordcount:
                wordcount[w] = 1
            else:
                wordcount[w] += 1
    counts = list(wordcount.values()) # List of frequencies
    keys = list(wordcount) #List of words
    
    sorted_idx = reversed(np.argsort(counts))
    
    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+2  # reserve 0 (padding / removed words) and 1 (UNK)
    print(np.sum(counts), ' total words ', len(keys), ' unique words')
    return worddict, wordcount
worddict, wordcount = build_dict(sentences_trn)
print(worddict['the'], wordcount['the'])
    
    
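To see what the ordering in build_dict means in practice, a quick optional inspection like the sketch below lists the most frequent tokens; index 2 corresponds to the most frequent word.

In [ ]:

# Inspect the 10 most frequent tokens, their assigned indices and their counts
top10 = sorted(wordcount.items(), key=lambda kv: kv[1], reverse=True)[:10]
for word, freq in top10:
    print(worddict[word], word, freq)
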
In [12]:
    
# Convert tokenized sentences into sequences of word indices
def generate_sequence(sentences, dictionary):
    '''
    Convert tokenized text into sequences of integers.
    Words not present in the dictionary are mapped to 1 (UNK).
    '''
    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in ss]
    return seqs
    
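For debugging it can help to map sequences back to text. The sketch below builds a hypothetical inverse dictionary (reserving 0 for padding/removed words and 1 for UNK, as in this notebook); it is not part of the original flow.

In [ ]:

# Hypothetical helper: decode an integer sequence back into words
inv_worddict = {idx: w for w, idx in worddict.items()}
inv_worddict[0] = '<PAD>'
inv_worddict[1] = '<UNK>'
def decode_sequence(seq):
    return ' '.join(inv_worddict.get(i, '<UNK>') for i in seq)
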
In [13]:
    
# Create train and test data
#Read train sentences and generate target y
train_x_pos = generate_sequence(sentences_trn_pos, worddict)
train_x_neg = generate_sequence(sentences_trn_neg, worddict)
X_train_full = train_x_pos + train_x_neg
y_train_full = [1] * len(train_x_pos) + [0] * len(train_x_neg)
print(X_train_full[0], y_train_full[0])
    
    
In [14]:
    
#Read test sentences and generate target y
sentences_tst_pos = read_sentences(data_path+'test/pos/')
sentences_tst_neg = read_sentences(data_path+'test/neg/')
test_x_pos = generate_sequence(tokenize(sentences_tst_pos), worddict)
test_x_neg = generate_sequence(tokenize(sentences_tst_neg), worddict)
X_test_full = test_x_pos + test_x_neg
y_test_full = [1] * len(test_x_pos) + [0] * len(test_x_neg)
print(X_test_full[0])
print(y_test_full[0])
    
    
In [15]:
    
#Median length of sentences
print('Median length: ', np.median([len(x) for x in X_test_full]))
    
    
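The median alone can under-represent long reviews; an optional look at higher percentiles (sketch below) helps justify the maxlen chosen in the next cell.

In [ ]:

# Optional: sentence-length percentiles to guide the choice of maxlen
lengths = [len(x) for x in X_test_full]
print('75th / 90th / 95th percentiles:',
      np.percentile(lengths, 75), np.percentile(lengths, 90), np.percentile(lengths, 95))
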
In [16]:
    
max_features = 50000  # Number of most frequent words kept; less frequent words are recoded to 0
maxlen = 200  # Cut texts after this number of words (among the top max_features most common words)
    
In [17]:
    
# Keep only the max_features most frequent words; recode the others to 0
def remove_features(x):
    return [[0 if w >= max_features else w for w in sen] for sen in x]
X_train = remove_features(X_train_full)
X_test  = remove_features(X_test_full)
y_train = y_train_full
y_test = y_test_full
print(X_test[1])
    
    
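As a small sanity check (not in the original notebook), every remaining index should now be below max_features:

In [ ]:

# Sanity check: no index should reach max_features after recoding
print(max(max(sen) for sen in X_train if sen) < max_features)
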
In [18]:
    
from tensorflow.contrib.keras import preprocessing
# Truncate or pad the sentences to length = maxlen
print("Pad sequences (samples x time)")
X_train = preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print(X_test[0])
    
    
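pad_sequences pads and truncates at the beginning of each sequence by default ('pre'). The hedged sketch below shows a rough manual equivalent for a single sentence, assuming that default behaviour.

In [ ]:

# Rough manual equivalent of pad_sequences for one sentence (default 'pre' padding and truncating)
def pad_one(seq, maxlen, value=0):
    seq = seq[-maxlen:]                         # keep only the last maxlen tokens
    return [value] * (maxlen - len(seq)) + seq  # left-pad with zeros
print(pad_one(X_test_full[0][:5], 8))
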
In [19]:
    
# Shuffle data
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train, random_state=0)
    
In [20]:
    
# Export train and test data
np.save(data_path + 'X_train', X_train)
np.save(data_path + 'y_train', y_train)
np.save(data_path + 'X_test',  X_test)
np.save(data_path + 'y_test',  y_test)
    
In [21]:
    
# Export worddict
import pickle
with open(data_path + 'worddict.pickle', 'wb') as pfile:
    pickle.dump(worddict, pfile)
    
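For the downstream model, the exported artifacts can be loaded back along these lines (a sketch mirroring the save calls above):

In [ ]:

# Reload the exported dictionary and padded data (sketch)
with open(data_path + 'worddict.pickle', 'rb') as pfile:
    worddict = pickle.load(pfile)
X_train = np.load(data_path + 'X_train.npy')
y_train = np.load(data_path + 'y_train.npy')
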
In [ ]: