In [1]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


Using TensorFlow backend.

In [2]:
def read_20_newgroup_files(path_to_data_directory):
    texts = []
    labels_index = {}
    labels = []
    
    for name in sorted(os.listdir(path_to_data_directory)):
        path = os.path.join(path_to_data_directory,name)
        
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    if sys.version_info < (3,):
                        f = open(fpath)
                    else:
                        f = open(fpath, encoding='latin-1')
                    t = f.read()
                    i = t.find('\n\n') +2  # skip header
                                       
                    if i > 0:
                        t = t[i:]
                                   
                    texts.append(t)
                    f.close()
                    labels.append(label_id)                    
        
        
    return (texts,labels_index,labels)

path = "/home/felipe/data/20_newsgroup/20_newsgroup/"
        
texts,labels_index,labels = read_20_newgroup_files(path)

In [3]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000
VALIDATION_SPLIT = 0.2

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

data = pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))

indices = np.arange(data.shape[0])
np.random.shuffle(indices)

data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

X_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
X_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [4]:
GLOVE_DIR = "/media/felipe/SAMSUNG/GloVe"
EMBEDDING_DIM = 100
embeddings_index = {}

with open(os.path.join(GLOVE_DIR,"glove.6B.{0}d.txt".format(EMBEDDING_DIM)),'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')

        embeddings_index[word] = coefs

In [5]:
len(word_index)


Out[5]:
174074

In [6]:
embedding_matrix = np.zeros((len(word_index)+1,EMBEDDING_DIM))

for word,i in word_index.items():
    
    if i >= MAX_NB_WORDS:
        continue
    
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
embedding_layer = Embedding(len(word_index)+1,
                           EMBEDDING_DIM,
                           weights=[embedding_matrix],
                           input_length=MAX_SEQUENCE_LENGTH,
                           trainable = False)

In [8]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

In [9]:
model = Model(sequence_input, preds)
model.compile(loss ='categorical_crossentropy',
             optimizer='rmsprop',
             metrics=['acc'])

In [10]:
model.fit(X_train,y_train, validation_data=(X_val, y_val),
         epochs=20, batch_size=128)


Train on 15998 samples, validate on 3999 samples
Epoch 1/20
 4352/15998 [=======>......................] - ETA: 94s - loss: 2.8506 - acc: 0.0843
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-10-af0321680fa3> in <module>()
      1 model.fit(X_train,y_train, validation_data=(X_val, y_val),
----> 2          epochs=20, batch_size=128)

/home/felipe/tf-venv3/lib/python3.5/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
   1484                               val_f=val_f, val_ins=val_ins, shuffle=shuffle,
   1485                               callback_metrics=callback_metrics,
-> 1486                               initial_epoch=initial_epoch)
   1487 
   1488     def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):

/home/felipe/tf-venv3/lib/python3.5/site-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch)
   1139                 batch_logs['size'] = len(batch_ids)
   1140                 callbacks.on_batch_begin(batch_index, batch_logs)
-> 1141                 outs = f(ins_batch)
   1142                 if not isinstance(outs, list):
   1143                     outs = [outs]

/home/felipe/tf-venv3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2101         session = get_session()
   2102         updated = session.run(self.outputs + [self.updates_op],
-> 2103                               feed_dict=feed_dict)
   2104         return updated[:len(self.outputs)]
   2105 

/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    776     try:
    777       result = self._run(None, fetches, feed_dict, options_ptr,
--> 778                          run_metadata_ptr)
    779       if run_metadata:
    780         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    980     if final_fetches or final_targets:
    981       results = self._do_run(handle, final_targets, final_fetches,
--> 982                              feed_dict_string, options, run_metadata)
    983     else:
    984       results = []

/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1030     if handle is None:
   1031       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1032                            target_list, options, run_metadata)
   1033     else:
   1034       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1037   def _do_call(self, fn, *args):
   1038     try:
-> 1039       return fn(*args)
   1040     except errors.OpError as e:
   1041       message = compat.as_text(e.message)

/home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1019         return tf_session.TF_Run(session, options,
   1020                                  feed_dict, fetch_list, target_list,
-> 1021                                  status, run_metadata)
   1022 
   1023     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [ ]: