Issue tracker classification using RNN

Importing required packages into python


In [1]:
# Required dependencies
# 1. NLTK
# 2. Gensim for word2vec
# 3. Keras with tensorflow/theano backend


import numpy as np
np.random.seed(1337)
import json, re, nltk, string, csv, sys, codecs
from nltk.corpus import wordnet
from gensim.models import Word2Vec
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
from keras import layers
from keras.optimizers import RMSprop
from keras.utils import np_utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics.pairwise import cosine_similarity

from matplotlib import pyplot as plt


/home/eruwsil/anaconda3/envs/MLpy27/lib/python2.7/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

A hack to increase size due to Error: field larger than field limit (131072)


In [2]:
maxInt = sys.maxsize
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.

    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
        decrement = True

Initializing Hyper parameter


In [3]:
#1. Word2vec parameters
min_word_frequency_word2vec = 5
embed_size_word2vec = 200
context_window_word2vec = 5

#2. Classifier hyperparameters
numCV = 10
max_sentence_len = 50
min_sentence_length = 15
rankK = 10
batch_size = 32

Define the data files

CSV data format:

"General.Eriref";"General.Heading";"Answer.Answer";"Observation.Observation";"TR History Answer.Answered by user"

Array element[0] = "General.Eriref" (Contains only the bug number)

Array element[1] = "General.Heading" (Contains the heading)

Array element[2] = "Answer.Answer" (The unstructured natural text Answer section)--NOT USED IN THIS PROJECT

Array element[3] = "Observation.Observation" (Detailed description of the issue. Free text)

Array element[4] = "TR History Answer.Answered by user" (The developer who fixed and answered the issue ticket)


In [4]:
open_bugs_csv = 'e1_open.csv'
closed_bugs_csv = 'm15_closed.csv'

Preprocess the open bugs, extract the vocabulary and learn the word2vec representation


In [5]:
with open(open_bugs_csv) as data_file:
    data = csv.reader(data_file, delimiter=';')

    all_data = []
    for item in data:
        #1. Remove \r 
        current_title = unicode(item[1], errors='ignore').replace('\r', ' ')
        #print current_title
        current_desc = unicode(item[3], errors='ignore').replace('\r', ' ')
        #print current_desc
        #2. Remove URLs
        current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
        #3. Remove Stack Trace
        start_loc = current_desc.find("Stack trace:")
        current_desc = current_desc[:start_loc]    
        #4. Remove hex code
        current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
        current_title= re.sub(r'(\w+)0x\w+', '', current_title)    
        #5. Change to lower case
        current_desc = current_desc.lower()
        current_title = current_title.lower()    
        #6. Tokenize
        current_desc_tokens = nltk.word_tokenize(current_desc)
        current_title_tokens = nltk.word_tokenize(current_title)
        #7. Strip trailing punctuation marks    
        current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
        current_title_filter = [word.strip(string.punctuation) for word in current_title_tokens]      
        #8. Join the lists
        current_data = current_title_filter + current_desc_filter
        current_data = filter(None, current_data)
        all_data.append(current_data)
        
#print(len(all_data))
# Learn the word2vec model and extract vocabulary
# A vocabulary is constructed and the word2vec model is learnt using the preprocessed data. 
# The word2vec model provides a semantic word representation for every word in the vocabulary.
wordvec_model = Word2Vec(all_data, min_count=min_word_frequency_word2vec, size=embed_size_word2vec, window=context_window_word2vec)
vocabulary = wordvec_model.wv.vocab
#print vocabulary
vocab_size = len(vocabulary)

Preprocess the closed bugs, using the extracted the vocabulary

The closed bug trackers are used for training and testing the classifier.

(1) The closed bugs are loaded

(2) Data(Heading and Observarion) are preprocessed


In [6]:
with open(closed_bugs_csv) as data_file:
    data = csv.reader(data_file, delimiter=';')

    all_data = []
    all_owner = []    
    for item in data:
        #1. Remove \r 
        current_title = unicode(item[1], errors='ignore').replace('\r', ' ')
        current_desc = unicode(item[3], errors='ignore').replace('\r', ' ')
        #2. Remove URLs
        current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)
        #3. Remove Stack Trace
        start_loc = current_desc.find("Stack trace:")
        current_desc = current_desc[:start_loc]
        #4. Remove hex code
        current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
        current_title= re.sub(r'(\w+)0x\w+', '', current_title)
        #5. Change to lower case
        current_desc = current_desc.lower()
        current_title = current_title.lower()
        #6. Tokenize
        current_desc_tokens = nltk.word_tokenize(current_desc)
        current_title_tokens = nltk.word_tokenize(current_title)
        #7. Strip punctuation marks
        current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
        current_title_filter = [word.strip(string.punctuation) for word in current_title_tokens]       
        #8. Join the lists
        current_data = current_title_filter + current_desc_filter
        current_data = filter(None, current_data)
        all_data.append(current_data)
        all_owner.append(item[4])
#        print all_data

Split cross validation sets and perform deep learning + softamx based classification

The ten times chronological cross validation split is performed as follows:


In [7]:
totalLength = len(all_data)
splitLength = int(totalLength / (numCV + 1))

for i in range(1, numCV + 1):
    # Split cross validation set
    print("Starting work on cross validation set {0}".format(i))
    train_data = all_data[:i*splitLength-1]
    test_data = all_data[i*splitLength:(i+1)*splitLength-1]
    train_owner = all_owner[:i*splitLength-1]
    test_owner = all_owner[i*splitLength:(i+1)*splitLength-1]
    
    # Remove words outside the vocabulary
    updated_train_data = []    
    updated_train_data_length = []    
    updated_train_owner = []
    final_test_data = []
    final_test_owner = []
    for j, item in enumerate(train_data):
        current_train_filter = [word for word in item if word in vocabulary]
        if len(current_train_filter) >= min_sentence_length:  
          updated_train_data.append(current_train_filter)
          updated_train_owner.append(train_owner[j])  
          
    for j, item in enumerate(test_data):
        current_test_filter = [word for word in item if word in vocabulary]  
        if len(current_test_filter) >= min_sentence_length:
          final_test_data.append(current_test_filter)          
          final_test_owner.append(test_owner[j])          
    
    # Remove data from test set that is not there in train set
    train_owner_unique = set(updated_train_owner)
    test_owner_unique = set(final_test_owner)
    unwanted_owner = list(test_owner_unique - train_owner_unique)
    updated_test_data = []
    updated_test_owner = []
    updated_test_data_length = []
    for j in range(len(final_test_owner)):
        if final_test_owner[j] not in unwanted_owner:
            updated_test_data.append(final_test_data[j])
            updated_test_owner.append(final_test_owner[j])

    unique_train_label = list(set(updated_train_owner))
    classes = np.array(unique_train_label)
    
    # Create train and test data for deep learning + softmax
    X_train = np.empty(shape=[len(updated_train_data), max_sentence_len, embed_size_word2vec], dtype='float32')
    Y_train = np.empty(shape=[len(updated_train_owner), 1], dtype='int32')
    # 1 - start of sentence, # 2 - end of sentence, # 0 - zero padding. Hence, word indices start with 3 
    for j, curr_row in enumerate(updated_train_data):
        sequence_cnt = 0         
        for item in curr_row:
            if item in vocabulary:
                X_train[j, sequence_cnt, :] = wordvec_model[item] 
                sequence_cnt = sequence_cnt + 1                
                if sequence_cnt == max_sentence_len-1:
                          break                
        for k in range(sequence_cnt, max_sentence_len):
            X_train[j, k, :] = np.zeros((1, embed_size_word2vec))        
        Y_train[j, 0] = unique_train_label.index(updated_train_owner[j])
    
    X_test = np.empty(shape=[len(updated_test_data), max_sentence_len, embed_size_word2vec], dtype='float32')
    Y_test = np.empty(shape=[len(updated_test_owner),1], dtype='int32')
    # 1 - start of sentence, # 2 - end of sentence, # 0 - zero padding. Hence, word indices start with 3 
    for j, curr_row in enumerate(updated_test_data):
        sequence_cnt = 0          
        for item in curr_row:
            if item in vocabulary:
                X_test[j, sequence_cnt, :] = wordvec_model[item] 
                sequence_cnt = sequence_cnt + 1                
                if sequence_cnt == max_sentence_len-1:
                          break                
        for k in range(sequence_cnt, max_sentence_len):
            X_test[j, k, :] = np.zeros((1, embed_size_word2vec))        
        Y_test[j, 0] = unique_train_label.index(updated_test_owner[j])
        
    y_train = np_utils.to_categorical(Y_train, len(unique_train_label))
    y_test = np_utils.to_categorical(Y_test, len(unique_train_label))


    # TODO: Add x_train and x_test
    
    # Construct the deep learning model
    print("Creating Model")
    sequence = Input(shape=(max_sentence_len, embed_size_word2vec), dtype='float32')
    forwards_1 = LSTM(1024)(sequence)
    after_dp_forward_4 = Dropout(0.20)(forwards_1) 
    backwards_1 = LSTM(1024, go_backwards=True)(sequence)
    after_dp_backward_4 = Dropout(0.20)(backwards_1)         
    #merged = merge([after_dp_forward_4, after_dp_backward_4], mode='concat', concat_axis=-1)
    merged = layers.concatenate([after_dp_forward_4, after_dp_backward_4], axis=-1)
    after_dp = Dropout(0.5)(merged)
    output = Dense(len(unique_train_label), activation='softmax')(after_dp)                
    model = Model(input=sequence, output=output)            
    rms = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy', optimizer=rms, metrics=['accuracy'])    
    hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=20)  # Rename nb_epochs to epochs // Value original: 200
    
    predict = model.predict(X_test)        
    accuracy = []
    sortedIndices = []
    pred_classes = []
    if len(predict) == 0:
        exit(1)  # Avoid divide by zero
    for ll in predict:
          sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
    for k in range(1, rankK + 1):
          id = 0
          trueNum = 0
          for sortedInd in sortedIndices:
            pred_classes.append(classes[sortedInd[:k]])
            if y_test[id] in classes[sortedInd[:k]]:
                  trueNum += 1            
            id += 1
          accuracy.append((float(trueNum) / len(predict)) * 100)
    print("Test accuracy: ", accuracy)       
    
    train_result = hist.history        
    print(train_result)
    
    # Loss curves for validation and training
    test_loss = hist.history['loss']
    test_accuracy = hist.history['acc']
    #val_loss = hist.history['val_loss']

    epochs = range(len(test_loss))

    plt.figure()

    plt.plot(epochs, test_loss, 'bo', label='Training loss')
    plt.plot(epochs, test_accuracy, 'r', label='Training Accuracy')
    plt.title('Training loss and accuracy')
    plt.legend()

    plt.show()
    
    del model


Starting work on cross validation set 1
Creating Model
/home/eruwsil/anaconda3/envs/MLpy27/lib/python2.7/site-packages/ipykernel_launcher.py:53: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
/home/eruwsil/anaconda3/envs/MLpy27/lib/python2.7/site-packages/ipykernel_launcher.py:68: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
/home/eruwsil/anaconda3/envs/MLpy27/lib/python2.7/site-packages/ipykernel_launcher.py:93: UserWarning: Update your `Model` call to the Keras 2 API: `Model(outputs=Tensor("de..., inputs=Tensor("in...)`
Epoch 1/20
74/74 [==============================] - 10s 130ms/step - loss: 4.6367 - acc: 0.0541
Epoch 2/20
74/74 [==============================] - 7s 95ms/step - loss: 3.9130 - acc: 0.0676
Epoch 3/20
74/74 [==============================] - 8s 107ms/step - loss: 3.1084 - acc: 0.1622
Epoch 4/20
74/74 [==============================] - 8s 107ms/step - loss: 2.7026 - acc: 0.2432
Epoch 5/20
74/74 [==============================] - 8s 109ms/step - loss: 2.6111 - acc: 0.2297
Epoch 6/20
74/74 [==============================] - 8s 108ms/step - loss: 2.3558 - acc: 0.3378
Epoch 7/20
74/74 [==============================] - 7s 100ms/step - loss: 2.2117 - acc: 0.2838
Epoch 8/20
74/74 [==============================] - 8s 113ms/step - loss: 2.1818 - acc: 0.4324
Epoch 9/20
74/74 [==============================] - 9s 119ms/step - loss: 2.1694 - acc: 0.3919
Epoch 10/20
74/74 [==============================] - 9s 122ms/step - loss: 1.6200 - acc: 0.5541
Epoch 11/20
74/74 [==============================] - 9s 128ms/step - loss: 1.2449 - acc: 0.7027
Epoch 12/20
74/74 [==============================] - 8s 105ms/step - loss: 1.8379 - acc: 0.5135
Epoch 13/20
74/74 [==============================] - 8s 109ms/step - loss: 1.0384 - acc: 0.7568
Epoch 14/20
74/74 [==============================] - 9s 116ms/step - loss: 0.8680 - acc: 0.7838
Epoch 15/20
74/74 [==============================] - 8s 110ms/step - loss: 0.7570 - acc: 0.8243
Epoch 16/20
74/74 [==============================] - 9s 118ms/step - loss: 0.5378 - acc: 0.8919
Epoch 17/20
74/74 [==============================] - 8s 114ms/step - loss: 0.6430 - acc: 0.8649
Epoch 18/20
74/74 [==============================] - 9s 118ms/step - loss: 0.4758 - acc: 0.9054
Epoch 19/20
74/74 [==============================] - 8s 109ms/step - loss: 0.4647 - acc: 0.9054
Epoch 20/20
74/74 [==============================] - 8s 114ms/step - loss: 1.8677 - acc: 0.4865
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.05405405445678814, 0.06756756797030165, 0.1621621623635292, 0.24324324404871142, 0.22972973295160243, 0.33783783803920486, 0.28378378378378377, 0.4324324356543051, 0.3918918951137646, 0.5540540572759267, 0.7027027027027027, 0.5135135151244499, 0.7567567583676931, 0.7837837837837838, 0.8243243243243243, 0.8918918886700192, 0.8648648616429921, 0.9054054021835327, 0.9054054086272781, 0.4864864832646138], 'loss': [4.636739138010386, 3.912982566936596, 3.1084110285784745, 2.7025617715474723, 2.6111074782706596, 2.3557996169940845, 2.2116774546133504, 2.1817923816474707, 2.169395756077122, 1.6200437481338914, 1.2449047243272937, 1.8378826315338548, 1.038433261819788, 0.867987829285699, 0.7569656774804399, 0.5377844088786358, 0.6429572057079624, 0.47576710420685847, 0.46471532293268153, 1.8677118468928982]}
/home/eruwsil/anaconda3/envs/MLpy27/lib/python2.7/site-packages/ipykernel_launcher.py:111: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
Starting work on cross validation set 2
Creating Model
Epoch 1/20
149/149 [==============================] - 15s 101ms/step - loss: 4.4454 - acc: 0.0403
Epoch 2/20
149/149 [==============================] - 14s 94ms/step - loss: 4.4161 - acc: 0.0470
Epoch 3/20
149/149 [==============================] - 14s 96ms/step - loss: 3.4694 - acc: 0.1879
Epoch 4/20
149/149 [==============================] - 14s 95ms/step - loss: 3.2583 - acc: 0.1812
Epoch 5/20
149/149 [==============================] - 12s 80ms/step - loss: 2.9866 - acc: 0.2953
Epoch 6/20
149/149 [==============================] - 12s 81ms/step - loss: 2.7092 - acc: 0.3020
Epoch 7/20
149/149 [==============================] - 13s 85ms/step - loss: 2.4486 - acc: 0.4161
Epoch 8/20
149/149 [==============================] - 14s 94ms/step - loss: 2.1733 - acc: 0.4966
Epoch 9/20
149/149 [==============================] - 13s 87ms/step - loss: 2.0339 - acc: 0.4497
Epoch 10/20
149/149 [==============================] - 15s 100ms/step - loss: 1.5682 - acc: 0.6309
Epoch 11/20
149/149 [==============================] - 17s 111ms/step - loss: 1.2924 - acc: 0.6913
Epoch 12/20
149/149 [==============================] - 14s 91ms/step - loss: 1.1759 - acc: 0.7181
Epoch 13/20
149/149 [==============================] - 13s 88ms/step - loss: 1.0551 - acc: 0.7450
Epoch 14/20
149/149 [==============================] - 12s 82ms/step - loss: 0.5387 - acc: 0.9060
Epoch 15/20
149/149 [==============================] - 12s 81ms/step - loss: 0.4783 - acc: 0.9128
Epoch 16/20
149/149 [==============================] - 12s 80ms/step - loss: 0.4417 - acc: 0.9195
Epoch 17/20
149/149 [==============================] - 12s 81ms/step - loss: 0.3338 - acc: 0.9463
Epoch 18/20
149/149 [==============================] - 12s 81ms/step - loss: 0.3319 - acc: 0.9262
Epoch 19/20
149/149 [==============================] - 13s 89ms/step - loss: 0.2867 - acc: 0.9396
Epoch 20/20
149/149 [==============================] - 15s 98ms/step - loss: 0.1942 - acc: 0.9732
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.04026845662585841, 0.04697986577181208, 0.1879194648873886, 0.1812080545913453, 0.29530201432288894, 0.3020134230688114, 0.41610738265034336, 0.496644295502029, 0.4496644301302481, 0.6308724856216635, 0.6912751709854843, 0.7181208093694392, 0.744966444953176, 0.906040268856407, 0.9127516790524425, 0.9194630884484156, 0.9463087256322771, 0.9261744970443265, 0.9395973166363352, 0.9731543624161074], 'loss': [4.445408462678026, 4.416072411825193, 3.469441141858197, 3.258274449598069, 2.986626169025498, 2.709213986492797, 2.448623338801749, 2.173348289208124, 2.033871413877346, 1.568180053026084, 1.2924081247124897, 1.1758944732230783, 1.0551263181955222, 0.5386544690036134, 0.4783200821620506, 0.4417374666105181, 0.33379762984762257, 0.3319464035882246, 0.28666193613269986, 0.19416157511256685]}
Starting work on cross validation set 3
Creating Model
Epoch 1/20
225/225 [==============================] - 20s 89ms/step - loss: 4.5157 - acc: 0.0400
Epoch 2/20
225/225 [==============================] - 17s 77ms/step - loss: 4.7816 - acc: 0.0800
Epoch 3/20
225/225 [==============================] - 18s 79ms/step - loss: 3.9029 - acc: 0.1156
Epoch 4/20
225/225 [==============================] - 19s 85ms/step - loss: 3.7572 - acc: 0.1022
Epoch 5/20
225/225 [==============================] - 17s 75ms/step - loss: 3.5465 - acc: 0.1289
Epoch 6/20
225/225 [==============================] - 18s 78ms/step - loss: 3.2795 - acc: 0.1644
Epoch 7/20
225/225 [==============================] - 18s 81ms/step - loss: 3.1670 - acc: 0.2489
Epoch 8/20
225/225 [==============================] - 18s 79ms/step - loss: 2.9801 - acc: 0.2444
Epoch 9/20
225/225 [==============================] - 18s 81ms/step - loss: 3.1724 - acc: 0.2756
Epoch 10/20
225/225 [==============================] - 17s 77ms/step - loss: 2.4479 - acc: 0.3600
Epoch 11/20
225/225 [==============================] - 17s 77ms/step - loss: 2.4383 - acc: 0.3600
Epoch 12/20
225/225 [==============================] - 18s 81ms/step - loss: 2.1909 - acc: 0.4000
Epoch 13/20
225/225 [==============================] - 18s 81ms/step - loss: 1.7464 - acc: 0.5556
Epoch 14/20
225/225 [==============================] - 18s 82ms/step - loss: 1.6398 - acc: 0.5556
Epoch 15/20
225/225 [==============================] - 18s 79ms/step - loss: 1.4824 - acc: 0.6000
Epoch 16/20
225/225 [==============================] - 17s 75ms/step - loss: 0.9644 - acc: 0.7822
Epoch 17/20
225/225 [==============================] - 17s 75ms/step - loss: 1.1976 - acc: 0.7422
Epoch 18/20
225/225 [==============================] - 17s 74ms/step - loss: 0.7265 - acc: 0.8667
Epoch 19/20
225/225 [==============================] - 17s 77ms/step - loss: 1.5711 - acc: 0.6267
Epoch 20/20
225/225 [==============================] - 19s 82ms/step - loss: 1.1158 - acc: 0.7467
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.04, 0.08, 0.11555555555555555, 0.10222222222222223, 0.1288888888888889, 0.16444444444444445, 0.24888888888888888, 0.24444444444444444, 0.27555555555555555, 0.36, 0.36, 0.4, 0.5555555555555556, 0.5555555555555556, 0.6, 0.7822222222222223, 0.7422222222222222, 0.8666666666666667, 0.6266666666666667, 0.7466666666666667], 'loss': [4.515739873250325, 4.781630143059624, 3.902880312601725, 3.757197222179837, 3.546457291709052, 3.2794740316602917, 3.166976580089993, 2.98006195280287, 3.1724020279778373, 2.4479230562845866, 2.4382579782274036, 2.190870146221585, 1.7464492561750942, 1.6397590976291232, 1.4823918232652875, 0.9643806558185154, 1.1975877634021972, 0.7265213033888075, 1.5710536193847657, 1.1158309915330675]}
Starting work on cross validation set 4
Creating Model
Epoch 1/20
301/301 [==============================] - 24s 80ms/step - loss: 4.8524 - acc: 0.0365
Epoch 2/20
301/301 [==============================] - 20s 68ms/step - loss: 4.1264 - acc: 0.0864
Epoch 3/20
301/301 [==============================] - 21s 70ms/step - loss: 3.8839 - acc: 0.0731
Epoch 4/20
301/301 [==============================] - 20s 67ms/step - loss: 3.5926 - acc: 0.1395
Epoch 5/20
301/301 [==============================] - 22s 72ms/step - loss: 3.5098 - acc: 0.2027
Epoch 6/20
301/301 [==============================] - 22s 75ms/step - loss: 3.1414 - acc: 0.2425
Epoch 7/20
301/301 [==============================] - 21s 70ms/step - loss: 2.8183 - acc: 0.2890
Epoch 8/20
301/301 [==============================] - 20s 68ms/step - loss: 2.3671 - acc: 0.4053
Epoch 9/20
301/301 [==============================] - 24s 80ms/step - loss: 2.1544 - acc: 0.4518
Epoch 10/20
301/301 [==============================] - 24s 79ms/step - loss: 1.7372 - acc: 0.5781
Epoch 11/20
301/301 [==============================] - 30s 98ms/step - loss: 1.2966 - acc: 0.6844
Epoch 12/20
301/301 [==============================] - 30s 98ms/step - loss: 1.0202 - acc: 0.7475
Epoch 13/20
301/301 [==============================] - 24s 81ms/step - loss: 0.8047 - acc: 0.8239
Epoch 14/20
301/301 [==============================] - 24s 81ms/step - loss: 0.5793 - acc: 0.9136
Epoch 15/20
301/301 [==============================] - 24s 80ms/step - loss: 0.3861 - acc: 0.9402
Epoch 16/20
301/301 [==============================] - 25s 83ms/step - loss: 0.4014 - acc: 0.9236
Epoch 17/20
301/301 [==============================] - 22s 73ms/step - loss: 0.2202 - acc: 0.9568
Epoch 18/20
301/301 [==============================] - 21s 69ms/step - loss: 0.2750 - acc: 0.9369
Epoch 19/20
301/301 [==============================] - 20s 68ms/step - loss: 0.1859 - acc: 0.9668
Epoch 20/20
301/301 [==============================] - 21s 69ms/step - loss: 0.2293 - acc: 0.9568
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.036544850498338874, 0.08637873803658343, 0.07308970124420533, 0.13953488396845784, 0.2026578073584756, 0.2425249174385768, 0.28903654490000386, 0.40531561580607267, 0.4518272428219501, 0.5780730901960519, 0.684385383049911, 0.7475083066379509, 0.8239202663747971, 0.913621262854516, 0.9401993357461949, 0.9235880398671097, 0.9568106312292359, 0.9368770764119602, 0.9667774086378738, 0.956810631427258], 'loss': [4.852414856717436, 4.12639047141091, 3.8839353018028793, 3.5926467160449866, 3.5098322594284617, 3.1413970516369587, 2.818260215049566, 2.3670503617917182, 2.154351302555629, 1.7372174484785212, 1.2966221349183904, 1.020157518180898, 0.8047217621755759, 0.5792940518388716, 0.3861074953083184, 0.40143467391844206, 0.22021449468658613, 0.2749699096495527, 0.18585543324582995, 0.22929074914748487]}
Starting work on cross validation set 5
Creating Model
Epoch 1/20
376/376 [==============================] - 30s 80ms/step - loss: 4.7502 - acc: 0.0372
Epoch 2/20
376/376 [==============================] - 26s 69ms/step - loss: 4.2348 - acc: 0.0612
Epoch 3/20
376/376 [==============================] - 27s 71ms/step - loss: 4.0079 - acc: 0.0824
Epoch 4/20
376/376 [==============================] - 26s 68ms/step - loss: 3.7133 - acc: 0.1117
Epoch 5/20
376/376 [==============================] - 28s 73ms/step - loss: 3.4057 - acc: 0.1809
Epoch 6/20
376/376 [==============================] - 27s 72ms/step - loss: 3.1145 - acc: 0.2101
Epoch 7/20
376/376 [==============================] - 26s 68ms/step - loss: 2.8491 - acc: 0.3165
Epoch 8/20
376/376 [==============================] - 27s 71ms/step - loss: 2.4942 - acc: 0.3697
Epoch 9/20
376/376 [==============================] - 27s 72ms/step - loss: 2.0101 - acc: 0.4867
Epoch 10/20
376/376 [==============================] - 26s 70ms/step - loss: 1.6160 - acc: 0.6250
Epoch 11/20
376/376 [==============================] - 24s 65ms/step - loss: 1.3228 - acc: 0.6941
Epoch 12/20
376/376 [==============================] - 24s 63ms/step - loss: 0.9247 - acc: 0.8351
Epoch 13/20
376/376 [==============================] - 24s 65ms/step - loss: 0.6426 - acc: 0.8670
Epoch 14/20
376/376 [==============================] - 28s 75ms/step - loss: 0.4437 - acc: 0.9309
Epoch 15/20
376/376 [==============================] - 29s 76ms/step - loss: 0.3642 - acc: 0.9441
Epoch 16/20
376/376 [==============================] - 29s 78ms/step - loss: 0.3248 - acc: 0.9362
Epoch 17/20
376/376 [==============================] - 28s 75ms/step - loss: 0.2544 - acc: 0.9495
Epoch 18/20
376/376 [==============================] - 28s 74ms/step - loss: 0.1984 - acc: 0.9628
Epoch 19/20
376/376 [==============================] - 28s 76ms/step - loss: 0.2735 - acc: 0.9388
Epoch 20/20
376/376 [==============================] - 31s 82ms/step - loss: 0.2378 - acc: 0.9574
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.03723404255319149, 0.06117021292448044, 0.08244680858989979, 0.11170212765957446, 0.18085106351274124, 0.2101063831372464, 0.31648936138508166, 0.36968085169792175, 0.4867021270254825, 0.6250000012681839, 0.6941489374383967, 0.8351063842469073, 0.8670212778639286, 0.9308510650979712, 0.9441489374383967, 0.9361702114977735, 0.949468083838199, 0.9627659561786246, 0.9388297872340425, 0.9574468072424543], 'loss': [4.750196132254093, 4.2347583922934025, 4.007927965610586, 3.7133123367390732, 3.405741980735292, 3.114452159151118, 2.8491382751059025, 2.4941955170732864, 2.0101340070683906, 1.6160036629818855, 1.3228023331216041, 0.9246597924131028, 0.6425992719670559, 0.44368606171709424, 0.36423538783763315, 0.3247838901712539, 0.25440096221071606, 0.19839995877539857, 0.2734588150014269, 0.2377648562827009]}
Starting work on cross validation set 6
Creating Model
Epoch 1/20
452/452 [==============================] - 32s 71ms/step - loss: 4.7349 - acc: 0.0310
Epoch 2/20
452/452 [==============================] - 37s 82ms/step - loss: 4.1802 - acc: 0.1018
Epoch 3/20
452/452 [==============================] - 38s 84ms/step - loss: 3.9415 - acc: 0.1018
Epoch 4/20
452/452 [==============================] - 33s 74ms/step - loss: 3.7645 - acc: 0.1261
Epoch 5/20
452/452 [==============================] - 33s 73ms/step - loss: 3.4011 - acc: 0.1770
Epoch 6/20
452/452 [==============================] - 36s 79ms/step - loss: 3.1756 - acc: 0.2301
Epoch 7/20
452/452 [==============================] - 34s 75ms/step - loss: 2.9732 - acc: 0.2810
Epoch 8/20
452/452 [==============================] - 33s 73ms/step - loss: 2.5793 - acc: 0.3385
Epoch 9/20
452/452 [==============================] - 33s 74ms/step - loss: 2.2497 - acc: 0.4159
Epoch 10/20
452/452 [==============================] - 32s 70ms/step - loss: 1.8129 - acc: 0.5664
Epoch 11/20
452/452 [==============================] - 31s 69ms/step - loss: 1.5457 - acc: 0.6195
Epoch 12/20
452/452 [==============================] - 30s 67ms/step - loss: 1.0610 - acc: 0.7389
Epoch 13/20
452/452 [==============================] - 34s 75ms/step - loss: 0.8189 - acc: 0.8385
Epoch 14/20
452/452 [==============================] - 34s 75ms/step - loss: 0.5899 - acc: 0.8916
Epoch 15/20
452/452 [==============================] - 33s 74ms/step - loss: 0.4564 - acc: 0.9093
Epoch 16/20
452/452 [==============================] - 31s 68ms/step - loss: 0.4534 - acc: 0.9204
Epoch 17/20
452/452 [==============================] - 33s 73ms/step - loss: 0.2178 - acc: 0.9624
Epoch 18/20
452/452 [==============================] - 31s 69ms/step - loss: 0.2540 - acc: 0.9558
Epoch 19/20
452/452 [==============================] - 31s 68ms/step - loss: 0.2355 - acc: 0.9447
Epoch 20/20
452/452 [==============================] - 31s 69ms/step - loss: 0.3798 - acc: 0.9137
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.030973451327433628, 0.10176991150442478, 0.10176991150442478, 0.1261061946902655, 0.17699115044247787, 0.23008849557522124, 0.2809734513274336, 0.33849557522123896, 0.415929203539823, 0.5663716814159292, 0.6194690265486725, 0.7389380530973452, 0.838495575221239, 0.8915929203539823, 0.9092920353982301, 0.9203539823008849, 0.9623893805309734, 0.9557522123893806, 0.9446902654867256, 0.9137168141592921], 'loss': [4.734865551501249, 4.180207936109695, 3.9415339706218346, 3.764456010497777, 3.4010907401025823, 3.175603206178783, 2.9731988527078546, 2.5792592331371478, 2.249683000345146, 1.8128639027080704, 1.5456597942166623, 1.060966565281944, 0.8189177223011456, 0.5898554177410835, 0.45644559374952737, 0.4534411278709901, 0.21776356221695917, 0.254036185356368, 0.23553280387304526, 0.3798139284208285]}
Starting work on cross validation set 7
Creating Model
Epoch 1/20
528/528 [==============================] - 38s 72ms/step - loss: 4.6296 - acc: 0.0511
Epoch 2/20
528/528 [==============================] - 34s 64ms/step - loss: 4.1529 - acc: 0.0833
Epoch 3/20
528/528 [==============================] - 35s 66ms/step - loss: 3.9034 - acc: 0.1004
Epoch 4/20
528/528 [==============================] - 40s 75ms/step - loss: 3.6703 - acc: 0.1420
Epoch 5/20
528/528 [==============================] - 36s 67ms/step - loss: 3.4374 - acc: 0.1780
Epoch 6/20
528/528 [==============================] - 36s 69ms/step - loss: 3.1857 - acc: 0.2216
Epoch 7/20
528/528 [==============================] - 36s 69ms/step - loss: 2.8405 - acc: 0.2898
Epoch 8/20
528/528 [==============================] - 33s 63ms/step - loss: 2.5054 - acc: 0.3390
Epoch 9/20
528/528 [==============================] - 33s 62ms/step - loss: 2.0530 - acc: 0.4773
Epoch 10/20
528/528 [==============================] - 35s 66ms/step - loss: 1.6944 - acc: 0.5833
Epoch 11/20
528/528 [==============================] - 36s 68ms/step - loss: 1.3048 - acc: 0.6894
Epoch 12/20
528/528 [==============================] - 33s 63ms/step - loss: 0.8746 - acc: 0.8239
Epoch 13/20
528/528 [==============================] - 33s 63ms/step - loss: 0.6294 - acc: 0.8883
Epoch 14/20
528/528 [==============================] - 36s 69ms/step - loss: 0.4399 - acc: 0.9223
Epoch 15/20
528/528 [==============================] - 34s 65ms/step - loss: 0.3303 - acc: 0.9318
Epoch 16/20
528/528 [==============================] - 33s 62ms/step - loss: 0.2761 - acc: 0.9394
Epoch 17/20
528/528 [==============================] - 36s 68ms/step - loss: 0.2336 - acc: 0.9659
Epoch 18/20
528/528 [==============================] - 39s 74ms/step - loss: 0.1942 - acc: 0.9678
Epoch 19/20
528/528 [==============================] - 36s 68ms/step - loss: 0.1349 - acc: 0.9773
Epoch 20/20
528/528 [==============================] - 38s 72ms/step - loss: 0.1727 - acc: 0.9697
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.05113636363636364, 0.08333333333333333, 0.10037878787878787, 0.14204545454545456, 0.17803030303030304, 0.2215909090909091, 0.2897727272727273, 0.3390151515151515, 0.4772727272727273, 0.5833333333333334, 0.6893939393939394, 0.8238636363636364, 0.8882575757575758, 0.9223484848484849, 0.9318181818181818, 0.9393939393939394, 0.9659090909090909, 0.9678030303030303, 0.9772727272727273, 0.9696969696969697], 'loss': [4.629555731108694, 4.1528933987473, 3.9033913612365723, 3.6703255393288354, 3.437427766395338, 3.1856574578718706, 2.840506105711966, 2.505383693810665, 2.052970720059944, 1.694417209336252, 1.3048207254120798, 0.87464511755741, 0.6293881571654117, 0.4398622955336715, 0.3303227018226277, 0.2761244082992727, 0.23363817009058865, 0.19419398587761502, 0.13493488345182303, 0.17268287842020844]}
Starting work on cross validation set 8
Creating Model
Epoch 1/20
603/603 [==============================] - 43s 71ms/step - loss: 4.6045 - acc: 0.0448
Epoch 2/20
603/603 [==============================] - 40s 66ms/step - loss: 4.1237 - acc: 0.0697
Epoch 3/20
603/603 [==============================] - 42s 70ms/step - loss: 3.9144 - acc: 0.0879
Epoch 4/20
603/603 [==============================] - 42s 69ms/step - loss: 3.7059 - acc: 0.1343
Epoch 5/20
603/603 [==============================] - 41s 68ms/step - loss: 3.4883 - acc: 0.1675
Epoch 6/20
603/603 [==============================] - 43s 71ms/step - loss: 3.2081 - acc: 0.2338
Epoch 7/20
603/603 [==============================] - 43s 71ms/step - loss: 2.8525 - acc: 0.2819
Epoch 8/20
603/603 [==============================] - 41s 69ms/step - loss: 2.5183 - acc: 0.3665
Epoch 9/20
603/603 [==============================] - 45s 74ms/step - loss: 2.0650 - acc: 0.4643
Epoch 10/20
603/603 [==============================] - 42s 70ms/step - loss: 1.6286 - acc: 0.5871
Epoch 11/20
603/603 [==============================] - 42s 69ms/step - loss: 1.2159 - acc: 0.7297
Epoch 12/20
603/603 [==============================] - 44s 73ms/step - loss: 0.8327 - acc: 0.8375
Epoch 13/20
603/603 [==============================] - 38s 63ms/step - loss: 0.5930 - acc: 0.9038
Epoch 14/20
603/603 [==============================] - 40s 66ms/step - loss: 0.4400 - acc: 0.9071
Epoch 15/20
603/603 [==============================] - 45s 75ms/step - loss: 0.3864 - acc: 0.9221
Epoch 16/20
603/603 [==============================] - 40s 67ms/step - loss: 0.2167 - acc: 0.9619
Epoch 17/20
603/603 [==============================] - 41s 68ms/step - loss: 0.2516 - acc: 0.9420
Epoch 18/20
603/603 [==============================] - 43s 71ms/step - loss: 0.2185 - acc: 0.9619
Epoch 19/20
603/603 [==============================] - 41s 68ms/step - loss: 0.1302 - acc: 0.9768
Epoch 20/20
603/603 [==============================] - 42s 70ms/step - loss: 0.1733 - acc: 0.9685
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.044776119440052636, 0.06965174131824405, 0.08789386405033456, 0.13432835828309037, 0.16749585379118942, 0.2338308458205677, 0.28192371451241854, 0.36650082968163056, 0.4643449411661075, 0.5870646771111496, 0.7296849086905396, 0.8374792709081722, 0.9038142613312894, 0.9071310122017046, 0.9220563860279608, 0.9618573790758996, 0.9419568832438581, 0.9618573807562959, 0.9767827538906243, 0.9684908799271086], 'loss': [4.604499075938615, 4.123690900122546, 3.91436969897838, 3.705934292048364, 3.488275130589803, 3.2081445813376708, 2.852453622849624, 2.5183466062023867, 2.065009862233948, 1.6285543817390455, 1.2159198797361965, 0.8326763786486725, 0.5930271063673358, 0.4400432771118126, 0.38636066877031405, 0.21673515275936223, 0.2516340226379793, 0.21846123531485473, 0.1301788781610492, 0.1732571558226796]}
Starting work on cross validation set 9
Creating Model
Epoch 1/20
679/679 [==============================] - 52s 76ms/step - loss: 4.5677 - acc: 0.0353
Epoch 2/20
679/679 [==============================] - 44s 65ms/step - loss: 4.0835 - acc: 0.0751
Epoch 3/20
679/679 [==============================] - 49s 72ms/step - loss: 3.8820 - acc: 0.1119
Epoch 4/20
679/679 [==============================] - 47s 69ms/step - loss: 3.6481 - acc: 0.1517
Epoch 5/20
679/679 [==============================] - 48s 70ms/step - loss: 3.4386 - acc: 0.1767
Epoch 6/20
679/679 [==============================] - 48s 71ms/step - loss: 3.1435 - acc: 0.2209
Epoch 7/20
679/679 [==============================] - 46s 68ms/step - loss: 2.8589 - acc: 0.2784
Epoch 8/20
679/679 [==============================] - 51s 75ms/step - loss: 2.4877 - acc: 0.3638
Epoch 9/20
679/679 [==============================] - 46s 68ms/step - loss: 2.1346 - acc: 0.4639
Epoch 10/20
679/679 [==============================] - 47s 70ms/step - loss: 1.7418 - acc: 0.5493
Epoch 11/20
679/679 [==============================] - 47s 69ms/step - loss: 1.3584 - acc: 0.6745
Epoch 12/20
679/679 [==============================] - 43s 63ms/step - loss: 0.9600 - acc: 0.7732
Epoch 13/20
679/679 [==============================] - 48s 71ms/step - loss: 0.7250 - acc: 0.8365
Epoch 14/20
679/679 [==============================] - 49s 72ms/step - loss: 0.4958 - acc: 0.9072
Epoch 15/20
679/679 [==============================] - 61s 90ms/step - loss: 0.4065 - acc: 0.9205
Epoch 16/20
679/679 [==============================] - 48s 70ms/step - loss: 0.2716 - acc: 0.9543
Epoch 17/20
679/679 [==============================] - 49s 72ms/step - loss: 0.2962 - acc: 0.9558
Epoch 18/20
679/679 [==============================] - 51s 74ms/step - loss: 0.1789 - acc: 0.9676
Epoch 19/20
679/679 [==============================] - 45s 66ms/step - loss: 0.1927 - acc: 0.9661
Epoch 20/20
679/679 [==============================] - 50s 74ms/step - loss: 0.1045 - acc: 0.9867
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.035346097201767304, 0.07511045655375552, 0.11192930787143371, 0.15169366722342192, 0.17673048607467376, 0.2209131075768829, 0.27835051550780904, 0.363770250499863, 0.4639175258170874, 0.5493372608091414, 0.674521355197075, 0.7731958763764428, 0.8365243007051751, 0.9072164949331438, 0.9204712813838065, 0.9543446245355002, 0.9558173784977909, 0.96759941089838, 0.9661266569360893, 0.9867452135493373], 'loss': [4.567715251568666, 4.083519287418371, 3.8819729073936, 3.648139042425928, 3.438588172943497, 3.1434893084906688, 2.8588713918413435, 2.48766548749449, 2.134560451942972, 1.7417725835527693, 1.3583678848086822, 0.9600451338449177, 0.7249828846535381, 0.495802026521826, 0.40651958320558684, 0.27161019118790775, 0.2962282588362518, 0.17885387843592704, 0.19266988216338346, 0.1044931950511216]}
Starting work on cross validation set 10
Creating Model
Epoch 1/20
755/755 [==============================] - 49s 65ms/step - loss: 4.5609 - acc: 0.0503
Epoch 2/20
755/755 [==============================] - 52s 68ms/step - loss: 4.0584 - acc: 0.0861
Epoch 3/20
755/755 [==============================] - 53s 71ms/step - loss: 3.8712 - acc: 0.1046
Epoch 4/20
755/755 [==============================] - 51s 68ms/step - loss: 3.6618 - acc: 0.1391
Epoch 5/20
755/755 [==============================] - 57s 76ms/step - loss: 3.4205 - acc: 0.1642
Epoch 6/20
755/755 [==============================] - 57s 76ms/step - loss: 3.0951 - acc: 0.2305
Epoch 7/20
755/755 [==============================] - 54s 72ms/step - loss: 2.7594 - acc: 0.2967
Epoch 8/20
755/755 [==============================] - 51s 68ms/step - loss: 2.4296 - acc: 0.3748
Epoch 9/20
755/755 [==============================] - 52s 69ms/step - loss: 1.9622 - acc: 0.4940
Epoch 10/20
755/755 [==============================] - 49s 65ms/step - loss: 1.5892 - acc: 0.6066
Epoch 11/20
755/755 [==============================] - 50s 66ms/step - loss: 1.1769 - acc: 0.7311
Epoch 12/20
755/755 [==============================] - 51s 68ms/step - loss: 0.8298 - acc: 0.8318
Epoch 13/20
755/755 [==============================] - 50s 66ms/step - loss: 0.5730 - acc: 0.8834
Epoch 14/20
755/755 [==============================] - 53s 70ms/step - loss: 0.4527 - acc: 0.9152
Epoch 15/20
755/755 [==============================] - 49s 65ms/step - loss: 0.3366 - acc: 0.9417
Epoch 16/20
755/755 [==============================] - 53s 70ms/step - loss: 0.2220 - acc: 0.9642
Epoch 17/20
755/755 [==============================] - 61s 81ms/step - loss: 0.2519 - acc: 0.9523
Epoch 18/20
755/755 [==============================] - 61s 81ms/step - loss: 0.1940 - acc: 0.9682
Epoch 19/20
755/755 [==============================] - 51s 68ms/step - loss: 0.1261 - acc: 0.9801
Epoch 20/20
755/755 [==============================] - 53s 70ms/step - loss: 0.1733 - acc: 0.9669
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.050331125837682886, 0.08609271507389499, 0.10463576159927229, 0.13907284791895885, 0.16423841063549977, 0.2304635760010473, 0.2966887416034345, 0.3748344371650393, 0.49403973537565066, 0.6066225170299707, 0.7311258281303558, 0.8317880792333591, 0.8834437093197904, 0.9152317887899891, 0.941721853909903, 0.9642384102012937, 0.9523178813473279, 0.9682119205298013, 0.9801324503311258, 0.9668874172185431], 'loss': [4.560920313020415, 4.058387916767044, 3.871235451793039, 3.661826286568547, 3.42054346191962, 3.095070534194542, 2.759375188840146, 2.429589053652934, 1.9622009552077742, 1.5892039272169403, 1.1768863321140113, 0.8298144397356653, 0.5730399114406661, 0.4526760964204144, 0.33663744942241947, 0.2220429059685461, 0.2519148943045281, 0.1939519659474196, 0.12609291273039697, 0.17329386112508394]}

Split cross validation sets and perform baseline classifiers


In [9]:
totalLength = len(all_data)
splitLength = totalLength / (numCV + 1)

for i in range(1, numCV+1):
    # Split cross validation set
    print("Starting cross validation {0}".format(i))
    train_data = all_data[:i*splitLength-1]
    test_data = all_data[i*splitLength:(i+1)*splitLength-1]
    train_owner = all_owner[:i*splitLength-1]
    test_owner = all_owner[i*splitLength:(i+1)*splitLength-1]
    
    # Remove words outside the vocabulary
    updated_train_data = []    
    updated_train_data_length = []    
    updated_train_owner = []
    final_test_data = []
    final_test_owner = []
    for j, item in enumerate(train_data):
        current_train_filter = [word for word in item if word in vocabulary]
        if len(current_train_filter)>=min_sentence_length:  
          updated_train_data.append(current_train_filter)
          updated_train_owner.append(train_owner[j])  
          
    for j, item in enumerate(test_data):
        current_test_filter = [word for word in item if word in vocabulary]  
        if len(current_test_filter)>=min_sentence_length:
          final_test_data.append(current_test_filter)          
          final_test_owner.append(test_owner[j])          
    
    # Remove data from test set that is not there in train set
    train_owner_unique = set(updated_train_owner)
    test_owner_unique = set(final_test_owner)
    unwanted_owner = list(test_owner_unique - train_owner_unique)
    updated_test_data = []
    updated_test_owner = []
    updated_test_data_length = []
    for j in range(len(final_test_owner)):
        if final_test_owner[j] not in unwanted_owner:
            updated_test_data.append(final_test_data[j])
            updated_test_owner.append(final_test_owner[j])  
    
    train_data = []
    for item in updated_train_data:
          train_data.append(' '.join(item))
         
    test_data = []
    for item in updated_test_data:
          test_data.append(' '.join(item))
    
    vocab_data = []
    for item in vocabulary:
          vocab_data.append(item)
    
    # Extract tf based bag of words representation
    tfidf_transformer = TfidfTransformer(use_idf=False)
    count_vect = CountVectorizer(min_df=1, vocabulary= vocab_data,dtype=np.int32)
    
    train_counts = count_vect.fit_transform(train_data)       
    train_feats = tfidf_transformer.fit_transform(train_counts)
    print(train_feats.shape)
    
    test_counts = count_vect.transform(test_data)
    test_feats = tfidf_transformer.transform(test_counts)
    print(test_feats.shape)
    print("=" * 20)
    
    
    
    # perform classifification
    for classifier in range(1,5):
        #classifier = 3 # 1 - Niave Bayes, 2 - Softmax, 3 - cosine distance, 4 - SVM
        print classifier 
        if classifier == 1:            
            classifierModel = MultinomialNB(alpha=0.01)        
            classifierModel = OneVsRestClassifier(classifierModel).fit(train_feats, updated_train_owner)
            predict = classifierModel.predict_proba(test_feats)  
            classes = classifierModel.classes_  
            
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy                                    
        elif classifier == 2:            
            classifierModel = LogisticRegression(solver='lbfgs', penalty='l2', tol=0.01)
            classifierModel = OneVsRestClassifier(classifierModel).fit(train_feats, updated_train_owner)
            predict = classifierModel.predict(test_feats)
            classes = classifierModel.classes_ 
            
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy                                   
        elif classifier == 3:            
            predict = cosine_similarity(test_feats, train_feats)
            classes = np.array(updated_train_owner)
            classifierModel = []
            
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy                        
        elif classifier == 4:            
            classifierModel = svm.SVC(probability=True, verbose=False, decision_function_shape='ovr', random_state=42)
            classifierModel.fit(train_feats, updated_train_owner)
            predict = classifierModel.predict(test_feats)
            classes = classifierModel.classes_ 
        
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy


Starting cross validation 1
(74, 7924)
(31, 7924)
====================
1
[22.58064516129032, 29.03225806451613, 32.25806451612903, 35.483870967741936, 38.70967741935484, 45.16129032258064, 45.16129032258064, 45.16129032258064, 45.16129032258064, 48.38709677419355]
2
[9.67741935483871, 9.67741935483871, 9.67741935483871, 9.67741935483871, 9.67741935483871, 16.129032258064516, 16.129032258064516, 16.129032258064516, 16.129032258064516, 16.129032258064516]
3
[3.225806451612903, 16.129032258064516, 25.806451612903224, 29.03225806451613, 29.03225806451613, 32.25806451612903, 35.483870967741936, 38.70967741935484, 41.935483870967744, 48.38709677419355]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 2
(149, 7924)
(54, 7924)
====================
1
[9.25925925925926, 12.962962962962962, 24.074074074074073, 27.77777777777778, 31.48148148148148, 37.03703703703704, 38.88888888888889, 40.74074074074074, 42.592592592592595, 42.592592592592595]
2
[0.0, 5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, 7.4074074074074066, 7.4074074074074066, 7.4074074074074066, 7.4074074074074066, 7.4074074074074066]
3
[7.4074074074074066, 18.51851851851852, 20.37037037037037, 24.074074074074073, 25.925925925925924, 25.925925925925924, 27.77777777777778, 29.629629629629626, 29.629629629629626, 33.33333333333333]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 3
(225, 7924)
(47, 7924)
====================
1
[19.148936170212767, 25.53191489361702, 25.53191489361702, 29.78723404255319, 36.17021276595745, 40.42553191489361, 42.5531914893617, 44.680851063829785, 44.680851063829785, 46.808510638297875]
2
[2.127659574468085, 2.127659574468085, 2.127659574468085, 2.127659574468085, 4.25531914893617, 10.638297872340425, 10.638297872340425, 10.638297872340425, 10.638297872340425, 10.638297872340425]
3
[8.51063829787234, 12.76595744680851, 14.893617021276595, 17.02127659574468, 17.02127659574468, 19.148936170212767, 23.404255319148938, 29.78723404255319, 31.914893617021278, 31.914893617021278]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 4
(301, 7924)
(61, 7924)
====================
1
[3.278688524590164, 3.278688524590164, 3.278688524590164, 6.557377049180328, 9.836065573770492, 9.836065573770492, 13.114754098360656, 14.754098360655737, 18.0327868852459, 18.0327868852459]
2
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.639344262295082, 1.639344262295082, 1.639344262295082, 1.639344262295082]
3
[6.557377049180328, 9.836065573770492, 9.836065573770492, 9.836065573770492, 14.754098360655737, 16.39344262295082, 19.672131147540984, 22.950819672131146, 22.950819672131146, 22.950819672131146]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 5
(376, 7924)
(74, 7924)
====================
1
[4.054054054054054, 9.45945945945946, 17.56756756756757, 22.972972972972975, 25.675675675675674, 29.72972972972973, 31.08108108108108, 33.78378378378378, 33.78378378378378, 37.83783783783784]
2
[0.0, 0.0, 0.0, 0.0, 0.0, 1.3513513513513513, 1.3513513513513513, 1.3513513513513513, 1.3513513513513513, 1.3513513513513513]
3
[5.405405405405405, 8.108108108108109, 13.513513513513514, 20.27027027027027, 21.62162162162162, 22.972972972972975, 22.972972972972975, 25.675675675675674, 25.675675675675674, 28.37837837837838]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 6
(452, 7924)
(75, 7924)
====================
1
[9.333333333333334, 14.666666666666666, 16.0, 18.666666666666668, 22.666666666666664, 28.000000000000004, 32.0, 37.333333333333336, 41.333333333333336, 42.66666666666667]
2
[0.0, 1.3333333333333335, 2.666666666666667, 5.333333333333334, 5.333333333333334, 6.666666666666667, 6.666666666666667, 6.666666666666667, 6.666666666666667, 6.666666666666667]
3
[5.333333333333334, 8.0, 8.0, 13.333333333333334, 13.333333333333334, 13.333333333333334, 16.0, 18.666666666666668, 20.0, 21.333333333333336]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 7
(528, 7924)
(72, 7924)
====================
1
[12.5, 22.22222222222222, 30.555555555555557, 38.88888888888889, 38.88888888888889, 41.66666666666667, 44.44444444444444, 47.22222222222222, 52.77777777777778, 55.55555555555556]
2
[1.3888888888888888, 2.7777777777777777, 2.7777777777777777, 4.166666666666666, 4.166666666666666, 4.166666666666666, 6.944444444444445, 6.944444444444445, 6.944444444444445, 6.944444444444445]
3
[8.333333333333332, 9.722222222222223, 16.666666666666664, 25.0, 25.0, 33.33333333333333, 37.5, 41.66666666666667, 44.44444444444444, 45.83333333333333]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 8
(603, 7924)
(73, 7924)
====================
1
[12.32876712328767, 27.397260273972602, 34.24657534246575, 36.986301369863014, 39.726027397260275, 42.465753424657535, 47.94520547945205, 49.31506849315068, 50.68493150684932, 53.42465753424658]
2
[0.0, 4.10958904109589, 4.10958904109589, 6.8493150684931505, 6.8493150684931505, 6.8493150684931505, 15.068493150684931, 15.068493150684931, 15.068493150684931, 15.068493150684931]
3
[8.21917808219178, 12.32876712328767, 13.698630136986301, 19.17808219178082, 23.28767123287671, 26.027397260273972, 27.397260273972602, 30.136986301369863, 31.506849315068493, 34.24657534246575]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 9
(679, 7924)
(65, 7924)
====================
1
[13.846153846153847, 29.230769230769234, 32.30769230769231, 36.92307692307693, 38.46153846153847, 41.53846153846154, 41.53846153846154, 49.23076923076923, 49.23076923076923, 52.307692307692314]
2
[0.0, 0.0, 0.0, 3.076923076923077, 4.615384615384616, 12.307692307692308, 13.846153846153847, 13.846153846153847, 13.846153846153847, 13.846153846153847]
3
[6.153846153846154, 10.76923076923077, 21.53846153846154, 23.076923076923077, 23.076923076923077, 26.153846153846157, 27.692307692307693, 30.76923076923077, 33.84615384615385, 36.92307692307693]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Starting cross validation 10
(755, 7924)
(68, 7924)
====================
1
[11.76470588235294, 11.76470588235294, 14.705882352941178, 17.647058823529413, 22.058823529411764, 23.52941176470588, 26.47058823529412, 26.47058823529412, 29.411764705882355, 29.411764705882355]
2
[0.0, 0.0, 0.0, 2.941176470588235, 4.411764705882353, 5.88235294117647, 7.352941176470589, 7.352941176470589, 7.352941176470589, 7.352941176470589]
3
[2.941176470588235, 4.411764705882353, 8.823529411764707, 8.823529411764707, 11.76470588235294, 14.705882352941178, 16.176470588235293, 16.176470588235293, 19.11764705882353, 19.11764705882353]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

issues and Remedies

Issue: # Hack to increase size due to Error: field larger than field limit (131072)

Remedy:

maxInt = sys.maxsize
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.

    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
        decrement = True

Issue:

AttributeError                            Traceback (most recent call last)
<ipython-input-4-a44dbf7d73b5> in <module>()
     34 # Learn the word2vec model and extract vocabulary
     35 wordvec_model = Word2Vec(all_data, min_count=min_word_frequency_word2vec, size=embed_size_word2vec, window=context_window_word2vec)
---> 36 vocabulary = wordvec_model.vocab
     37 vocab_size = len(vocabulary)

AttributeError: 'Word2Vec' object has no attribute 'vocab'

Remedy:

vocabulary = wordvec_model.wv.vocab

Issue:

TypeError                                 Traceback (most recent call last)
<ipython-input-2-665fcb716c54> in <module>()
    225     backwards_1 = LSTM(1024, go_backwards=True)(sequence)
    226     after_dp_backward_4 = Dropout(0.20)(backwards_1)
--> 227     merged = merge([after_dp_forward_4, after_dp_backward_4], mode='concat', concat_axis=-1)
    228     after_dp = Dropout(0.5)(merged)
    229     output = Dense(len(unique_train_label), activation='softmax')(after_dp)

TypeError: 'module' object is not callable

Remedy:

merged = layers.concatenate([after_dp_forward_4, after_dp_backward_4], axis=-1)
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-56f416da9cda> in <module>()
     12 from keras.preprocessing import sequence
     13 from keras.models import Model
---> 14 from keras.layers import Dense, Dropout, Embedding, LSTM, Input, Merge
     15 from keras.optimizers import RMSprop
     16 from keras.utils import np_utils

ImportError: cannot import name Merge

In [ ]: