In [1]:
import numpy as np
import csv
import pandas as pd
from sklearn import svm
from nltk.corpus import stopwords
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.data import load
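This notebook assumes several NLTK resources are already installed. If they are not, a one-time setup (not part of the original run) would look like:

nltk.download('punkt')                       # tokenizer model for word_tokenize
nltk.download('stopwords')                   # English stopword list
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('tagsets')                     # help/tagsets/upenn_tagset.pickle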
In [2]:
def readFile(fileName):
    sentences = []
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            # Skip TIMEX3 annotation lines.
            if "TIMEX3" in line:
                continue
            sentences.append(line.lower())
    return sentences
In [3]:
def loadW2V(fileName):
    # The word2vec dict was saved with np.save, so it comes back as a
    # 0-d object array; allow_pickle is required to unpickle it.
    w2v = np.load(fileName, allow_pickle=True).item()
    return w2v
In [4]:
def loadVocab(fileName):
    vocab = np.load(fileName, allow_pickle=True)
    return vocab.tolist()
In [5]:
def POSForSentence(sentence):
    text = word_tokenize(sentence)
    posSentence = nltk.pos_tag(text)
    # Keep only the tags, dropping the tokens themselves.
    posSentence = [tag for word, tag in posSentence]
    return posSentence
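A quick sanity check (assuming the averaged_perceptron_tagger model is available); the tags are Penn Treebank codes:

POSForSentence("the cat sat on the mat")
# expected: ['DT', 'NN', 'VBD', 'IN', 'DT', 'NN']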
In [6]:
def getUniquePOS():
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    # Materialize the keys so callers can use list.index() on them
    # (dict_keys has no .index() in Python 3).
    return len(tagdict), list(tagdict.keys())
In [7]:
def removeFancyChars(sentences):
    # Build a new list so the caller's originals are left untouched;
    # main relies on the unmodified sentences for POS tagging.
    return [re.sub(r'([^\s\w]|_)+', '', s) for s in sentences]
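An illustrative call (not from the original run): the regex strips punctuation and underscores (`\w` matches underscore, hence the explicit `|_`) while keeping letters, digits, and whitespace:

removeFancyChars(["e-mail, please!"])
# expected: ['email please']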
In [8]:
def removeSWFromSent(sentence):
    # Build the stopword set once for O(1) lookups instead of
    # re-reading the corpus list for every word.
    stopWords = set(stopwords.words('english'))
    words = [w for w in sentence.split() if w not in stopWords]
    if len(words) == 0:
        words = [""]
    return convertlistToString(words)
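For example, with NLTK's English stopword list:

removeSWFromSent("this is a test of the system")
# expected: 'test system'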
In [9]:
def removeSWFromPar(sentences):
    return [removeSWFromSent(s) for s in sentences]
In [10]:
def convertlistToString(sentence):
    return " ".join(sentence)
In [11]:
def defineBOWM(sentences, vocab):
    # Binary bag-of-words: mat[i, j] = 1 if vocab[j] occurs in sentence i.
    vocabSize = len(vocab)
    n = len(sentences)
    mat = np.zeros((n, vocabSize))
    for i in range(n):
        for w in sentences[i].split():
            if w in vocab:
                mat[i, vocab.index(w)] = 1.0
    return mat
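A minimal illustration with a toy vocabulary (hypothetical, not the saved vocab.npy): each row is a binary indicator vector over the vocabulary.

toyVocab = ["cat", "sat", "mat"]
defineBOWM(["the cat sat", "cat mat"], toyVocab)
# expected:
# array([[1., 1., 0.],
#        [1., 0., 1.]])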
In [12]:
def defineBOWMPOS(originalSentences, sentences, vocab):
    # Bag-of-words features augmented with binary POS-tag indicators.
    vocabSize = len(vocab)
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, vocabSize + sizePOS))
    mat[:, :vocabSize] = defineBOWM(sentences, vocab)
    # Tag the original (unstripped) sentences: punctuation helps the tagger.
    for i in range(len(originalSentences)):
        for p in POSForSentence(originalSentences[i]):
            mat[i, vocabSize + POSList.index(p)] = 1.0
    return mat
In [20]:
def defineW2V(sentences, w2v, w2vSize = 300):
    # Sentence embedding = mean of the word vectors of its tokens.
    n = len(sentences)
    mat = np.zeros((n, w2vSize))
    for i in range(n):
        words = sentences[i].split()
        d = np.zeros(w2vSize)
        ind = 0.0
        for w in words:
            if w not in w2v:
                # Cache a random vector for out-of-vocabulary words so the
                # same word always maps to the same vector.
                w2v[w] = np.random.uniform(-0.25, 0.25, w2vSize)
            d += w2v[w]
            ind += 1.0
        if ind > 0:
            d /= ind  # guard against empty sentences
        mat[i] = d
    return mat
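A toy illustration with a 3-dimensional embedding table (hypothetical; the real word2VecDict.npy holds 300-dimensional vectors): the sentence vector is the mean of its word vectors.

toyW2V = {"cat": np.array([1.0, 0.0, 1.0]), "sat": np.array([0.0, 1.0, 1.0])}
defineW2V(["cat sat"], toyW2V, w2vSize=3)
# expected: array([[0.5, 0.5, 1. ]])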
In [21]:
def defineW2VPOS(originalSentences, sentences, w2v, dim = 300):
    # word2vec features augmented with binary POS-tag indicators.
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, dim + sizePOS))
    mat[:, :dim] = defineW2V(sentences, w2v, dim)
    for i in range(len(originalSentences)):
        for p in POSForSentence(originalSentences[i]):
            mat[i, dim + POSList.index(p)] = 1.0
    return mat
In [15]:
def savetestFile(fileName, mat):
    np.save(fileName, mat)
In [38]:
def main():
    testFileName = "testdata_email_classification.txt"
    w2vFile = "word2VecDict.npy"
    vocabswFile = "vocabsw.npy"
    vocabFile = "vocab.npy"

    originalsentences = readFile(testFileName)
    sentences = removeFancyChars(originalsentences)
    w2v = loadW2V(w2vFile)
    vocabsw = loadVocab(vocabswFile)
    vocab = loadVocab(vocabFile)

    # word2vec features, without and with POS indicators.
    matW2V = defineW2V(sentences, w2v)
    savetestFile("testw2v.npy", matW2V)
    matW2VPOS = defineW2VPOS(originalsentences, sentences, w2v)
    savetestFile("testw2vpos.npy", matW2VPOS)

    # Bag-of-words features, without and with POS indicators.
    matBOW = defineBOWM(sentences, vocab)
    savetestFile("testbow.npy", matBOW)
    matBOWPOS = defineBOWMPOS(originalsentences, sentences, vocab)
    savetestFile("testbowpos.npy", matBOWPOS)

    # Repeat bag-of-words on the stopword-filtered sentences,
    # against the stopword-filtered vocabulary.
    sentences = removeSWFromPar(sentences)
    matBOWSW = defineBOWM(sentences, vocabsw)
    savetestFile("testbowsw.npy", matBOWSW)
    matBOWPOSSW = defineBOWMPOS(originalsentences, sentences, vocabsw)
    savetestFile("testbowpossw.npy", matBOWPOSSW)
In [39]:
if __name__ == "__main__":
    main()
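Running main writes six feature matrices: testw2v.npy and testw2vpos.npy (word2vec, without/with POS indicators), testbow.npy and testbowpos.npy (bag-of-words), and testbowsw.npy and testbowpossw.npy (bag-of-words after stopword removal). Each can be loaded back for a downstream classifier, e.g.:

matW2V = np.load("testw2v.npy")  # shape (numSentences, 300)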