In [36]:
import csv
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.data import load
from nltk.tokenize import *
from sklearn import svm

Load Test File


In [4]:
def readFile(fileName):
    """Read a text file and return its usable lines, lower-cased.

    Blank lines and any line containing the marker "TIMEX3" are dropped.
    Every remaining line is stripped of surrounding whitespace and
    lower-cased.

    Parameters
    ----------
    fileName : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The kept lines, in file order.
    """
    sentences = []
    # Read-only mode: the file is never written, and "r+" fails on files
    # that are readable but not writable. The context manager closes the
    # handle even if reading raises.
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            # The marker test runs before lower-casing, so it is case-sensitive.
            if not line or "TIMEX3" in line:
                continue
            sentences.append(line.lower())
    return sentences

In [7]:
def loadW2V(fileName):
    """Load the word-vector mapping stored in a ``.npy`` file.

    The file is read with ``np.load`` and the single wrapped Python object
    is extracted with ``.item()``.

    Parameters
    ----------
    fileName : str
        Path of the ``.npy`` file.

    Returns
    -------
    object
        Whatever object the file wraps; the rest of this notebook uses it
        as a mapping from word to vector.
    """
    # A saved dict is stored as a pickled object array, and current NumPy
    # refuses to load those unless allow_pickle=True is passed.
    # SECURITY: unpickling can execute arbitrary code -- only load files
    # from a trusted source.
    return np.load(fileName, allow_pickle=True).item()

In [15]:
def loadVocab(fileName):
    """Return the array stored in the ``.npy`` file ``fileName``.

    The result is whatever ``np.load`` produces for that file.
    """
    return np.load(fileName)

Part-of-Speech Tagging


In [48]:
def POSForSentence(sentence):
    """Return the part-of-speech tags of a sentence, one per token.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tags are kept, in token order.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        The tag assigned to each token.
    """
    # Requires the top-level ``import nltk``; importing only nltk
    # submodules does not bind the name ``nltk``.
    tokens = word_tokenize(sentence)
    return [tag for _, tag in nltk.pos_tag(tokens)]

In [49]:
def getUniquePOS():
    """Return the number of tags in the Penn tagset and the tags themselves.

    Returns
    -------
    tuple
        ``(count, tags)`` where ``tags`` is a list of the keys of the
        loaded ``help/tagsets/upenn_tagset.pickle`` resource.
    """
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    # Return a real list: callers use ``.index`` on the result, which a
    # Python 3 ``dict.keys()`` view does not provide.
    tags = list(tagdict.keys())
    return len(tags), tags

Remove Non-Word Characters


In [43]:
def removeFancyChars(sentences):
    """Delete punctuation and underscores from every sentence, in place.

    Each run of characters that are neither whitespace nor word characters
    (plus underscores) is removed. The list passed in is modified and the
    same list object is returned.
    """
    nonWord = re.compile(r'([^\s\w]|_)+')
    # Slice assignment rewrites the caller's list instead of rebinding a
    # local name, so the in-place contract is preserved.
    sentences[:] = [nonWord.sub('', text) for text in sentences]
    return sentences

Stop Words


In [ ]:
def removeSWFromSent(sentence):
    """Return ``sentence`` with English stop words removed.

    The sentence is split on whitespace, words found in NLTK's English
    stop-word list are dropped, and the remaining words are joined back
    with single spaces. If every word is a stop word, or the sentence is
    empty, the result is the empty string.

    Parameters
    ----------
    sentence : str
        Whitespace-separated text.

    Returns
    -------
    str
        The filtered sentence.
    """
    # Fetch the stop-word list once per call rather than once per word,
    # and use a set so each membership test is O(1).
    stopSet = set(stopwords.words('english'))
    kept = [w for w in sentence.split() if w not in stopSet]
    return convertlistToString(kept)

In [34]:
def removeSWFromPar(sentences):
    """Apply ``removeSWFromSent`` to every sentence and return the results.

    A new list is returned; ``sentences`` itself is not modified.
    """
    return [removeSWFromSent(text) for text in sentences]

In [35]:
def convertlistToString(sentence):
    """Join a sequence of words into one string separated by single spaces."""
    return " ".join(sentence)

Define BOW Model


In [50]:
def defineBOWM(sentences, vocab):
    """Build a binary bag-of-words matrix.

    Row ``i`` has a 1.0 in column ``j`` when word ``vocab[j]`` occurs in
    ``sentences[i]`` (split on whitespace); every other entry is 0.0.
    Words that are not in ``vocab`` are ignored.

    Parameters
    ----------
    sentences : sequence of str
        Whitespace-separated sentences.
    vocab : sequence of str
        Vocabulary; may be a list or a NumPy array.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences), len(vocab))``.
    """
    # ndarray has no ``.index``; converting to a plain list first lets the
    # same code handle both lists and arrays loaded with np.load.
    words = vocab.tolist() if hasattr(vocab, "tolist") else vocab

    # Map each word to the column of its first occurrence -- the column
    # ``list.index`` would pick -- so every lookup is O(1).
    column = {}
    for idx, word in enumerate(words):
        column.setdefault(word, idx)

    mat = np.zeros((len(sentences), len(vocab)))
    for row, sentence in enumerate(sentences):
        for w in sentence.split():
            col = column.get(w)
            if col is not None:
                mat[row, col] = 1.0
    return mat

In [55]:
def defineBOWMPOS(originalSentences, sentences, vocab):
    """Build bag-of-words features extended with part-of-speech indicators.

    The first ``len(vocab)`` columns are the matrix from ``defineBOWM`` for
    ``sentences``. The remaining columns, one per tag from
    ``getUniquePOS``, are set to 1.0 when that tag occurs in the matching
    entry of ``originalSentences``.

    Parameters
    ----------
    originalSentences : sequence of str
        Sentences to POS-tag; entry ``i`` fills row ``i``.
    sentences : sequence of str
        Sentences used for the bag-of-words part.
    vocab : sequence of str
        Vocabulary for the bag-of-words part.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences), len(vocab) + number_of_tags)``.
    """
    vocabSize = len(vocab)
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()

    # Column of each tag, offset past the vocabulary columns. A dict works
    # whether POSList is a list or a dict-keys view.
    tagColumn = {tag: vocabSize + j for j, tag in enumerate(POSList)}

    mat = np.zeros((n, vocabSize + sizePOS))
    mat[:, :vocabSize] = defineBOWM(sentences, vocab)

    # The original indexed an undefined ``originalSentencesClass[i][j]``;
    # each original sentence is tagged directly instead.
    for row, original in enumerate(originalSentences):
        for tag in POSForSentence(original):
            col = tagColumn.get(tag)
            # Tags missing from the tag list have no column and are skipped.
            if col is not None:
                mat[row, col] = 1.0

    return mat

Define W2V Model


In [51]:
def defineW2V(sentences, w2v, w2vSize = 300):
    """Represent each sentence as the mean of its word vectors.

    Words missing from ``w2v`` get a random vector drawn uniformly from
    [-0.25, 0.25); that vector is stored in ``w2v`` so later occurrences of
    the same word reuse it. Note that this mutates the caller's mapping.
    A sentence with no words yields an all-zero row.

    Parameters
    ----------
    sentences : sequence of str
        Whitespace-separated sentences.
    w2v : dict
        Mapping from word to a vector of length ``w2vSize``; updated in
        place with vectors for unseen words.
    w2vSize : int, optional
        Dimensionality of the word vectors (default 300).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences), w2vSize)``.
    """
    mat = np.zeros((len(sentences), w2vSize))
    for i, sentence in enumerate(sentences):
        words = sentence.split()
        if not words:
            # Nothing to average; keep the zero row instead of producing
            # NaN from a 0/0 division.
            continue
        # Sized from the parameter (the original hard-coded 300).
        total = np.zeros(w2vSize)
        for w in words:
            if w not in w2v:
                # The original referenced an undefined ``dim`` here.
                w2v[w] = np.random.uniform(-0.25, 0.25, w2vSize)
            total += w2v[w]
        mat[i] = total / len(words)
    return mat

In [53]:
def defineW2VPOS(originalSentences, sentences, w2v, dim = 300):
    """Build mean word-vector features extended with POS indicators.

    The first ``dim`` columns are the matrix from ``defineW2V`` for
    ``sentences``. The remaining columns, one per tag from
    ``getUniquePOS``, are set to 1.0 when that tag occurs in the matching
    entry of ``originalSentences``.

    Parameters
    ----------
    originalSentences : sequence of str
        Sentences to POS-tag; entry ``i`` fills row ``i``.
    sentences : sequence of str
        Sentences used for the word-vector part.
    w2v : dict
        Mapping from word to vector, passed through to ``defineW2V``.
    dim : int, optional
        Dimensionality of the word vectors (default 300).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sentences), dim + number_of_tags)``.
    """
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()

    # Column of each tag, offset past the word-vector columns. A dict works
    # whether POSList is a list or a dict-keys view.
    tagColumn = {tag: dim + j for j, tag in enumerate(POSList)}

    mat = np.zeros((n, dim + sizePOS))
    # Forward ``dim`` so the word-vector block has the width allocated for
    # it; the original always requested the default size.
    mat[:, :dim] = defineW2V(sentences, w2v, dim)

    for row, original in enumerate(originalSentences):
        for tag in POSForSentence(original):
            col = tagColumn.get(tag)
            # Tags missing from the tag list have no column and are skipped.
            if col is not None:
                mat[row, col] = 1.0

    return mat

Save features


In [33]:
def savetestFile(fileName, mat):
    """Write the array ``mat`` to ``fileName`` using ``np.save``."""
    np.save(file=fileName, arr=mat)

Main


In [56]:
def main(testFileName=""):
    """Extract and save feature matrices for a test file.

    Reads the sentences, strips non-word characters, and writes three
    matrices to the working directory: mean word vectors
    (``testw2v.npy``), word vectors plus POS indicators
    (``testw2vpos.npy``), and a bag-of-words matrix computed after
    stop-word removal (``testbow.npy``).

    Parameters
    ----------
    testFileName : str, optional
        Path of the text file to process. The default is the empty
        placeholder from the original notebook, which cannot be opened, so
        a real path must be supplied.
    """
    w2vFile = "word2VecDict.npy"
    vocabswFile = "vocabsw.npy"
    vocabFile = "vocab.npy"

    originalsentences = readFile(testFileName)
    # Clean a copy: removeFancyChars edits its argument in place, and the
    # untouched sentences are still needed below for POS tagging.
    sentences = removeFancyChars(list(originalsentences))

    w2v = loadW2V(w2vFile)
    # NOTE(review): vocabsw is loaded but never used. It is presumably the
    # vocabulary that belongs with the stop-word-filtered sentences below
    # -- confirm against the training pipeline before switching.
    vocabsw = loadVocab(vocabswFile)
    vocab = loadVocab(vocabFile)

    matW2V = defineW2V(sentences, w2v)
    savetestFile("testw2v.npy", matW2V)

    matW2VPOS = defineW2VPOS(originalsentences, sentences, w2v)
    # The original call omitted the matrix argument.
    savetestFile("testw2vpos.npy", matW2VPOS)

    sentences = removeSWFromPar(sentences)
    matBOW = defineBOWM(sentences, vocab)
    # The original called an undefined saveBOWM; the shared saver is used
    # with a file name that follows the other outputs.
    savetestFile("testbow.npy", matBOW)


  File "<ipython-input-56-6808527e2c49>", line 15
    savetestFile("testw2v.npy"matW2V)
                                   ^
SyntaxError: invalid syntax

In [30]:
# Run the feature-extraction pipeline when this module is the entry point
# (i.e. when __name__ is "__main__").
if __name__ == "__main__":
    main()


1