In [36]:
import numpy as np
import csv
import pandas as pd
from sklearn import svm
import nltk
from nltk.corpus import stopwords
from nltk.data import load
from nltk.tokenize import word_tokenize
import re
In [4]:
def readFile(fileName):
    sentences = []
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            if "TIMEX3" in line:
                continue
            sentences.append(line.lower())
    return sentences
In [7]:
def loadW2V(fileName):
    # .item() unpacks the 0-d object array back into the saved dict;
    # allow_pickle is required for object arrays in recent NumPy versions
    w2v = np.load(fileName, allow_pickle=True).item()
    return w2v
In [15]:
def loadVocab(fileName):
    # list() so that vocab.index(w) works in defineBOWM below
    vocab = list(np.load(fileName, allow_pickle=True))
    return vocab
In [48]:
def POSForSentence(sentence):
    text = word_tokenize(sentence)
    posSentence = nltk.pos_tag(text)
    posSentence = [tag for word, tag in posSentence]  # keep only the tags
    return posSentence
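A quick sanity check on a made-up sentence; this assumes the NLTK punkt tokenizer and POS tagger models have already been downloaded:
In [ ]:
# sketch: verify POSForSentence on a toy sentence
# assumes nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')
print(POSForSentence("the cat sat on the mat"))
# expected: a list of Penn Treebank tags, e.g.
# ['DT', 'NN', 'VBD', 'IN', 'DT', 'NN']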
In [49]:
def getUniquePOS():
    # Penn Treebank tagset shipped with NLTK; list() so .index() works later
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    return len(tagdict), list(tagdict.keys())
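A quick look at the tagset that fixes the width of the POS feature block later on (assumes the NLTK tagsets resource is downloaded):
In [ ]:
# sketch: inspect the tagset used for the POS feature columns
sizePOS, POSList = getUniquePOS()
print(sizePOS, POSList[:5])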
In [43]:
def removeFancyChars(sentences):
    # strip, in place, every character that is not whitespace or alphanumeric
    for i in range(len(sentences)):
        sentences[i] = re.sub(r'([^\s\w]|_)+', '', sentences[i])
    return sentences
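A minimal check of the regex, on a made-up string:
In [ ]:
# sketch: removeFancyChars strips punctuation and underscores in place
print(removeFancyChars(["it's a test_, isn't it?"]))
# -> ['its a test isnt it']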
In [ ]:
def removeSWFromSent(sentence):
    # cache the stopword list once instead of reloading it per word
    sw = set(stopwords.words('english'))
    words = [w for w in sentence.split() if w not in sw]
    if len(words) == 0:
        words = [""]
    return convertlistToString(words)
In [34]:
def removeSWFromPar(sentences):
    return [removeSWFromSent(s) for s in sentences]
In [35]:
def convertlistToString(sentence):
    return " ".join(sentence)
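With the helpers above in place, stopword removal can be checked on a toy sentence (assumes the NLTK stopwords corpus is downloaded, and that the input is already lower-cased, as readFile guarantees):
In [ ]:
# sketch: stopword removal on a toy sentence; assumes nltk.download('stopwords')
print(removeSWFromSent("this is a test of the filter"))
# -> 'test filter'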
In [50]:
def defineBOWM(sentences, vocab):
    # binary bag-of-words matrix: one row per sentence, one column per vocab word
    vocabSize = len(vocab)
    n = len(sentences)
    mat = np.zeros((n, vocabSize))
    for i in range(n):
        for w in sentences[i].split():
            if w in vocab:
                mat[i, vocab.index(w)] = 1.0
    return mat
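A toy run with a made-up three-word vocabulary, to show the row/column layout:
In [ ]:
# sketch: binary BOW on a toy corpus; "on" is out of vocabulary and is skipped
toyVocab = ["cat", "dog", "mat"]
toySents = ["cat on mat", "dog dog"]
print(defineBOWM(toySents, toyVocab))
# -> [[1. 0. 1.]
#     [0. 1. 0.]]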
In [55]:
def defineBOWMPOS(originalSentences, sentences, vocab):
    # bag-of-words features followed by binary POS-tag features per sentence
    vocabSize = len(vocab)
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, vocabSize + sizePOS))
    mat[:, :vocabSize] = defineBOWM(sentences, vocab)
    for i in range(len(originalSentences)):
        # POS tags come from the original (unprocessed) sentences
        for p in POSForSentence(originalSentences[i]):
            mat[i, vocabSize + POSList.index(p)] = 1.0
    return mat
In [51]:
def defineW2V(sentences, w2v, w2vSize=300):
    # average the word vectors of each sentence; unseen words get a random vector
    n = len(sentences)
    mat = np.zeros((n, w2vSize))
    for i in range(n):
        words = sentences[i].split()
        d = np.zeros(w2vSize)
        ind = 0.0
        for w in words:
            if w not in w2v:
                w2v[w] = np.random.uniform(-0.25, 0.25, w2vSize)
            d += w2v[w]
            ind += 1.0
        if ind > 0:
            d /= ind  # guard against empty sentences
        mat[i] = d
    return mat
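A toy run with a made-up two-dimensional embedding dict, to see the averaging:
In [ ]:
# sketch: sentence vectors as the mean of word vectors; dim=2 for readability
toyW2V = {"cat": np.array([1.0, 0.0]), "mat": np.array([0.0, 1.0])}
print(defineW2V(["cat mat"], toyW2V, w2vSize=2))
# -> [[0.5 0.5]]  (mean of the two word vectors)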
In [53]:
def defineW2VPOS(originalSentences, sentences, w2v, dim=300):
    # averaged word2vec features followed by binary POS-tag features
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, dim + sizePOS))
    mat[:, :dim] = defineW2V(sentences, w2v, dim)
    for i in range(len(originalSentences)):
        for p in POSForSentence(originalSentences[i]):
            mat[i, dim + POSList.index(p)] = 1.0
    return mat
In [33]:
def savetestFile(fileName, mat):
    np.save(fileName, mat)
In [56]:
def main():
    testFileName = ""
    w2vFile = "word2VecDict.npy"
    vocabswFile = "vocabsw.npy"
    vocabFile = "vocab.npy"
    originalsentences = readFile(testFileName)
    # copy first: removeFancyChars mutates its argument, and the original
    # sentences are still needed for POS tagging
    sentences = removeFancyChars(list(originalsentences))
    w2v = loadW2V(w2vFile)
    vocabsw = loadVocab(vocabswFile)
    vocab = loadVocab(vocabFile)
    matW2V = defineW2V(sentences, w2v)
    savetestFile("testw2v.npy", matW2V)
    matW2VPOS = defineW2VPOS(originalsentences, sentences, w2v)
    savetestFile("testw2vpos.npy", matW2VPOS)
    sentences = removeSWFromPar(sentences)
    matBOW = defineBOWM(sentences, vocab)
    savetestFile("testbow.npy", matBOW)
In [30]:
if __name__ == "__main__":
    main()