In [1]:
import numpy as np
import csv
import pandas as pd
from sklearn import svm
from nltk.corpus import stopwords
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.data import load
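This notebook assumes several NLTK resources are already installed. If they are not, a one-time setup (not part of the original run) would look like:

nltk.download('punkt')                       # tokenizer model for word_tokenize
nltk.download('stopwords')                   # English stopword list
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('tagsets')                     # help/tagsets/upenn_tagset.pickle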
In [2]:
def readFile(fileName):
    sentences = []
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            # Skip TIMEX3 annotation lines.
            if "TIMEX3" in line:
                continue
            sentences.append(line.lower())
    return sentences
In [3]:
def loadW2V(fileName):
    # The word2vec dict was saved with np.save, so it comes back as a
    # 0-d object array; allow_pickle is required to unpickle it.
    w2v = np.load(fileName, allow_pickle=True).item()
    return w2v
In [4]:
def loadVocab(fileName):
    vocab = np.load(fileName, allow_pickle=True)
    return vocab.tolist()
In [5]:
def POSForSentence(sentence):
    text = word_tokenize(sentence)
    posSentence = nltk.pos_tag(text)
    # Keep only the tags, dropping the tokens themselves.
    posSentence = [tag for word, tag in posSentence]
    return posSentence
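A quick sanity check (assuming the averaged_perceptron_tagger model is available); the tags are Penn Treebank codes:

POSForSentence("the cat sat on the mat")
# expected: ['DT', 'NN', 'VBD', 'IN', 'DT', 'NN']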
In [6]:
def getUniquePOS():
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    # Materialize the keys so callers can use list.index() on them
    # (dict_keys has no .index() in Python 3).
    return len(tagdict), list(tagdict.keys())
In [7]:
def removeFancyChars(sentences):
    # Build a new list so the caller's originals are left untouched;
    # main relies on the unmodified sentences for POS tagging.
    return [re.sub(r'([^\s\w]|_)+', '', s) for s in sentences]
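An illustrative call (not from the original run): the regex strips punctuation and underscores (`\w` matches underscore, hence the explicit `|_`) while keeping letters, digits, and whitespace:

removeFancyChars(["e-mail, please!"])
# expected: ['email please']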
In [8]:
def removeSWFromSent(sentence):
    # Build the stopword set once for O(1) lookups instead of
    # re-reading the corpus list for every word.
    stopWords = set(stopwords.words('english'))
    words = [w for w in sentence.split() if w not in stopWords]
    if len(words) == 0:
        words = [""]
    return convertlistToString(words)
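For example, with NLTK's English stopword list:

removeSWFromSent("this is a test of the system")
# expected: 'test system'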
In [9]:
def removeSWFromPar(sentences):
    return [removeSWFromSent(s) for s in sentences]
In [10]:
def convertlistToString(sentence):
    return " ".join(sentence)
In [11]:
def defineBOWM(sentences, vocab):
    # Binary bag-of-words: mat[i, j] = 1 if vocab[j] occurs in sentence i.
    vocabSize = len(vocab)
    n = len(sentences)
    mat = np.zeros((n, vocabSize))
    for i in range(n):
        for w in sentences[i].split():
            if w in vocab:
                mat[i, vocab.index(w)] = 1.0
    return mat
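A minimal illustration with a toy vocabulary (hypothetical, not the saved vocab.npy): each row is a binary indicator vector over the vocabulary.

toyVocab = ["cat", "sat", "mat"]
defineBOWM(["the cat sat", "cat mat"], toyVocab)
# expected:
# array([[1., 1., 0.],
#        [1., 0., 1.]])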
In [12]:
def defineBOWMPOS(originalSentences, sentences, vocab):
    # Bag-of-words features augmented with binary POS-tag indicators.
    vocabSize = len(vocab)
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, vocabSize + sizePOS))
    mat[:, :vocabSize] = defineBOWM(sentences, vocab)
    # Tag the original (unstripped) sentences: punctuation helps the tagger.
    for i in range(len(originalSentences)):
        for p in POSForSentence(originalSentences[i]):
            mat[i, vocabSize + POSList.index(p)] = 1.0
    return mat
In [20]:
def defineW2V(sentences, w2v, w2vSize = 300):
    # Sentence embedding = mean of the word vectors of its tokens.
    n = len(sentences)
    mat = np.zeros((n, w2vSize))
    for i in range(n):
        words = sentences[i].split()
        d = np.zeros(w2vSize)
        ind = 0.0
        for w in words:
            if w not in w2v:
                # Cache a random vector for out-of-vocabulary words so the
                # same word always maps to the same vector.
                w2v[w] = np.random.uniform(-0.25, 0.25, w2vSize)
            d += w2v[w]
            ind += 1.0
        if ind > 0:
            d /= ind  # guard against empty sentences
        mat[i] = d
    return mat
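A toy illustration with a 3-dimensional embedding table (hypothetical; the real word2VecDict.npy holds 300-dimensional vectors): the sentence vector is the mean of its word vectors.

toyW2V = {"cat": np.array([1.0, 0.0, 1.0]), "sat": np.array([0.0, 1.0, 1.0])}
defineW2V(["cat sat"], toyW2V, w2vSize=3)
# expected: array([[0.5, 0.5, 1. ]])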
In [21]:
def defineW2VPOS(originalSentences, sentences, w2v, dim = 300):
    # word2vec features augmented with binary POS-tag indicators.
    n = len(sentences)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, dim + sizePOS))
    mat[:, :dim] = defineW2V(sentences, w2v, dim)
    for i in range(len(originalSentences)):
        for p in POSForSentence(originalSentences[i]):
            mat[i, dim + POSList.index(p)] = 1.0
    return mat
In [15]:
def savetestFile(fileName, mat):
    np.save(fileName, mat)
In [38]:
def main():
    testFileName = "testdata_email_classification.txt"
    w2vFile = "word2VecDict.npy"
    vocabswFile = "vocabsw.npy"
    vocabFile = "vocab.npy"

    originalsentences = readFile(testFileName)
    sentences = removeFancyChars(originalsentences)
    w2v = loadW2V(w2vFile)
    vocabsw = loadVocab(vocabswFile)
    vocab = loadVocab(vocabFile)

    # word2vec features, without and with POS indicators.
    matW2V = defineW2V(sentences, w2v)
    savetestFile("testw2v.npy", matW2V)
    matW2VPOS = defineW2VPOS(originalsentences, sentences, w2v)
    savetestFile("testw2vpos.npy", matW2VPOS)

    # Bag-of-words features, without and with POS indicators.
    matBOW = defineBOWM(sentences, vocab)
    savetestFile("testbow.npy", matBOW)
    matBOWPOS = defineBOWMPOS(originalsentences, sentences, vocab)
    savetestFile("testbowpos.npy", matBOWPOS)

    # Repeat bag-of-words on the stopword-filtered sentences,
    # against the stopword-filtered vocabulary.
    sentences = removeSWFromPar(sentences)
    matBOWSW = defineBOWM(sentences, vocabsw)
    savetestFile("testbowsw.npy", matBOWSW)
    matBOWPOSSW = defineBOWMPOS(originalsentences, sentences, vocabsw)
    savetestFile("testbowpossw.npy", matBOWPOSSW)
In [39]:
if __name__ == "__main__":
    main()
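Running main writes six feature matrices: testw2v.npy and testw2vpos.npy (word2vec, without/with POS indicators), testbow.npy and testbowpos.npy (bag-of-words), and testbowsw.npy and testbowpossw.npy (bag-of-words after stopword removal). Each can be loaded back for a downstream classifier, e.g.:

matW2V = np.load("testw2v.npy")  # shape (numSentences, 300)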