In [1]:
import numpy as np
import csv
import pandas as pd
from sklearn import svm
from nltk.corpus import stopwords
import re
import nltk
from nltk.tokenize import *
from nltk.data import load
In [2]:
def sentenceExtractionForTraining(dirName, fileName, classes):
    """Read one sentence file per class.

    dirName: directory prefix joined directly to each file name.
    fileName: list of file names, parallel to classes.
    classes: class labels; only its length drives the iteration.
    Returns a list of sentence lists, one entry per class.
    """
    return [readFile(dirName + fileName[i]) for i, _ in enumerate(classes)]
In [3]:
def readFile(fileName):
    """Read a training file and return its usable lines, lowercased.

    Skips blank lines and any line containing the literal tag "TIMEX3".

    fileName: path of the text file to read.
    Returns a list of stripped, lowercased lines.
    """
    sentences = []
    # "r" instead of "r+": this function only reads, and r+ fails on
    # read-only files.  The with-block guarantees the handle is closed
    # even if an exception occurs mid-read.
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            if "TIMEX3" in line:
                continue
            # line is already stripped, so no further rstrip is needed
            sentences.append(line.lower())
    return sentences
In [4]:
def removeFancyChars(sentences):
    """Strip punctuation and underscores from every sentence.

    Mutates the given list in place (each element is replaced with its
    cleaned form) and also returns it for convenience.
    """
    for idx, text in enumerate(sentences):
        # drop every run of non-word/non-space characters, underscores included
        sentences[idx] = re.sub(r'([^\s\w]|_)+', '', text)
    return sentences
In [5]:
def removeFC(sentencesClass):
    """Apply removeFancyChars to every class's sentence list.

    Mutates sentencesClass in place and returns it.
    """
    for i, sentences in enumerate(sentencesClass):
        sentencesClass[i] = removeFancyChars(sentences)
    return sentencesClass
In [6]:
def POSForSentence(sentence):
    """Return the list of POS tags (tags only, tokens dropped) for a sentence."""
    tagged = nltk.pos_tag(word_tokenize(sentence))
    return [tag for _token, tag in tagged]
In [7]:
def getUniquePOS():
    """Load the Penn Treebank tagset and return (tag count, list of tag names).

    Returns the tag names as a list — not a dict_keys view — because
    callers (defineBOWMPOS) call .index() on the result, which a
    dict_keys view does not support in Python 3.
    """
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    return len(tagdict), list(tagdict.keys())
In [8]:
def removeSWFromSent(sentence):
    """Remove English stopwords from a whitespace-split sentence string.

    Returns the remaining words joined by single spaces; if every word
    was a stopword, returns the empty string (preserving the original
    one-empty-word placeholder behavior).
    """
    # Build the stopword set once per call instead of re-reading the
    # corpus for every word (was O(words * |stopwords|)); set membership
    # is O(1) versus O(n) on the list.
    stopSet = set(stopwords.words('english'))
    kept = [w for w in sentence.split() if w not in stopSet]
    if len(kept) == 0:
        kept = [""]
    return convertlistToString(kept)
In [9]:
def removeSWFromPar(sentences):
    """Return a new list with stopwords removed from each sentence string."""
    return [removeSWFromSent(sentence) for sentence in sentences]
In [10]:
def removeSWFromClass(sentencesClass):
    """Return a new per-class list of sentence lists with stopwords removed."""
    return [removeSWFromPar(sentences) for sentences in sentencesClass]
In [11]:
def convertlistToString(sentence):
    """Join a list of words into a single space-separated string."""
    return " ".join(sentence)
In [12]:
def uniqueWordsCount(sentencesClass):
    """Collect the vocabulary over all classes' sentences.

    sentencesClass: list (per class) of lists of sentence strings.
    Returns a sorted list of unique whitespace-delimited words.

    Sorting replaces the original list(set) conversion, whose order is
    nondeterministic across Python runs (hash randomization) — that made
    saved vocab/BOW artifacts irreproducible.  Callers only need a
    consistent order (they use vocab.index), so sorting is compatible.
    """
    uniqueWords = set()
    for sentences in sentencesClass:
        for sentence in sentences:
            uniqueWords.update(sentence.split())
    return sorted(uniqueWords)
In [13]:
def totalSentences(sentencesClass):
    """Total number of sentences across all classes."""
    return sum(len(sentences) for sentences in sentencesClass)
In [14]:
def defineBOWM(sentencesClass, vocab):
    """Build a binary bag-of-words matrix and 1-based class labels.

    sentencesClass: list (per class) of lists of sentence strings; every
        word in them must appear in vocab (a KeyError otherwise, matching
        the original's ValueError on a missing vocab.index lookup).
    vocab: list of vocabulary words.
    Returns (mat, labels): mat is an (n_sentences, len(vocab)) 0/1 float
    matrix; labels[k] = class index + 1 for sentence k.
    """
    # Precompute word -> column once: O(1) lookups instead of repeated
    # vocab.index() scans (was O(total_words * |vocab|)).  setdefault
    # keeps the FIRST occurrence, matching list.index semantics should
    # vocab ever contain duplicates.
    wordIndex = {}
    for i, w in enumerate(vocab):
        wordIndex.setdefault(w, i)
    n = sum(len(sentences) for sentences in sentencesClass)
    labels = np.zeros(n)
    mat = np.zeros((n, len(vocab)))
    k = 0
    for classIdx, sentences in enumerate(sentencesClass):
        for sentence in sentences:
            for w in sentence.split():
                mat[k, wordIndex[w]] = 1.0
            # classes are labeled 1..len(sentencesClass)
            labels[k] = classIdx + 1
            k += 1
    return mat, labels
In [15]:
def defineBOWMPOS(originalSentencesClass, sentencesClass, vocab):
    """Build features combining bag-of-words with binary POS-tag presence.

    originalSentencesClass: sentences used for POS tagging (pre-cleaning).
    sentencesClass: cleaned sentences used for the bag-of-words part;
        must contain the same number of sentences, in the same order.
    vocab: vocabulary list for the bag-of-words columns.
    Returns (mat, labels): mat is (n, len(vocab) + n_pos_tags); the first
    len(vocab) columns are the BOW features, the rest mark POS tags seen.
    """
    vocabSize = len(vocab)
    sizePOS, POSList = getUniquePOS()
    # defineBOWM already computes the labels; the original's separate
    # labels = np.zeros(n) was immediately overwritten and is dropped.
    matFromBOWM, labels = defineBOWM(sentencesClass, vocab)
    n = matFromBOWM.shape[0]
    mat = np.zeros((n, vocabSize + sizePOS))
    # One vectorized slice assignment replaces the per-row copy loop.
    mat[:, :vocabSize] = matFromBOWM
    # Dict lookup instead of POSList.index(p): O(1) per tag, and it also
    # works when POSList is a dict_keys view (which has no .index()).
    posIndex = {p: i for i, p in enumerate(POSList)}
    k = 0
    for sentences in originalSentencesClass:
        for sentence in sentences:
            for tag in POSForSentence(sentence):
                mat[k, vocabSize + posIndex[tag]] = 1.0
            k += 1
    return mat, labels
In [16]:
def finalFeaturesLabel(X, y):
    """Prepend the label column to the feature matrix.

    X: (n, d) feature matrix; y: length-n label vector.
    Returns an (n, d+1) matrix whose row i is [y[i], X[i, :]].
    """
    n, d = X.shape
    finalMat = np.zeros((n, d + 1))
    # column 0 holds the labels, the rest are the features
    finalMat[:, 0] = y
    finalMat[:, 1:] = X
    return finalMat
In [17]:
def saveBOW(fileName, finalMat):
    """Persist the combined label+feature matrix to disk in .npy format."""
    np.save(fileName, finalMat)
In [18]:
def saveVocab(fileName, vocab):
    """Persist the vocabulary (list of words) to disk in .npy format."""
    np.save(fileName, vocab)
In [19]:
def main():
    # Feature-extraction pipeline: reads the per-class sentence files and
    # writes four feature matrices to disk (BOW, BOW without stopwords,
    # BOW+POS, BOW+POS without stopwords), each with labels in column 0.
    dirName = "Email-classification_dataset/"
    # class labels 1..5; only the length is used downstream
    classes = [1,2,3,4,5]
    fileName = ["RD-positive-800.txt", "meetings-positive-800.txt", "negative-800.txt", "fyi-positive-800.txt", "tp-positive-500.txt",]
    # NOTE(review): this flag is never read — stopword removal always runs below.
    removeStopWords = True
    originalsentencesClass = sentenceExtractionForTraining(dirName, fileName, classes)
    # NOTE(review): removeFC mutates its argument in place and returns it,
    # so sentencesClass and originalsentencesClass alias the SAME cleaned
    # lists from here on — the POS features computed later via
    # defineBOWMPOS(originalsentencesClass, ...) therefore tag the cleaned
    # text, not the raw originals.  Confirm this is intended.
    sentencesClass = removeFC(originalsentencesClass)
    vocab = uniqueWordsCount(sentencesClass)
    # plain bag-of-words features
    XBOW, yBOW = defineBOWM(sentencesClass, vocab)
    finalMatBOW = finalFeaturesLabel(XBOW, yBOW)
    saveBOW("bow.npy", finalMatBOW)
    saveVocab("vocab.npy", vocab)
    # bag-of-words after stopword removal (separate, smaller vocabulary)
    sentencesClassWOSW = removeSWFromClass(sentencesClass)
    vocabWOSW = uniqueWordsCount(sentencesClassWOSW)
    XBOWWOSW, yBOWWOSW = defineBOWM(sentencesClassWOSW, vocabWOSW)
    finalMatBOWWOSW = finalFeaturesLabel(XBOWWOSW, yBOWWOSW)
    saveBOW("bowsw.npy", finalMatBOWWOSW)
    saveVocab("vocabsw.npy", vocabWOSW)
    # bag-of-words augmented with POS-tag presence features
    XBOWPOS, yBOWPOS = defineBOWMPOS(originalsentencesClass, sentencesClass, vocab)
    finalMatBOWPOS = finalFeaturesLabel(XBOWPOS, yBOWPOS)
    saveBOW("bowpos.npy", finalMatBOWPOS)
    # same, but on the stopword-removed sentences/vocabulary
    XBOWPOSSW, yBOWPOSSW = defineBOWMPOS(originalsentencesClass, sentencesClassWOSW, vocabWOSW)
    finalMatBOWPOSSW = finalFeaturesLabel(XBOWPOSSW, yBOWPOSSW)
    saveBOW("bowpossw.npy", finalMatBOWPOSSW)
In [20]:
# Run the full feature-extraction pipeline only when executed as a script.
if __name__=="__main__":
    main()