In [1]:
import numpy as np
import csv
import pandas as pd
from sklearn import svm
from nltk.corpus import stopwords
import re
import nltk
from nltk.tokenize import *
from nltk.data import load

Reading Files Line by Line


In [2]:
def sentenceExtractionForTraining(dirName, fileName, classes):
    """Read one sentence file per class and return the per-class sentence lists.

    Parameters:
        dirName: directory prefix prepended to each file name.
        fileName: list of file names, aligned with `classes` by position.
        classes: class labels; only its length is used here.
    Returns:
        list of sentence lists, one per class, in `classes` order.
    """
    return [readFile(dirName + fileName[idx]) for idx in range(len(classes))]

In [3]:
def readFile(fileName):
    """Read a text file and return its non-empty lines, lowercased and stripped.

    Lines containing the marker "TIMEX3" are skipped (dataset annotation lines).

    Parameters:
        fileName: path of the file to read.
    Returns:
        list of lowercased, whitespace-stripped lines.
    """
    sentences = []
    # "r" instead of "r+": the file is only read, so write permission is not
    # required; `with` guarantees the handle is closed even on an exception
    # (the original leaked it if an error occurred before f.close()).
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            if "TIMEX3" in line:
                continue
            # line is already stripped, so the original's trailing .rstrip()
            # was a no-op and is dropped.
            sentences.append(line.lower())
    return sentences

Removing non-word characters


In [4]:
def removeFancyChars(sentences):
    """Strip punctuation and underscores from each sentence, in place.

    Each element is rewritten with every run of non-word, non-space
    characters (plus underscores) deleted. The input list is mutated
    and also returned for convenience.
    """
    for idx, sentence in enumerate(sentences):
        sentences[idx] = re.sub(r'([^\s\w]|_)+', '', sentence)
    return sentences

In [5]:
def removeFC(sentencesClass):
    """Apply removeFancyChars to every class's sentence list, in place.

    Mutates and returns the same outer list.
    """
    for idx, sentences in enumerate(sentencesClass):
        sentencesClass[idx] = removeFancyChars(sentences)
    return sentencesClass

Parts of Speech


In [6]:
def POSForSentence(sentence):
    """Return the part-of-speech tag sequence for a sentence.

    Tokenizes with nltk's word_tokenize, tags with nltk.pos_tag, and keeps
    only the tags (the tokens themselves are discarded).
    """
    tagged = nltk.pos_tag(word_tokenize(sentence))
    return [tag for (_, tag) in tagged]

In [7]:
def getUniquePOS():
    """Return the Penn Treebank tagset size and the list of tag names.

    Returns:
        (count, tags) where tags is a list of POS tag strings.

    Bug fix: the original returned `tagdict.keys()`, a dict view; the caller
    defineBOWMPOS invokes `.index()` on it, which does not exist on dict
    views in Python 3. Returning a list is backward-compatible (iteration
    and len() still work) and supports .index().
    """
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    tags = list(tagdict.keys())
    return len(tags), tags

Removing stop words


In [8]:
def removeSWFromSent(sentence):
    """Remove English stop words from a sentence string.

    Parameters:
        sentence: whitespace-separated (already lowercased) sentence.
    Returns:
        the sentence with stop words removed; the empty string if
        every word was a stop word.

    Performance fix: the original called stopwords.words('english') once
    per word (re-building the list and doing O(m) membership each time);
    the stop-word set is now built once per call.
    """
    stopset = set(stopwords.words('english'))
    kept = [w for w in sentence.split() if w not in stopset]
    if len(kept) == 0:
        kept = [""]
    return convertlistToString(kept)

In [9]:
def removeSWFromPar(sentences):
    """Apply stop-word removal to every sentence in a list.

    Returns a new list; the input is left untouched.
    """
    return [removeSWFromSent(sentence) for sentence in sentences]

In [10]:
def removeSWFromClass(sentencesClass):
    """Apply stop-word removal to every class's sentence list.

    Returns a new outer list of new per-class lists.
    """
    return [removeSWFromPar(sentences) for sentences in sentencesClass]

In [11]:
def convertlistToString(sentence):
    """Join a list of word tokens into one space-separated string."""
    return " ".join(sentence)

Count unique words


In [12]:
def uniqueWordsCount(sentencesClass):
    """Collect the vocabulary across all classes' sentences.

    Splits every sentence on whitespace and returns the unique words as a
    list (order unspecified, since it is derived from a set).
    """
    vocabulary = set()
    for sentences in sentencesClass:
        for sentence in sentences:
            vocabulary.update(sentence.split())
    return list(vocabulary)

Bag of words


In [13]:
def totalSentences(sentencesClass):
    """Return the total number of sentences across all classes."""
    return sum(len(sentences) for sentences in sentencesClass)

In [14]:
def defineBOWM(sentencesClass, vocab):
    """Build a binary bag-of-words matrix and 1-based class labels.

    Parameters:
        sentencesClass: list (per class) of lists of sentence strings.
        vocab: list of vocabulary words; column j corresponds to vocab[j].
    Returns:
        (mat, labels): mat is (n_sentences, len(vocab)) with 1.0 where the
        word occurs in the sentence; labels[k] is the 1-based class index
        of sentence k. Sentences are ordered class by class.

    Performance fix: the original used vocab.index(w) — an O(V) scan per
    word — making the build quadratic in vocabulary size; a dict lookup
    table is built once instead. (An unknown word now raises KeyError
    rather than ValueError; vocab is built from these same sentences, so
    neither occurs in practice.)
    """
    # word -> column index, computed once
    wordIndex = {w: j for j, w in enumerate(vocab)}
    n = sum(len(sentences) for sentences in sentencesClass)
    labels = np.zeros(n)
    mat = np.zeros((n, len(vocab)))
    k = 0
    for i in range(len(sentencesClass)):
        for sentence in sentencesClass[i]:
            for w in sentence.split():
                mat[k, wordIndex[w]] = 1.0
            labels[k] = i + 1  # classes are labeled 1..len(sentencesClass)
            k += 1
    return mat, labels

In [15]:
def defineBOWMPOS(originalSentencesClass, sentencesClass, vocab):
    """Build bag-of-words features augmented with POS-tag indicator columns.

    Parameters:
        originalSentencesClass: per-class sentence lists used for POS
            tagging (must align one-to-one with sentencesClass).
        sentencesClass: per-class sentence lists used for the BOW columns.
        vocab: vocabulary list for the BOW columns.
    Returns:
        (mat, labels): mat is (n, len(vocab) + n_tags); the first
        len(vocab) columns are the BOW features, the remaining columns are
        1.0 for each POS tag present in the sentence. labels come from
        defineBOWM.

    Bug fix: the original called POSList.index(p) on the value returned by
    getUniquePOS(); if that is a dict view (Python 3) it has no .index().
    The tags are materialized into a list and looked up via a dict, which
    is also O(1) per tag instead of O(n_tags).
    """
    vocabSize = len(vocab)
    n = totalSentences(sentencesClass)
    sizePOS, POSList = getUniquePOS()
    # Materialize defensively: works whether POSList is a list or a view.
    posIndex = {tag: idx for idx, tag in enumerate(list(POSList))}
    mat = np.zeros((n, vocabSize + sizePOS))
    matFromBOWM, labels = defineBOWM(sentencesClass, vocab)
    # Copy the BOW block in one vectorized slice assignment
    # (the original copied row by row in a Python loop).
    mat[:, :vocabSize] = matFromBOWM
    k = 0
    for i in range(len(originalSentencesClass)):
        for sentence in originalSentencesClass[i]:
            for p in POSForSentence(sentence):
                mat[k, vocabSize + posIndex[p]] = 1.0
            k += 1
    return mat, labels

Save features


In [16]:
def finalFeaturesLabel(X, y):
    """Prepend the label vector as column 0 of the feature matrix.

    Parameters:
        X: (n, d) feature matrix.
        y: length-n label vector.
    Returns:
        (n, d+1) float array whose first column is y and remaining
        columns are X.
    """
    n, d = X.shape
    finalMat = np.zeros((n, d + 1))
    finalMat[:, 0] = y
    finalMat[:, 1:] = X
    return finalMat

In [17]:
def saveBOW(fileName, finalMat):
    """Persist a feature/label matrix to disk in NumPy .npy format."""
    np.save(fileName, finalMat)

In [18]:
def saveVocab(fileName, vocab):
    """Persist the vocabulary list to disk in NumPy .npy format."""
    np.save(fileName, vocab)

Main


In [19]:
def main():
    """Build and save four feature sets for the email-classification dataset:
    plain BOW, BOW without stop words, and each of those augmented with
    POS-tag indicator features."""
    dirName = "Email-classification_dataset/"
    classes = [1,2,3,4,5]
    # One training file per class, aligned with `classes` by position.
    fileName = ["RD-positive-800.txt", "meetings-positive-800.txt", "negative-800.txt", "fyi-positive-800.txt", "tp-positive-500.txt",]
    # NOTE(review): this flag is never read — both the with- and
    # without-stop-word variants are always computed below.
    removeStopWords = True
    
    originalsentencesClass = sentenceExtractionForTraining(dirName, fileName, classes)
    
    # NOTE(review): removeFC mutates its argument in place and returns the
    # same list, so sentencesClass aliases originalsentencesClass — the
    # "original" sentences passed to defineBOWMPOS below are already the
    # cleaned ones. Confirm this is intended.
    sentencesClass = removeFC(originalsentencesClass)
    # Variant 1: plain bag-of-words.
    vocab = uniqueWordsCount(sentencesClass)
    XBOW, yBOW = defineBOWM(sentencesClass, vocab)
    finalMatBOW = finalFeaturesLabel(XBOW, yBOW)
    saveBOW("bow.npy", finalMatBOW)
    saveVocab("vocab.npy", vocab)
    
    # Variant 2: bag-of-words with stop words removed.
    sentencesClassWOSW = removeSWFromClass(sentencesClass)
    vocabWOSW = uniqueWordsCount(sentencesClassWOSW)
    XBOWWOSW, yBOWWOSW = defineBOWM(sentencesClassWOSW, vocabWOSW)
    finalMatBOWWOSW = finalFeaturesLabel(XBOWWOSW, yBOWWOSW)
    saveBOW("bowsw.npy", finalMatBOWWOSW)
    saveVocab("vocabsw.npy", vocabWOSW)
    
    # Variant 3: BOW + POS-tag features.
    XBOWPOS, yBOWPOS = defineBOWMPOS(originalsentencesClass, sentencesClass, vocab)
    finalMatBOWPOS = finalFeaturesLabel(XBOWPOS, yBOWPOS)
    saveBOW("bowpos.npy", finalMatBOWPOS)
    
    # Variant 4: stop-word-removed BOW + POS-tag features.
    XBOWPOSSW, yBOWPOSSW = defineBOWMPOS(originalsentencesClass, sentencesClassWOSW, vocabWOSW)
    finalMatBOWPOSSW = finalFeaturesLabel(XBOWPOSSW, yBOWPOSSW)
    saveBOW("bowpossw.npy", finalMatBOWPOSSW)

In [20]:
# Run the full feature-extraction pipeline when executed as a script.
if __name__=="__main__":
    main()