In [1]:
import numpy as np
import csv
import copy
import pandas as pd
from sklearn import svm
from nltk.corpus import stopwords
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.data import load

Reading Files Line by Line


In [2]:
def sentenceExtractionForTraining(dirName, fileName, classes):
    # Read one file per class; sentencesClass[i] holds the sentences for classes[i].
    sentencesClass = []
    for i in range(len(classes)):
        sentences = readFile(dirName + fileName[i])
        sentencesClass.append(sentences)
    return sentencesClass

In [3]:
def readFile(fileName):
    sentences = []
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            if "TIMEX3" in line:   # skip lines carrying TIMEX3 markup
                continue
            sentences.append(line.lower())
    return sentences
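
A quick sanity check (the file name and contents below are made up for illustration): blank lines and lines containing TIMEX3 markup are dropped, and everything else is lower-cased.

In [ ]:
# Illustrative only: write a throwaway file, then read it back.
with open("demo.txt", "w") as f:
    f.write("Hello World\n\n<TIMEX3>2001</TIMEX3>\nSecond LINE\n")
print(readFile("demo.txt"))   # ['hello world', 'second line']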

In [4]:
def createVocab(sentencesClass):
    # Vocabulary = set of all whitespace-separated tokens across all classes.
    vocab = set()
    for sentences in sentencesClass:
        for sentence in sentences:
            vocab.update(sentence.split())
    return vocab
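
For example, on a toy input with two classes of one sentence each:

In [ ]:
print(createVocab([["the cat sat"], ["the dog ran"]]))
# e.g. {'sat', 'dog', 'the', 'ran', 'cat'} -- set order is arbitrary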

Removing fancy characters


In [5]:
def removeFancyChars(sentences):
    # Keep only whitespace and word characters; underscores are stripped as well.
    for i in range(len(sentences)):
        sentences[i] = re.sub(r'([^\s\w]|_)+', '', sentences[i])
    return sentences

In [6]:
def removeFC(sentencesClass):
    for i in range(0, len(sentencesClass)):
        sentencesClass[i] = removeFancyChars(sentencesClass[i])
    return sentencesClass
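
A quick check of the regex on made-up strings:

In [ ]:
print(removeFancyChars(["re: meeting @ 3pm!!", "f.y.i_update"]))
# ['re meeting  3pm', 'fyiupdate']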

word2vec


In [7]:
def load_bin_vec(fname, vocab):
    """
    Loads 300-dim word vecs from the Google (Mikolov) word2vec binary file.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in range(vocab_size):
            # Each entry is "<word> <layer1_size float32s>"; read the word
            # byte by byte until the separating space.
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8', errors='replace')
                    break
                if ch != b'\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs
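
If gensim happens to be installed, the same binary file can be parsed without the manual byte handling. This is just a sketch of an alternative, not part of the original pipeline; it assumes gensim is available.

In [ ]:
def load_bin_vec_gensim(fname, vocab):
    # Sketch, assuming gensim is installed; equivalent to load_bin_vec above.
    from gensim.models import KeyedVectors
    kv = KeyedVectors.load_word2vec_format(fname, binary=True)
    return {w: kv[w] for w in vocab if w in kv}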

In [8]:
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    # Words missing from the pre-trained vectors get a random vector;
    # +/-0.25 keeps the variance roughly in line with the pre-trained ones.
    for word in vocab:
        if word not in word_vecs:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)

In [9]:
def initializeWordVecs(sentencesClass):
    vocab = createVocab(sentencesClass)
    w2vFile = "GoogleNews-vectors-negative300.bin"
    
    w2v = load_bin_vec(w2vFile, vocab)
    add_unknown_words(w2v, vocab)
        
    return w2v

For Parts Of Speech


In [10]:
def POSForSentence(sentence):
    # Requires the NLTK 'punkt' and 'averaged_perceptron_tagger' models.
    text = word_tokenize(sentence)
    posSentence = nltk.pos_tag(text)
    posSentence = [tag for word, tag in posSentence]   # keep only the tags
    return posSentence
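
For example (after downloading the NLTK models, e.g. via nltk.download):

In [ ]:
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
print(POSForSentence("The meeting is at noon"))
# e.g. ['DT', 'NN', 'VBZ', 'IN', 'NN']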

In [11]:
def getUniquePOS():
    # The Penn Treebank tagset; requires nltk.download('tagsets').
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    return len(tagdict), list(tagdict.keys())

Building sentence features from w2v


In [12]:
def totalSentences(sentencesClass):
    return sum(len(sentences) for sentences in sentencesClass)

In [13]:
def defineW2V(sentencesClass, w2v, dim=300):
    # One row per sentence: the mean of its word vectors.
    # Labels are 1-based class indices.
    n = totalSentences(sentencesClass)
    mat = np.zeros((n, dim))
    labels = np.zeros(n)
    k = 0
    for i in range(len(sentencesClass)):
        for j in range(len(sentencesClass[i])):
            words = sentencesClass[i][j].split()
            d = np.zeros(dim)
            ind = 0.0
            for w in words:
                if w not in w2v:
                    w2v[w] = np.random.uniform(-0.25, 0.25, dim)
                d += w2v[w]
                ind += 1.0
            if ind > 0:   # avoid dividing by zero on empty sentences
                d /= ind
            mat[k] = d
            labels[k] = i + 1
            k += 1
    return mat, labels
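
A toy check with a made-up 3-dimensional "embedding":

In [ ]:
toyW2V = {"hello": np.ones(3), "world": np.zeros(3)}
Xtoy, ytoy = defineW2V([["hello world"]], toyW2V, dim=3)
print(Xtoy)   # [[0.5 0.5 0.5]] -- the average of the two word vectors
print(ytoy)   # [1.]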

In [14]:
def defineW2VPOS(originalSentencesClass, sentencesClass, w2v, dim=300):
    # Features = [averaged word vectors | binary indicators for the POS tags
    # present in the sentence]. Tagging runs on the original (uncleaned)
    # sentences so punctuation still informs the tagger.
    n = totalSentences(sentencesClass)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, dim + sizePOS))
    matFromW2V, labels = defineW2V(sentencesClass, w2v)
    mat[:, :dim] = matFromW2V

    k = 0
    for i in range(len(originalSentencesClass)):
        for j in range(len(originalSentencesClass[i])):
            pos = POSForSentence(originalSentencesClass[i][j])
            for p in pos:
                if p in POSList:   # guard against tags missing from the tagset pickle
                    mat[k, dim + POSList.index(p)] = 1.0
            k += 1

    return mat, labels

Saving to file


In [15]:
def savew2vToFile(w2v):
    # np.save pickles the dict; load it back with np.load(..., allow_pickle=True).
    fileName = "word2VecDict.npy"
    np.save(fileName, w2v)

In [16]:
def finalFeaturesLabel(X, y):
    # Prepend the label as column 0 so features and labels live in one matrix.
    n, d = X.shape
    finalMat = np.zeros((n, d + 1))
    for i in range(n):
        finalMat[i, 0] = y[i]
        finalMat[i, 1:] = X[i]
    return finalMat
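
The same thing in one line, if preferred:

In [ ]:
def finalFeaturesLabelFast(X, y):
    # Equivalent: labels in column 0, features in the remaining columns.
    return np.column_stack((y, X))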

In [17]:
def saveW2V(fileName, finalMat):
    np.save(fileName, finalMat)

Loading from file


In [18]:
def loadW2V():
    # allow_pickle is required to restore a dict saved with np.save.
    w2v = np.load('word2VecDict.npy', allow_pickle=True).item()

    # Print a small sample of the loaded vectors as a sanity check.
    for i, (key, value) in enumerate(w2v.items()):
        if i > 10:
            break
        print(key, value)

In [19]:
def main():
    dirName = "Email-classification_dataset/"
    classes = [1, 2, 3, 4, 5]
    fileName = ["RD-positive-800.txt", "meetings-positive-800.txt", "negative-800.txt", "fyi-positive-800.txt", "tp-positive-500.txt"]
    originalSentencesClass = sentenceExtractionForTraining(dirName, fileName, classes)
    # removeFancyChars mutates the lists in place, so clean a deep copy to keep
    # the original sentences intact for POS tagging.
    sentencesClass = removeFC(copy.deepcopy(originalSentencesClass))
    w2v = initializeWordVecs(sentencesClass)

    savew2vToFile(w2v)

    XW2V, yW2V = defineW2V(sentencesClass, w2v)
    XW2VPOS, yW2VPOS = defineW2VPOS(originalSentencesClass, sentencesClass, w2v)

    finalMatW2V = finalFeaturesLabel(XW2V, yW2V)
    finalMatW2VPOS = finalFeaturesLabel(XW2VPOS, yW2VPOS)

    saveW2V("w2v.npy", finalMatW2V)
    saveW2V("w2vpos.npy", finalMatW2VPOS)

In [20]:
if __name__=="__main__":
    main()