In [1]:
import numpy as np
import csv
import pandas as pd
import copy
from sklearn import svm
from nltk.corpus import stopwords
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.data import load
In [2]:
def sentenceExtractionForTraining(dirName, fileName, classes):
    # Read one sentence file per class and collect them as a list of lists.
    sentencesClass = []
    for i in range(len(classes)):
        sentences = readFile(dirName + fileName[i])
        sentencesClass.append(sentences)
    return sentencesClass
In [3]:
def readFile(fileName):
    # Read non-empty lines, skip TIMEX3 annotation lines, and lowercase the rest.
    sentences = []
    with open(fileName, "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            if "TIMEX3" in line:
                continue
            sentences.append(line.lower())
    return sentences
In [4]:
def createVocab(sentencesClass):
    # Build the set of all whitespace-delimited tokens across every class.
    vocab = set()
    for sentences in sentencesClass:
        for sentence in sentences:
            for w in sentence.split():
                vocab.add(w)
    return vocab
In [5]:
def removeFancyChars(sentences):
    # Strip everything that is not whitespace or alphanumeric (underscores included).
    for i in range(len(sentences)):
        sentences[i] = re.sub(r'([^\s\w]|_)+', '', sentences[i])
    return sentences
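A quick sanity check on a hypothetical string (not from the dataset): the regex keeps letters, digits, and whitespace, and drops punctuation and underscores.

print(removeFancyChars(["re: q3 budget -- meeting @ 2pm?!"]))
# ['re q3 budget  meeting  2pm']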
In [6]:
def removeFC(sentencesClass):
    # Clean every class's sentences; note this mutates the inner lists in place.
    for i in range(len(sentencesClass)):
        sentencesClass[i] = removeFancyChars(sentencesClass[i])
    return sentencesClass
In [7]:
def load_bin_vec(fname, vocab):
    """
    Load 300-dimensional word vectors from the Google (Mikolov) word2vec binary,
    keeping only words that appear in vocab.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in range(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8', errors='ignore')
                    break
                if ch != b'\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs
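If gensim is available (an assumption; it is not imported or used elsewhere in this notebook), the same binary can be loaded without hand-rolling the parser:

from gensim.models import KeyedVectors

def load_bin_vec_gensim(fname, vocab):
    # gensim parses the whole binary; we then keep only in-vocabulary words.
    kv = KeyedVectors.load_word2vec_format(fname, binary=True)
    return {w: kv[w] for w in vocab if w in kv}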
In [8]:
def add_unknown_words(word_vecs, vocab, k=300):
    # Words missing from the pre-trained model get a random vector in [-0.25, 0.25].
    for word in vocab:
        if word not in word_vecs:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
In [9]:
def initializeWordVecs(sentencesClass):
    # Load pre-trained vectors for the corpus vocabulary, then fill in the gaps.
    vocab = createVocab(sentencesClass)
    w2vFile = "GoogleNews-vectors-negative300.bin"
    w2v = load_bin_vec(w2vFile, vocab)
    add_unknown_words(w2v, vocab)
    return w2v
In [10]:
def POSForSentence(sentence):
    # Tag the sentence and keep only the POS tags, discarding the tokens.
    text = word_tokenize(sentence)
    posSentence = nltk.pos_tag(text)
    posSentence = [tag for word, tag in posSentence]
    return posSentence
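A hypothetical example (requires the punkt and averaged_perceptron_tagger NLTK data packages):

print(POSForSentence("please review the attached report"))
# a list of Penn Treebank tags, e.g. ['VB', 'VB', 'DT', 'VBN', 'NN']
# (exact tags depend on the tagger version)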
In [11]:
def getUniquePOS():
    # The Penn Treebank tagset that ships with NLTK, as a size plus a tag list.
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    return len(tagdict), list(tagdict.keys())
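In current NLTK distributions this tagset has 45 tags (it requires the tagsets NLTK data package), so the POS features added below form a 45-dimensional binary block:

sizePOS, POSList = getUniquePOS()
print(sizePOS)      # 45 in current NLTK distributions
print(POSList[:5])  # a few tag names; the order is arbitrary but stable per load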
In [12]:
def totalSentences(sentencesClass):
    # Total number of sentences across all classes.
    return sum(len(sentences) for sentences in sentencesClass)
In [13]:
def defineW2V(sentencesClass, w2v, dim=300):
    # Represent each sentence as the mean of its word vectors; label = class index + 1.
    n = totalSentences(sentencesClass)
    mat = np.zeros((n, dim))
    labels = np.zeros(n)
    k = 0
    for i in range(len(sentencesClass)):
        for j in range(len(sentencesClass[i])):
            words = sentencesClass[i][j].split()
            d = np.zeros(dim)
            ind = 0.0
            for w in words:
                if w not in w2v:
                    w2v[w] = np.random.uniform(-0.25, 0.25, dim)
                d += w2v[w]
                ind += 1.0
            if ind > 0:  # guard against sentences emptied by cleaning
                d /= ind
            mat[k] = d
            labels[k] = i + 1
            k += 1
    return mat, labels
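A toy check with hypothetical 3-dimensional vectors: each row of mat is the mean of the sentence's word vectors, and labels are 1-indexed class ids.

toyW2V = {"hello": np.ones(3), "world": np.zeros(3)}
mat, labels = defineW2V([["hello world"]], toyW2V, dim=3)
print(mat)     # [[0.5 0.5 0.5]]
print(labels)  # [1.]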
In [14]:
def defineW2VPOS(originalSentencesClass, sentencesClass, w2v, dim=300):
    # Averaged word vectors, extended with one binary feature per POS tag
    # observed in the original (uncleaned) sentence.
    n = totalSentences(sentencesClass)
    sizePOS, POSList = getUniquePOS()
    mat = np.zeros((n, dim + sizePOS))
    matFromW2V, labels = defineW2V(sentencesClass, w2v)
    mat[:, :dim] = matFromW2V
    k = 0
    for i in range(len(originalSentencesClass)):
        for j in range(len(originalSentencesClass[i])):
            pos = POSForSentence(originalSentencesClass[i][j])
            for p in pos:
                if p in POSList:  # defensively skip tags absent from the tagset list
                    mat[k, dim + POSList.index(p)] = 1.0
            k += 1
    return mat, labels
In [15]:
def savew2vToFile(w2v):
    fileName = "word2VecDict.npy"
    np.save(fileName, w2v)
In [16]:
def finalFeaturesLabel(X, y):
    # Prepend the label as column 0 so each row is [label, features...].
    return np.column_stack((y, X))
In [17]:
def saveW2V(fileName, finalMat):
    np.save(fileName, finalMat)
In [18]:
def loadW2V():
    # Reload the pickled dict and print a few entries as a sanity check.
    w2v = np.load('word2VecDict.npy', allow_pickle=True).item()
    for i, (key, value) in enumerate(w2v.items()):
        if i > 10:
            break
        print(key, value)
In [19]:
def main():
    dirName = "Email-classification_dataset/"
    classes = [1, 2, 3, 4, 5]
    fileName = ["RD-positive-800.txt", "meetings-positive-800.txt", "negative-800.txt",
                "fyi-positive-800.txt", "tp-positive-500.txt"]
    originalSentencesClass = sentenceExtractionForTraining(dirName, fileName, classes)
    # Clean a deep copy so the original (punctuated) sentences survive for POS tagging.
    sentencesClass = removeFC(copy.deepcopy(originalSentencesClass))
    w2v = initializeWordVecs(sentencesClass)
    savew2vToFile(w2v)
    XW2V, yW2V = defineW2V(sentencesClass, w2v)
    XW2VPOS, yW2VPOS = defineW2VPOS(originalSentencesClass, sentencesClass, w2v)
    finalMatW2V = finalFeaturesLabel(XW2V, yW2V)
    finalMatW2VPOS = finalFeaturesLabel(XW2VPOS, yW2VPOS)
    saveW2V("w2v.npy", finalMatW2V)
    saveW2V("w2vpos.npy", finalMatW2VPOS)
In [20]:
if __name__ == "__main__":
    main()
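The sklearn svm import at the top is otherwise unused here; a minimal sketch of the presumable next step, training a classifier on the saved matrix (a hypothetical follow-on, not part of the original notebook):

# Hypothetical follow-on: load the saved [label, features...] matrix and fit a linear SVM.
data = np.load("w2v.npy")
y, X = data[:, 0], data[:, 1:]
clf = svm.SVC(kernel="linear")
clf.fit(X, y)
print(clf.score(X, y))  # training accuracy only; use a held-out split in practice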