In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.preprocessing import scale
from nltk.corpus import stopwords
import nltk
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf
In [2]:
# Training corpus: one row per announcement email (title, body columns).
df = pd.read_csv('./data/announcements.csv')
In [3]:
# DataFrame.icol(i) was removed from pandas; .iloc[:, i] is the replacement.
# Series.reshape was removed as well — reshape the underlying numpy array.
titles = df.iloc[:, 0]
body = df.iloc[:, 1]
# 4167 is total number of the emails
N = 4167
body = body.values.reshape(N, 1)
titles = titles.values.reshape(N, 1)
In [4]:
def sentence_to_words( sentence ):
    """Convert one raw text string into a list of lowercase, stopword-free words.

    Non-letter characters are replaced with spaces before splitting, so
    numbers and punctuation are discarded entirely.
    """
    # Remove non-letters, then lowercase and split on whitespace.
    letters_only = re.sub("[^a-zA-Z]", " ", sentence)
    words = letters_only.lower().split()
    # Build the stopword set once and cache it on the function: the original
    # rebuilt it from the NLTK corpus on every call, which dominates runtime
    # when tokenizing thousands of emails. Membership tests on a set are O(1).
    if not hasattr(sentence_to_words, "_stops"):
        sentence_to_words._stops = set(stopwords.words("english"))
    stops = sentence_to_words._stops
    # Drop stop words and return the remaining tokens as a list of strings.
    return [w for w in words if w not in stops]
In [5]:
def email_to_words():
    """Tokenize every training email (title plus body) into one flat word list.

    Reads module-level `titles`, `body` (N x 1 arrays of raw strings) and `N`.
    Bodies that are not strings (e.g. NaN for missing text) are skipped;
    titles are assumed always present, matching the original behavior.
    """
    all_words = []  # renamed from `list`, which shadowed the builtin
    for i in range(N):
        all_words.extend(sentence_to_words(titles[i][0]))
        if isinstance(body[i][0], str):
            all_words.extend(sentence_to_words(body[i][0]))
    return all_words
In [6]:
# NOTE(review): `list` shadows the Python builtin here and in the cells
# below; renaming it everywhere (e.g. to `corpus_words`) would be safer.
list = email_to_words()
# Total token count across all emails.
len(list)
Out[6]:
In [7]:
words = list  # alias the flat token list built in the previous cell
count = [['UNK', -1]]  # NOTE(review): dead assignment — build_dataset creates its own `count`
vocabulary_size = 3000  # keep the 3000 most frequent words; everything else maps to UNK
In [8]:
def build_dataset(words, vocab_size=None):
    """Encode `words` as integer ids over a fixed-size vocabulary.

    Args:
        words: iterable of word strings (the whole corpus, in order).
        vocab_size: number of vocabulary entries including 'UNK'. Defaults
            to the module-level `vocabulary_size` (3000), so existing
            callers are unaffected.

    Returns:
        data: list of ids, one per input word (0 means 'UNK').
        count: [['UNK', n_unk]] followed by (word, freq) pairs, most
            frequent first.
        dictionary: word -> id.
        reverse_dictionary: id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size  # fall back to the notebook-level constant
    count = [['UNK', -1]]
    # Keep the (vocab_size - 1) most frequent words; the rest become UNK.
    count.extend(collections.Counter(words).most_common(vocab_size - 1))
    # Ids are assigned by frequency rank, with UNK pinned to id 0.
    dictionary = {word: idx for idx, (word, _) in enumerate(count)}
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count  # patch in the true UNK frequency
    reverse_dictionary = {idx: word for word, idx in dictionary.items()}
    return data, count, dictionary, reverse_dictionary
In [9]:
# Map the corpus to integer ids and build the vocabulary lookup tables.
data, count, dictionary, reverse_dictionary = build_dataset(words)
In [10]:
# NOTE(review): the same object is still referenced by the shadowed name
# `list` (cell 6), so this `del` does not actually free the memory.
del words # Hint to reduce memory.
In [11]:
# Sanity check: UNK total followed by the four most frequent words.
print('Most common words (+UNK)', count[:5])
In [12]:
# Decode the first 10 ids back to words to verify the mapping round-trips.
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
In [21]:
# Test set to embed with the trained word vectors.
df_testdata = pd.read_csv('./data/test_try1.csv')
In [22]:
# Peek at the test set's columns before extracting title/body below.
df_testdata.head()
Out[22]:
In [24]:
# DataFrame.icol(i) was removed from pandas; .iloc[:, i] is the replacement.
# NOTE(review): title/body are columns 1 and 2 here (vs 0 and 1 in the
# training file) — presumably column 0 is an id/index; confirm with the CSV.
titles = df_testdata.iloc[:, 1]
body = df_testdata.iloc[:, 2]
In [62]:
# Row count of the test set; it drives the value of N set below.
body.shape
Out[62]:
In [63]:
# NOTE(review): presumably the row count of test_try1.csv (see body.shape
# above) — confirm. This overwrites the training-set N (4167) from cell 3.
N = 7313
In [64]:
# Word embeddings exported by the training run.
df_embed = pd.read_csv('./data/final_embedding.csv')
In [65]:
# Preview the exported embedding table.
df_embed.head()
Out[65]:
In [66]:
import pandas as pd  # NOTE(review): redundant — pandas is imported at the top of the notebook
# Re-read the embedding CSV with header=None so the header row and the index
# column come through as data; both are sliced away two cells below.
df=pd.read_csv('./data/final_embedding.csv', sep=',',header=None)
final_embeddings = df.values
In [67]:
# Inspect the raw shape before trimming the header row / index column below.
final_embeddings.shape
Out[67]:
In [68]:
# Drop row 0 (the CSV header read as data) and column 0 (the index column),
# keeping the 128 embedding dimensions per word.
# NOTE(review): this keeps up to 3001 rows (1..3001) but vocabulary_size is
# 3000 — confirm the intended row count / off-by-one.
final_embeddings = final_embeddings[1:3002,1:129]
In [100]:
def findembeding(word):
    """Return the embedding row for `word`; out-of-vocabulary words get row 0 (UNK)."""
    # `dictionary` maps 'UNK' to index 0, so .get(word, 0) reproduces both
    # branches of the original if/else in one lookup.
    return final_embeddings[dictionary.get(word, 0)]
In [106]:
#represent an email with a 128 dimension vector
def meanonemail(i):
    """Return the 128-dim mean of the embedding vectors of all words in email i.

    Reads module-level `titles` and `body` (raw text per email) and the
    `findembeding` lookup. Entries that are not strings (e.g. NaN for a
    missing title or body) are skipped.
    """
    words_in_email = []
    if isinstance(titles[i], str):
        words_in_email.extend(sentence_to_words(titles[i]))
    if isinstance(body[i], str):
        words_in_email.extend(sentence_to_words(body[i]))
    if not words_in_email:
        # The original averaged an empty array here (NaN + RuntimeWarning);
        # return a zero vector so the exported CSV stays numeric.
        return np.zeros(128)
    # Collect all vectors first and average once. The original grew a numpy
    # array with np.append inside the loop (O(n^2) copies) and seeded it with
    # two dummy rows of ones that were stripped afterwards.
    vectors = [findembeding(word) for word in words_in_email]
    return np.mean(np.asarray(vectors, dtype=float), axis=0)
In [111]:
# The vectors produced here are persisted to CSV in a later cell.
def representeveryemail():
    """Build the mean-embedding vector for each of the N test emails.

    Returns a list of N numpy arrays (one 128-dim vector per email),
    in email order.
    """
    return [meanonemail(i) for i in range(N)]
In [112]:
# Embed all N test emails; loops over every email, so this can take a while.
result = representeveryemail()
In [115]:
import csv
# newline='' is required by the csv module when writing: without it the
# text-mode file translates line endings itself and every row is followed
# by a blank line on Windows.
with open('./data/testemailvector.csv', 'w', newline='') as mycsvfile:
    thedatawriter = csv.writer(mycsvfile)
    # writerows replaces the manual row-by-row loop over `result`.
    thedatawriter.writerows(result)
In [ ]: