In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.preprocessing import scale
from nltk.corpus import stopwords
import nltk

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  
import tensorflow as tf

In [2]:
df = pd.read_csv('./data/announcements.csv')

In [3]:
titles = df.icol(0)
body = df.icol(1)
# 4167 is total number of the emails
N = 4167  
body = body.reshape(N,1)
titles = titles.reshape(N,1)


/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  if __name__ == '__main__':
/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  from ipykernel import kernelapp as app

In [4]:
def sentence_to_words( sentence ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", sentence) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    #return( " ".join( meaningful_words ))  
    return meaningful_words

In [5]:
def email_to_words():
    list = []
    for i in range(N):
        array = sentence_to_words(titles[i][0])
        list.extend(array)
        if(isinstance(body[i][0], str)):
            array = sentence_to_words(body[i][0])
            list.extend(array)
    return list

In [6]:
list = email_to_words()
len(list)


Out[6]:
159378

In [7]:
words = list
count = [['UNK', -1]]
vocabulary_size = 3000

In [8]:
def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

In [9]:
data, count, dictionary, reverse_dictionary = build_dataset(words)

In [10]:
del words  # Hint to reduce memory.

In [11]:
print('Most common words (+UNK)', count[:5])


Most common words (+UNK) [['UNK', 20378], ('rose', 3020), ('hulman', 2705), ('edu', 2386), ('please', 1651)]

In [12]:
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])


Sample data [17, 11, 34, 39, 42, 1, 598, 27, 63, 162] ['need', 'ride', 'indy', 'airport', 'back', 'rose', 'happy', 'new', 'year', 'everyone']

preprocessing test emails


In [21]:
df_testdata = pd.read_csv('./data/test_try1.csv')

In [22]:
df_testdata.head()


Out[22]:
Created Title Body Item Type Path
0 9/27/10 15:30 Lost T-Shirt On Monday, September 20, I've left my black t-... Item announcements/Lists/Personal
1 3/7/11 13:24 LOST POWER CORD \nPower cord has been lost either in D114 or i... Item announcements/Lists/Personal
2 4/15/11 13:31 SURVEY OH MY GOSH IT"S A SURVEY ABOUT NAPKIN D... HEY! Remember when you were in that design cla... Item announcements/Lists/Personal
3 8/28/11 23:25 Physics Textbook for sale. Physics textbook, purchased last year. Physic... Item announcements/Lists/Personal
4 9/11/11 18:54 Computer for Sale I am selling a desktop computer for $250. It i... Item announcements/Lists/Personal

In [24]:
titles = df_testdata.icol(1)
body = df_testdata.icol(2)


/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  if __name__ == '__main__':
/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  from ipykernel import kernelapp as app

In [62]:
body.shape


Out[62]:
(7313,)

In [63]:
N = 7313

In [64]:
df_embed = pd.read_csv('./data/final_embedding.csv')

In [65]:
df_embed.head()


Out[65]:
Unnamed: 0 0 1 2 3 4 5 6 7 8 ... 118 119 120 121 122 123 124 125 126 127
0 0 0.090831 0.071408 -0.015945 0.254145 -0.010695 0.229046 -0.033513 -0.090405 0.213907 ... 0.055312 0.162304 0.120240 -0.016991 -0.025453 -0.012157 -0.108012 0.097278 0.051454 -0.041614
1 1 -0.041176 -0.018642 -0.163324 0.057316 0.017544 -0.105726 -0.021705 -0.192541 0.136645 ... 0.097817 0.049488 -0.092370 -0.126346 0.040714 0.093636 -0.065419 0.054761 0.132670 -0.061487
2 2 0.026670 0.037819 -0.117624 -0.033714 0.155645 -0.018152 -0.003465 -0.207928 0.109318 ... -0.075659 0.045242 0.090872 0.075135 -0.198950 0.047245 0.030635 0.074394 0.116719 0.022163
3 3 0.047291 -0.061027 0.023446 0.117821 0.014034 -0.142155 0.020697 -0.102846 0.019864 ... -0.042347 0.088607 0.179124 -0.007854 0.028658 0.107261 -0.061317 0.020750 0.077657 0.070614
4 4 0.082039 0.049939 -0.015991 0.005912 -0.079839 0.034675 -0.076623 0.105315 0.048360 ... 0.018757 0.143541 0.017612 0.002007 0.110670 0.058479 -0.092609 -0.117860 0.212146 -0.003607

5 rows × 129 columns


In [66]:
import pandas as pd
df=pd.read_csv('./data/final_embedding.csv', sep=',',header=None)
final_embeddings = df.values

In [67]:
final_embeddings.shape


Out[67]:
(3001, 129)

In [68]:
final_embeddings = final_embeddings[1:3002,1:129]

In [100]:
def findembeding(word):
    if word in dictionary:
        value = dictionary[word]
        return final_embeddings[value]
    else:
        return final_embeddings[0]

In [106]:
#represent an email with a 128 dimension vector
def meanonemail(i):
    onelist = []
    result = np.ones(128)
    result = np.append([result], [result], axis=0)
    
    if(isinstance(titles[i], str)):
        array = sentence_to_words(titles[i])
        onelist.extend(array)
    # array = sentence_to_words(titles[i])
    # onelist.extend(array)
    if(isinstance(body[i], str)):
        array = sentence_to_words(body[i])
        onelist.extend(array)
    N = len(onelist)
    
    for count in range(N):
        word = onelist[count]
        embed = findembeding(word)
        
        result = np.append(result, [embed], axis=0)
    
    row = result.shape[0]
    result = result[2:row]
    
    return np.mean(result, axis=0)

In [111]:
#this result should be saved
def representeveryemail():
    result = []
    for i in range(N):
        #print(i)
        result.append(meanonemail(i))
    return result

In [112]:
result = representeveryemail()


/opt/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)

In [115]:
import csv
with open('./data/testemailvector.csv', 'w') as mycsvfile:
    thedatawriter = csv.writer(mycsvfile)
    for row in result:
        thedatawriter.writerow(row)

In [ ]: