notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.preprocessing import scale
from nltk.corpus import stopwords
import nltk

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  
import tensorflow as tf



In [2]:

    
df = pd.read_csv('./data/announcements.csv')



In [3]:

    
titles = df.icol(0)
body = df.icol(1)
# 4167 is total number of the emails
N = 4167  
body = body.reshape(N,1)
titles = titles.reshape(N,1)









    



/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  if __name__ == '__main__':
/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  from ipykernel import kernelapp as app



In [4]:

    
def sentence_to_words( sentence ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", sentence) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    #return( " ".join( meaningful_words ))  
    return meaningful_words



In [5]:

    
def email_to_words():
    list = []
    for i in range(N):
        array = sentence_to_words(titles[i][0])
        list.extend(array)
        if(isinstance(body[i][0], str)):
            array = sentence_to_words(body[i][0])
            list.extend(array)
    return list



In [6]:

    
list = email_to_words()
len(list)









    Out[6]:





159378



In [7]:

    
words = list
count = [['UNK', -1]]
vocabulary_size = 3000



In [8]:

    
def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary



In [9]:

    
data, count, dictionary, reverse_dictionary = build_dataset(words)



In [10]:

    
del words  # Hint to reduce memory.



In [11]:

    
print('Most common words (+UNK)', count[:5])









    



Most common words (+UNK) [['UNK', 20378], ('rose', 3020), ('hulman', 2705), ('edu', 2386), ('please', 1651)]



In [12]:

    
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])









    



Sample data [17, 11, 34, 39, 42, 1, 598, 27, 63, 162] ['need', 'ride', 'indy', 'airport', 'back', 'rose', 'happy', 'new', 'year', 'everyone']

preprocessing test emails



In [21]:

    
df_testdata = pd.read_csv('./data/test_try1.csv')



In [22]:

    
df_testdata.head()









    Out[22]:






  
    
      
      Created
      Title
      Body
      Item Type
      Path
    
  
  
    
      0
      9/27/10 15:30
      Lost T-Shirt
      On Monday, September 20, I've left my black t-...
      Item
      announcements/Lists/Personal
    
    
      1
      3/7/11 13:24
      LOST POWER CORD
      \nPower cord has been lost either in D114 or i...
      Item
      announcements/Lists/Personal
    
    
      2
      4/15/11 13:31
      SURVEY OH MY GOSH IT"S A SURVEY ABOUT NAPKIN D...
      HEY! Remember when you were in that design cla...
      Item
      announcements/Lists/Personal
    
    
      3
      8/28/11 23:25
      Physics Textbook for sale.
      Physics textbook, purchased last year.  Physic...
      Item
      announcements/Lists/Personal
    
    
      4
      9/11/11 18:54
      Computer for Sale
      I am selling a desktop computer for $250. It i...
      Item
      announcements/Lists/Personal



In [24]:

    
titles = df_testdata.icol(1)
body = df_testdata.icol(2)









    



/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  if __name__ == '__main__':
/opt/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: icol(i) is deprecated. Please use .iloc[:,i]
  from ipykernel import kernelapp as app



In [62]:

    
body.shape









    Out[62]:





(7313,)



In [63]:

    
N = 7313



In [64]:

    
df_embed = pd.read_csv('./data/final_embedding.csv')



In [65]:

    
df_embed.head()









    Out[65]:






  
    
      
      Unnamed: 0
      0
      1
      2
      3
      4
      5
      6
      7
      8
      ...
      118
      119
      120
      121
      122
      123
      124
      125
      126
      127
    
  
  
    
      0
      0
      0.090831
      0.071408
      -0.015945
      0.254145
      -0.010695
      0.229046
      -0.033513
      -0.090405
      0.213907
      ...
      0.055312
      0.162304
      0.120240
      -0.016991
      -0.025453
      -0.012157
      -0.108012
      0.097278
      0.051454
      -0.041614
    
    
      1
      1
      -0.041176
      -0.018642
      -0.163324
      0.057316
      0.017544
      -0.105726
      -0.021705
      -0.192541
      0.136645
      ...
      0.097817
      0.049488
      -0.092370
      -0.126346
      0.040714
      0.093636
      -0.065419
      0.054761
      0.132670
      -0.061487
    
    
      2
      2
      0.026670
      0.037819
      -0.117624
      -0.033714
      0.155645
      -0.018152
      -0.003465
      -0.207928
      0.109318
      ...
      -0.075659
      0.045242
      0.090872
      0.075135
      -0.198950
      0.047245
      0.030635
      0.074394
      0.116719
      0.022163
    
    
      3
      3
      0.047291
      -0.061027
      0.023446
      0.117821
      0.014034
      -0.142155
      0.020697
      -0.102846
      0.019864
      ...
      -0.042347
      0.088607
      0.179124
      -0.007854
      0.028658
      0.107261
      -0.061317
      0.020750
      0.077657
      0.070614
    
    
      4
      4
      0.082039
      0.049939
      -0.015991
      0.005912
      -0.079839
      0.034675
      -0.076623
      0.105315
      0.048360
      ...
      0.018757
      0.143541
      0.017612
      0.002007
      0.110670
      0.058479
      -0.092609
      -0.117860
      0.212146
      -0.003607
    
  

5 rows × 129 columns



In [66]:

    
import pandas as pd
df=pd.read_csv('./data/final_embedding.csv', sep=',',header=None)
final_embeddings = df.values



In [67]:

    
final_embeddings.shape









    Out[67]:





(3001, 129)



In [68]:

    
final_embeddings = final_embeddings[1:3002,1:129]



In [100]:

    
def findembeding(word):
    if word in dictionary:
        value = dictionary[word]
        return final_embeddings[value]
    else:
        return final_embeddings[0]



In [106]:

    
#represent an email with a 128 dimension vector
def meanonemail(i):
    onelist = []
    result = np.ones(128)
    result = np.append([result], [result], axis=0)
    
    if(isinstance(titles[i], str)):
        array = sentence_to_words(titles[i])
        onelist.extend(array)
    # array = sentence_to_words(titles[i])
    # onelist.extend(array)
    if(isinstance(body[i], str)):
        array = sentence_to_words(body[i])
        onelist.extend(array)
    N = len(onelist)
    
    for count in range(N):
        word = onelist[count]
        embed = findembeding(word)
        
        result = np.append(result, [embed], axis=0)
    
    row = result.shape[0]
    result = result[2:row]
    
    return np.mean(result, axis=0)



In [111]:

    
#this result should be saved
def representeveryemail():
    result = []
    for i in range(N):
        #print(i)
        result.append(meanonemail(i))
    return result



In [112]:

    
result = representeveryemail()









    



/opt/anaconda3/lib/python3.5/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)



In [115]:

    
import csv
with open('./data/testemailvector.csv', 'w') as mycsvfile:
    thedatawriter = csv.writer(mycsvfile)
    for row in result:
        thedatawriter.writerow(row)



In [ ]:

	Created	Title	Body	Item Type	Path
0	9/27/10 15:30	Lost T-Shirt	On Monday, September 20, I've left my black t-...	Item	announcements/Lists/Personal
1	3/7/11 13:24	LOST POWER CORD	\nPower cord has been lost either in D114 or i...	Item	announcements/Lists/Personal
2	4/15/11 13:31	SURVEY OH MY GOSH IT"S A SURVEY ABOUT NAPKIN D...	HEY! Remember when you were in that design cla...	Item	announcements/Lists/Personal
3	8/28/11 23:25	Physics Textbook for sale.	Physics textbook, purchased last year. Physic...	Item	announcements/Lists/Personal
4	9/11/11 18:54	Computer for Sale	I am selling a desktop computer for $250. It i...	Item	announcements/Lists/Personal

	Unnamed: 0	0	1	2	3	4	5	6	7	8	...	118	119	120	121	122	123	124	125	126	127
0	0	0.090831	0.071408	-0.015945	0.254145	-0.010695	0.229046	-0.033513	-0.090405	0.213907	...	0.055312	0.162304	0.120240	-0.016991	-0.025453	-0.012157	-0.108012	0.097278	0.051454	-0.041614
1	1	-0.041176	-0.018642	-0.163324	0.057316	0.017544	-0.105726	-0.021705	-0.192541	0.136645	...	0.097817	0.049488	-0.092370	-0.126346	0.040714	0.093636	-0.065419	0.054761	0.132670	-0.061487
2	2	0.026670	0.037819	-0.117624	-0.033714	0.155645	-0.018152	-0.003465	-0.207928	0.109318	...	-0.075659	0.045242	0.090872	0.075135	-0.198950	0.047245	0.030635	0.074394	0.116719	0.022163
3	3	0.047291	-0.061027	0.023446	0.117821	0.014034	-0.142155	0.020697	-0.102846	0.019864	...	-0.042347	0.088607	0.179124	-0.007854	0.028658	0.107261	-0.061317	0.020750	0.077657	0.070614
4	4	0.082039	0.049939	-0.015991	0.005912	-0.079839	0.034675	-0.076623	0.105315	0.048360	...	0.018757	0.143541	0.017612	0.002007	0.110670	0.058479	-0.092609	-0.117860	0.212146	-0.003607