In [1]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [2]:
data = pd.read_csv('data/announcements.csv')
# The model filename encodes its training hyperparameters:
# 100 feature dimensions, min word count 5, context window 10
model = Word2Vec.load("100features_5minwords_10context_AnnouncementTitle")
features = 100  # must match the dimensionality of the loaded model

In [3]:
def review_to_wordlist(review, remove_stopwords=False):
    # Convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML (name the parser explicitly so bs4 does not
    #    warn and the behaviour is the same on every system)
    review_text = BeautifulSoup(review, "lxml").get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (False by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words
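
For example, on a made-up HTML fragment the cleaner strips tags, digits, and punctuation and lower-cases what remains:

review_to_wordlist("<p>Hello, World! Visit us at 10am.</p>")
# -> ['hello', 'world', 'visit', 'us', 'at', 'am']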

In [4]:
data["Body"].shape


Out[4]:
(4167,)

In [5]:
print("Parsing email bodies")
# Clean each email body into a list of words
emails = [review_to_wordlist(str(review)) for review in data["Body"]]


Parsing email bodies
/opt/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:282: UserWarning: "http://www.matmartinez.net/nsfw/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/opt/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:282: UserWarning: "http://images5.fanpop.com/image/photos/25500000/Aww-Yeah-random-25538108-268-209.gif" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

In [6]:
size = len(emails)
print(size)


4167

In [7]:
# Sanity check: each word vector has `features` (100) dimensions
model['a'].shape


Out[7]:
(100,)

In [8]:
cbow = [None] * len(emails)
for i, email in enumerate(emails):
    base = np.zeros(features)
    words_kept = 0
    for word in email:
        # Skip words that never made it into the Word2Vec vocabulary
        if model.vocab.get(word) is None:
            continue
        base = base + model[word]
        words_kept += 1
    # Average the word vectors; guard against emails whose words
    # are all out of vocabulary (avoids division by zero)
    cbow[i] = base / words_kept if words_kept > 0 else base
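
Each email is thus represented by the unweighted mean of its in-vocabulary word vectors. The same idea as a compact helper, sketched against the old-style gensim API (model.vocab / model[word]) used above:

def email_vector(words, model, features=100):
    # Mean of the Word2Vec vectors of the in-vocabulary words;
    # zero vector if no word is in the vocabulary
    vecs = [model[w] for w in words if model.vocab.get(w) is not None]
    return np.mean(vecs, axis=0) if vecs else np.zeros(features)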

In [9]:
cbow = pd.DataFrame(cbow)

In [10]:
cbow.head(10)


Out[10]:
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
0 -0.054458 -0.027328 -0.015995 -0.086978 0.051909 -0.046620 0.099372 0.117895 -0.050922 0.074041 ... 0.096191 0.035673 -0.046638 0.020655 0.053338 -0.011624 0.050724 0.022764 0.037663 0.001001
1 -0.038013 -0.009144 -0.006346 -0.102329 0.070879 -0.034090 0.143415 0.127730 -0.084724 0.109901 ... 0.132670 0.060247 -0.078525 0.016744 0.038884 0.012081 0.061925 0.007142 0.026701 -0.011024
2 -0.039144 -0.021391 -0.034704 -0.089550 0.060454 -0.042070 0.100869 0.130518 -0.067005 0.088970 ... 0.103148 0.036284 -0.050385 0.022605 0.032885 -0.000573 0.053779 0.021603 0.029601 0.015369
3 -0.054148 -0.033593 -0.062348 -0.096243 0.052012 -0.048855 0.050170 0.130426 -0.035423 0.101087 ... 0.088368 0.010062 -0.038597 0.031514 0.011624 0.014437 0.042852 0.024989 0.016962 0.018475
4 -0.060062 -0.027242 -0.039294 -0.100929 0.055206 -0.043566 0.082062 0.128750 -0.048666 0.091695 ... 0.106932 0.036858 -0.056458 0.030766 0.025858 0.016257 0.060411 0.028373 0.022683 0.001653
5 0.027722 0.027745 -0.047450 -0.054526 0.059847 -0.011157 0.161741 0.123732 -0.087430 0.085111 ... 0.091070 0.053377 -0.054374 0.018723 -0.009706 -0.006647 0.057398 0.027132 0.056001 0.072841
6 -0.034197 -0.010460 -0.058316 -0.090962 0.052264 -0.034730 0.080387 0.133822 -0.063091 0.088065 ... 0.096674 0.029329 -0.048702 0.030098 0.008048 0.015879 0.058715 0.033835 0.022425 0.029658
7 -0.012030 -0.036253 -0.046687 -0.074468 0.058023 -0.051465 0.099205 0.119681 -0.055361 0.068613 ... 0.096074 0.036813 -0.038630 0.018051 0.021463 -0.023137 0.046512 0.021000 0.017344 0.039204
8 -0.041923 -0.043118 -0.050025 -0.095298 0.054262 -0.052029 0.071712 0.129125 -0.036149 0.088446 ... 0.095634 0.028847 -0.048054 0.024821 0.016836 -0.005735 0.047435 0.021545 0.013688 0.014882
9 -0.033800 -0.027144 -0.030981 -0.093649 0.065957 -0.032709 0.101311 0.133010 -0.049015 0.122436 ... 0.107207 0.040554 -0.066223 0.016178 0.006763 0.011080 0.044505 0.002788 0.014103 0.004971

10 rows × 100 columns


In [12]:
cbow.to_csv('email_cbow.csv')
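
TSNE is imported at the top but never used in this section; presumably the 100-dimensional email vectors are meant to be projected to 2-D for visualisation. A minimal sketch of that step, assuming scikit-learn's TSNE API (the output filename email_tsne.csv is made up):

tsne = TSNE(n_components=2, random_state=0)
coords = tsne.fit_transform(cbow.values)  # (4167, 2) array of 2-D points
pd.DataFrame(coords, columns=["x", "y"]).to_csv('email_tsne.csv')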
