In [1]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [2]:
data = pd.read_csv('data/announcements.csv')
# The model filename encodes its training hyperparameters:
# 100 feature dimensions, min word count 5, context window 10
model = Word2Vec.load("100features_5minwords_10context_AnnouncementTitle")
features = 100  # must match the dimensionality of the loaded model

In [3]:
def review_to_wordlist(review, remove_stopwords=False):
    # Convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML (name the parser explicitly so bs4 does not
    #    warn and the behaviour is the same on every system)
    review_text = BeautifulSoup(review, "lxml").get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (False by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words
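
For example, on a made-up HTML fragment the cleaner strips tags, digits, and punctuation and lower-cases what remains:

review_to_wordlist("<p>Hello, World! Visit us at 10am.</p>")
# -> ['hello', 'world', 'visit', 'us', 'at', 'am']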

In [4]:
data["Body"].shape


Out[4]:
(4167,)

In [5]:
print("Parsing email bodies")
# Clean each email body into a list of words
emails = [review_to_wordlist(str(review)) for review in data["Body"]]


Parsing email bodies
/opt/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:282: UserWarning: "http://www.matmartinez.net/nsfw/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/opt/anaconda3/lib/python3.5/site-packages/bs4/__init__.py:282: UserWarning: "http://images5.fanpop.com/image/photos/25500000/Aww-Yeah-random-25538108-268-209.gif" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

In [6]:
size = len(emails)
print(size)


4167

In [7]:
# Sanity check: each word vector has `features` (100) dimensions
model['a'].shape


Out[7]:
(100,)

In [8]:
cbow = [None] * len(emails)
for i, email in enumerate(emails):
    base = np.zeros(features)
    words_kept = 0
    for word in email:
        # Skip words that never made it into the Word2Vec vocabulary
        if model.vocab.get(word) is None:
            continue
        base = base + model[word]
        words_kept += 1
    # Average the word vectors; guard against emails whose words
    # are all out of vocabulary (avoids division by zero)
    cbow[i] = base / words_kept if words_kept > 0 else base
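
Each email is thus represented by the unweighted mean of its in-vocabulary word vectors. The same idea as a compact helper, sketched against the old-style gensim API (model.vocab / model[word]) used above:

def email_vector(words, model, features=100):
    # Mean of the Word2Vec vectors of the in-vocabulary words;
    # zero vector if no word is in the vocabulary
    vecs = [model[w] for w in words if model.vocab.get(w) is not None]
    return np.mean(vecs, axis=0) if vecs else np.zeros(features)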

In [9]:
cbow = pd.DataFrame(cbow)

In [10]:
cbow.head(10)


Out[10]:
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
0 -0.054458 -0.027328 -0.015995 -0.086978 0.051909 -0.046620 0.099372 0.117895 -0.050922 0.074041 ... 0.096191 0.035673 -0.046638 0.020655 0.053338 -0.011624 0.050724 0.022764 0.037663 0.001001
1 -0.038013 -0.009144 -0.006346 -0.102329 0.070879 -0.034090 0.143415 0.127730 -0.084724 0.109901 ... 0.132670 0.060247 -0.078525 0.016744 0.038884 0.012081 0.061925 0.007142 0.026701 -0.011024
2 -0.039144 -0.021391 -0.034704 -0.089550 0.060454 -0.042070 0.100869 0.130518 -0.067005 0.088970 ... 0.103148 0.036284 -0.050385 0.022605 0.032885 -0.000573 0.053779 0.021603 0.029601 0.015369
3 -0.054148 -0.033593 -0.062348 -0.096243 0.052012 -0.048855 0.050170 0.130426 -0.035423 0.101087 ... 0.088368 0.010062 -0.038597 0.031514 0.011624 0.014437 0.042852 0.024989 0.016962 0.018475
4 -0.060062 -0.027242 -0.039294 -0.100929 0.055206 -0.043566 0.082062 0.128750 -0.048666 0.091695 ... 0.106932 0.036858 -0.056458 0.030766 0.025858 0.016257 0.060411 0.028373 0.022683 0.001653
5 0.027722 0.027745 -0.047450 -0.054526 0.059847 -0.011157 0.161741 0.123732 -0.087430 0.085111 ... 0.091070 0.053377 -0.054374 0.018723 -0.009706 -0.006647 0.057398 0.027132 0.056001 0.072841
6 -0.034197 -0.010460 -0.058316 -0.090962 0.052264 -0.034730 0.080387 0.133822 -0.063091 0.088065 ... 0.096674 0.029329 -0.048702 0.030098 0.008048 0.015879 0.058715 0.033835 0.022425 0.029658
7 -0.012030 -0.036253 -0.046687 -0.074468 0.058023 -0.051465 0.099205 0.119681 -0.055361 0.068613 ... 0.096074 0.036813 -0.038630 0.018051 0.021463 -0.023137 0.046512 0.021000 0.017344 0.039204
8 -0.041923 -0.043118 -0.050025 -0.095298 0.054262 -0.052029 0.071712 0.129125 -0.036149 0.088446 ... 0.095634 0.028847 -0.048054 0.024821 0.016836 -0.005735 0.047435 0.021545 0.013688 0.014882
9 -0.033800 -0.027144 -0.030981 -0.093649 0.065957 -0.032709 0.101311 0.133010 -0.049015 0.122436 ... 0.107207 0.040554 -0.066223 0.016178 0.006763 0.011080 0.044505 0.002788 0.014103 0.004971

10 rows × 100 columns


In [12]:
cbow.to_csv('email_cbow.csv')
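
TSNE is imported at the top but never used in this section; presumably the 100-dimensional email vectors are meant to be projected to 2-D for visualisation. A minimal sketch of that step, assuming scikit-learn's TSNE API (the output filename email_tsne.csv is made up):

tsne = TSNE(n_components=2, random_state=0)
coords = tsne.fit_transform(cbow.values)  # (4167, 2) array of 2-D points
pd.DataFrame(coords, columns=["x", "y"]).to_csv('email_tsne.csv')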
