notebook.community

Edit and run



In [1]:

    
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer



In [2]:

    
# Bag-of-words vectorizer, constructed with all default arguments.
vectorizer = CountVectorizer()



In [3]:

    
# Three toy e-mails that form the corpus for the bag-of-words demo.
string1, string2, string3 = (
    "hi Katie the self driving car will be late Best Sebastian",
    "Hi Sebastian the machine learning class will be great great great Best Katie",
    "Hi Katie the machine learning class will be most excellent",
)

# Documents in corpus order (string1 first, string3 last).
email_list = [string1, string2, string3]



In [4]:

    
# Step 1: learn the vocabulary from the corpus. fit() is called only for its
# side effect on `vectorizer`; its return value is the estimator itself, not
# a matrix, so it is deliberately not bound to `bag_of_words`.
vectorizer.fit(email_list)

# Step 2: encode every e-mail as a row of token counts over that vocabulary.
bag_of_words = vectorizer.transform(email_list)



In [6]:

    
# Show the non-zero entries of the count matrix, one "(document, column)  count"
# per line. The parenthesised single-argument form prints the same text under
# Python 2 and is also valid Python 3 (the bare print statement is not).
print(bag_of_words)









    



  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 4)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 13)	1
  (0, 14)	1
  (0, 15)	1
  (0, 16)	1
  (1, 0)	1
  (1, 1)	1
  (1, 3)	1
  (1, 6)	3
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 11)	1
  (1, 13)	1
  (1, 15)	1
  (1, 16)	1
  (2, 0)	1
  (2, 3)	1
  (2, 5)	1
  (2, 7)	1
  (2, 8)	1
  (2, 10)	1
  (2, 11)	1
  (2, 12)	1
  (2, 15)	1
  (2, 16)	1



In [9]:

    
# Column index assigned to the token "great" in the fitted vocabulary.
# The recorded result (6) is the column that holds the count 3 for the second
# e-mail in the printed matrix, where "great" appears three times.
vectorizer.vocabulary_.get("great")









    Out[9]:





6



In [10]:

    
from nltk.corpus import stopwords



In [11]:

    
# English stopword list from NLTK.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand; no download step is visible in this notebook - confirm.
sw = stopwords.words("english")



In [13]:

    
# Peek at the first stopword in the list.
sw[0]









    Out[13]:





u'i'



In [15]:

    
# Peek at the stopword at index 10 (the eleventh entry).
sw[10]









    Out[15]:





u'yours'



In [16]:

    
# Number of entries in the English stopword list (127 in the recorded run).
len(sw)









    Out[16]:





127



In [17]:

    
from nltk.stem.snowball import SnowballStemmer



In [18]:

    
# Snowball stemmer configured for English; reused by the stem() calls below.
stemmer = SnowballStemmer("english")



In [19]:

    
# Stem a derived noun form; the recorded result is 'respons'.
stemmer.stem("responsiveness")









    Out[19]:





u'respons'



In [20]:

    
# A different suffix on the same root; the recorded result is again 'respons',
# so this word and "responsiveness" collapse to one stem.
stemmer.stem("responsivity")









    Out[20]:





u'respons'



In [21]:

    
# Prefixed form; the recorded result is 'unrespons' - the leading "un" is kept,
# so this does not share a stem with "responsiveness" / "responsivity".
stemmer.stem("unresponsive")









    Out[21]:





u'unrespons'



In [22]:

    
# Related verb; the recorded result is 'respond' (returned unchanged), which
# differs from the 'respons' stem produced for the noun forms.
stemmer.stem("respond")









    Out[22]:





u'respond'