In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
# Create a CountVectorizer with all-default options; it is fitted on
# `email_list` in a later cell.
vectorizer = CountVectorizer()
In [3]:
# Toy corpus: three short "emails" used to demonstrate the bag-of-words
# representation.
email_list = [
    "hi Katie the self driving car will be late Best Sebastian",
    "Hi Sebastian the machine learning class will be great great great Best Katie",
    "Hi Katie the machine learning class will be most excellent",
]
# Keep the individual documents bound to their own names as well.
string1, string2, string3 = email_list
In [4]:
# Learn the vocabulary from the corpus and build the document-term count
# matrix in a single step. fit() on its own returns the vectorizer, not a
# matrix, so using fit_transform() means `bag_of_words` is only ever bound to
# the count matrix.
bag_of_words = vectorizer.fit_transform(email_list)
In [6]:
# Show the count matrix. print() is used on purpose rather than a bare
# expression so the individual stored entries are listed.
# The parenthesised single-argument form behaves the same on Python 2 and 3.
print(bag_of_words)
In [9]:
# Look up "great" in the fitted vectorizer's `vocabulary_` mapping
# (per the scikit-learn docs this maps each term to its feature index).
vectorizer.vocabulary_.get("great")
Out[9]:
In [10]:
from nltk.corpus import stopwords
In [11]:
# Load the English stopword list from NLTK.
# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be
# available locally (e.g. via nltk.download("stopwords")) -- confirm on a
# fresh environment.
sw = stopwords.words("english")
In [13]:
# Peek at the first entry of the stopword list.
sw[0]
Out[13]:
In [15]:
# Peek at the entry at index 10 of the stopword list.
sw[10]
Out[15]:
In [16]:
# Number of entries in the loaded English stopword list.
len(sw)
Out[16]:
In [17]:
from nltk.stem.snowball import SnowballStemmer
In [18]:
# Build a Snowball stemmer configured for English; reused by the stem()
# calls in the following cells.
stemmer = SnowballStemmer("english")
In [19]:
# Stem a suffixed noun form; compare with the related words stemmed in the
# cells below.
stemmer.stem("responsiveness")
Out[19]:
In [20]:
# Stem another noun form built on the same root, for comparison.
stemmer.stem("responsivity")
Out[20]:
In [21]:
# Stem a prefixed form ("un-") to see how the stemmer treats the prefix.
stemmer.stem("unresponsive")
Out[21]:
In [22]:
# Stem the base verb, for comparison with the derived forms above.
stemmer.stem("respond")
Out[22]: