In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Bag-of-words vectorizer built with all-default settings (no arguments
# passed). It is fitted on the example emails in a later cell.
vectorizer = CountVectorizer()

In [3]:
# Toy corpus: three short emails used as input for the bag-of-words demo.
# The list is built first and the individual names are unpacked from it, so
# string1..string3 and email_list always refer to the same three strings.
email_list = [
    "hi Katie the self driving car will be late Best Sebastian",
    "Hi Sebastian the machine learning class will be great great great Best Katie",
    "Hi Katie the machine learning class will be most excellent",
]

string1, string2, string3 = email_list

In [4]:
# Learn the vocabulary from the emails and build the document-term count
# matrix in one step. fit_transform() is equivalent to calling fit() and then
# transform() on the same data, and it avoids binding bag_of_words to the
# return value of fit() (which is not the matrix) before overwriting it.
# `vectorizer` is fitted as a side effect, so later vocabulary_ lookups work.
bag_of_words = vectorizer.fit_transform(email_list)

In [6]:
# Show the sparse count matrix. Each printed line has the form
# "(document index, vocabulary index)  count".
# The call form with a single argument prints the same text on Python 2 and
# also parses on Python 3, where the bare print statement is a SyntaxError.
print(bag_of_words)


  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 4)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 13)	1
  (0, 14)	1
  (0, 15)	1
  (0, 16)	1
  (1, 0)	1
  (1, 1)	1
  (1, 3)	1
  (1, 6)	3
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 11)	1
  (1, 13)	1
  (1, 15)	1
  (1, 16)	1
  (2, 0)	1
  (2, 3)	1
  (2, 5)	1
  (2, 7)	1
  (2, 8)	1
  (2, 10)	1
  (2, 11)	1
  (2, 12)	1
  (2, 15)	1
  (2, 16)	1

In [9]:
# Look up the column index the fitted vectorizer assigned to the token "great".
# The recorded result (6) matches the "(1, 6)  3" entry of the printed matrix,
# i.e. the three occurrences of "great" in string2.
vectorizer.vocabulary_.get("great")


Out[9]:
6

In [10]:
from nltk.corpus import stopwords

In [11]:
# English stopword list from NLTK; `sw` is inspected by the following cells.
# NOTE(review): this presumably requires the NLTK "stopwords" corpus to be
# available locally (e.g. via nltk.download("stopwords")) -- confirm for
# fresh environments.
sw = stopwords.words("english")

In [13]:
# First entry of the stopword list.
sw[0]


Out[13]:
u'i'

In [15]:
# Entry at index 10 (the eleventh stopword).
sw[10]


Out[15]:
u'yours'

In [16]:
# Number of entries in the English stopword list. The recorded value depends
# on the installed NLTK corpus data.
len(sw)


Out[16]:
127

In [17]:
from nltk.stem.snowball import SnowballStemmer

In [18]:
# Snowball stemmer configured for English; reused by the stemming examples
# in the following cells.
stemmer = SnowballStemmer("english")

In [19]:
# Stemming example: the recorded output for this word is 'respons'.
stemmer.stem("responsiveness")


Out[19]:
u'respons'

In [20]:
# A different suffix on the same root: the recorded output is also 'respons'.
stemmer.stem("responsivity")


Out[20]:
u'respons'

In [21]:
# The "un" prefix is kept: the recorded output is 'unrespons', so this word
# does not collapse onto the same stem as "responsiveness".
stemmer.stem("unresponsive")


Out[21]:
u'unrespons'

In [22]:
# The recorded output is 'respond' (unchanged), which is a different stem from
# the 'respons' produced for "responsiveness" and "responsivity".
stemmer.stem("respond")


Out[22]:
u'respond'