When learning from text, the biggest problem is that different texts have different lengths. A shorter email would require fewer features, while a longer email would require more.
In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words representation: each email becomes a fixed-length
# vector of token counts, solving the variable-length-text problem.
vectorizer = CountVectorizer()
string1 = "hi aseem the car will be late regards company"
string2 = "hi company why will it be late I paid in advance regards aseem"
string3 = "hi aseem we don't know why will it be late regards company of company"
email_list = [string1, string2, string3]

# fit() learns the vocabulary (token -> column index) from the corpus;
# transform() maps each document onto that vocabulary as a sparse count matrix.
vectorizer.fit(email_list)
bag_of_words = vectorizer.transform(email_list)

# Python 3 print calls (the original Python 2 `print x` syntax is a
# SyntaxError on Python 3, which current scikit-learn requires).
print(vectorizer.vocabulary_)
print(bag_of_words)
In [5]:
import nltk
# Download only the corpus this file actually uses. A bare nltk.download()
# opens an interactive downloader, which blocks non-interactive runs.
nltk.download("stopwords")
Out[5]:
In [7]:
from nltk.corpus import stopwords

# Load NLTK's English stop-word list and report its size; these are the
# high-frequency, low-information words typically dropped before vectorizing.
sw = stopwords.words("english")
len(sw)
Out[7]:
In [12]:
from nltk.stem.snowball import SnowballStemmer

# Stemming collapses morphological variants onto a shared root, shrinking
# the bag-of-words vocabulary (e.g. all three words below stem to "respons").
stemmer = SnowballStemmer("english")

# Python 3 print calls (the original Python 2 `print x` syntax is a
# SyntaxError on Python 3).
print(stemmer.stem("responsiveness"))
print(stemmer.stem("responsivity"))
print(stemmer.stem("unresponsive"))