In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=2)
In [ ]:
X = ["Some say the world will end in fire,",
"Some say in ice."]
In [ ]:
len(X)
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# learn the vocabulary (one entry per unique token) from the documents
vectorizer = CountVectorizer()
vectorizer.fit(X)
vectorizer.vocabulary_
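To see how the text is split into tokens before counting, we can ask the fitted vectorizer for its analyzer. This cell is an illustrative addition: build_analyzer() returns the callable that CountVectorizer applies to each document, which by default lowercases and extracts words of two or more alphanumeric characters.
In [ ]:
# the default analyzer: lowercase, then extract tokens of 2+ word characters
analyzer = vectorizer.build_analyzer()
analyzer(X[0])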
In [ ]:
# transform the documents into a sparse document-by-term count matrix
X_bag_of_words = vectorizer.transform(X)
X_bag_of_words
In [ ]:
print(X_bag_of_words.toarray())
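transform returns a scipy.sparse matrix because, for realistic vocabularies, most entries are zero. As a quick illustrative check (our addition) of how sparse this toy example already is:
In [ ]:
# number of stored non-zero entries vs. the full dense size
print(X_bag_of_words.nnz, "non-zeros out of", np.prod(X_bag_of_words.shape), "entries")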
In [ ]:
print(X)
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2
vectorizer.get_feature_names_out()
In [ ]:
vectorizer.inverse_transform(X_bag_of_words)
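A side note worth demonstrating (the sentence below is our own, not part of the data): transforming text that contains words outside the fitted vocabulary simply ignores them.
In [ ]:
# unseen words ("perish", "twice", "and") are dropped; only vocabulary words are counted
vectorizer.transform(["Some say the world will perish twice in fire and ice."]).toarray()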
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf reweights raw counts: frequent-in-document but rare-across-corpus terms score high
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X)
# get_feature_names_out() replaces the removed get_feature_names()
print(tfidf_vectorizer.get_feature_names_out())
print(tfidf_vectorizer.transform(X).toarray())
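By default TfidfVectorizer L2-normalizes each row (norm="l2"), so every document vector has unit Euclidean length. A quick illustrative check (our addition):
In [ ]:
# with the default norm="l2", each row of the tf-idf matrix has norm 1
print(np.linalg.norm(tfidf_vectorizer.transform(X).toarray(), axis=1))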
In [ ]:
X
In [ ]:
# look at sequences of exactly two tokens (bigrams only)
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_vectorizer.fit(X)
bigram_vectorizer.get_feature_names_out()
In [ ]:
bigram_vectorizer.transform(X).toarray()
In [ ]:
# unigrams and bigrams together
gram_vectorizer = CountVectorizer(ngram_range=(1, 2))
gram_vectorizer.fit(X)
gram_vectorizer.get_feature_names_out()
In [ ]:
X_1_2_gram = gram_vectorizer.transform(X)
print(X_1_2_gram.shape)
print(X_1_2_gram.toarray())
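As a sanity check (our addition): since unigram and bigram strings never coincide, the combined (1, 2) vocabulary should be exactly the unigram vocabulary plus the bigram vocabulary.
In [ ]:
# the (1, 2) vocabulary is the disjoint union of the unigram and bigram vocabularies
print(len(vectorizer.vocabulary_), "+", len(bigram_vectorizer.vocabulary_),
      "=", len(gram_vectorizer.vocabulary_))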
In [ ]:
# character n-grams of length 2 and 3, which may span word boundaries
char_vectorizer = CountVectorizer(ngram_range=(2, 3), analyzer="char")
char_vectorizer.fit(X)
print(char_vectorizer.get_feature_names_out())
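A related option worth knowing (our addition, not in the original): analyzer="char_wb" builds character n-grams only from text inside word boundaries, padding each word with spaces, which often yields cleaner features for tasks like language identification.
In [ ]:
# "char_wb" restricts character n-grams to within words (space-padded at the edges)
char_wb_vectorizer = CountVectorizer(ngram_range=(2, 3), analyzer="char_wb")
char_wb_vectorizer.fit(X)
print(char_wb_vectorizer.get_feature_names_out())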