In [1]:
# Render matplotlib figures inline, directly beneath the cell that draws them.
%matplotlib inline
In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.datasets import load_hobbies
In [3]:
# Fetch Yellowbrick's "hobbies" text corpus (documents plus category names).
corpus = load_hobbies()

# Bag-of-words vectorizer with English stop words filtered out; the cells
# below (re)fit it on whichever subset of documents they analyse.
vectorizer = CountVectorizer(stop_words="english")

# Integer-encode the category name of every document for use as the target.
y = LabelEncoder().fit_transform(corpus.target)
In [13]:
# Token frequency distribution for the documents labelled "sports".
# Pair each document with its label directly instead of indexing by position.
docs = vectorizer.fit_transform(
    [doc for doc, label in zip(corpus.data, corpus.target) if label == "sports"]
)

# scikit-learn 1.2 removed get_feature_names(); prefer the replacement when
# it exists and fall back to the old name on older installations.
features = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

visualizer = FreqDistVisualizer(
    features=features, orient='v', size=(600, 300), color=["rebeccapurple"]
)
visualizer.fit(docs)
# show() supersedes poof(), which Yellowbrick deprecated in 1.0.
visualizer.show()
In [14]:
# Token frequency distribution for the documents labelled "cooking".
# Pair each document with its label directly instead of indexing by position.
docs = vectorizer.fit_transform(
    [doc for doc, label in zip(corpus.data, corpus.target) if label == "cooking"]
)

# scikit-learn 1.2 removed get_feature_names(); prefer the replacement when
# it exists and fall back to the old name on older installations.
features = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

visualizer = FreqDistVisualizer(
    features=features, orient='v', size=(600, 300), color=['navy']
)
visualizer.fit(docs)
# show() supersedes poof(), which Yellowbrick deprecated in 1.0.
visualizer.show()
In [12]:
# Class prediction error of a random forest trained on the whole corpus.
# Refit the vectorizer on every document; the cells above fit it on
# single-category subsets only.
docs = vectorizer.fit_transform(corpus.data)

visualizer = ClassPredictionError(
    RandomForestClassifier(random_state=42, n_estimators=10),
    # LabelEncoder assigns the integer codes in `y` in sorted label order, so
    # pass the class names sorted as well; this keeps each name attached to
    # the right class whatever order corpus.labels happens to come back in.
    classes=sorted(corpus.labels),
    size=(800, 600),
    colors=["tomato", "navy", "yellow", "green", "rebeccapurple"],
)

# Hold out 20% of the documents for scoring; the seed keeps the split stable.
X_train, X_test, y_train, y_test = tts(
    docs, y, test_size=0.20, random_state=42
)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# show() supersedes poof(), which Yellowbrick deprecated in 1.0.
visualizer.show()
In [ ]: