Title: Bag Of Words
Slug: bag_of_words
Summary: How to encode unstructured text data as bags of words for machine learning in Python.
Date: 2017-09-09 12:00
Category: Machine Learning
Tags: Preprocessing Text
Authors: Chris Albon
In [1]:
# Load libraries
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
In [2]:
# Define a tiny corpus: one short document per array element
text_data = np.array([
    "I love Brazil. Brazil!",
    "Sweden is best",
    "Germany beats both",
])
In [3]:
# Learn the vocabulary from the corpus, then encode every document as a
# row of per-word counts (one column per vocabulary entry)
count = CountVectorizer().fit(text_data)
bag_of_words = count.transform(text_data)

# Display the counts as a dense array
bag_of_words.toarray()
Out[3]:
In [4]:
# Get feature names
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; prefer get_feature_names_out() and fall back only on old releases.
if hasattr(count, "get_feature_names_out"):
    # .tolist() keeps feature_names a plain list of str, as it was before
    feature_names = count.get_feature_names_out().tolist()
else:
    feature_names = count.get_feature_names()

# View feature names
feature_names
Out[4]:
In [5]:
# Tabulate the counts: one row per document, one column per vocabulary word
pd.DataFrame(
    data=bag_of_words.toarray(),
    columns=feature_names,
)
Out[5]: