In [1]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
corpus_dict = {1: 'This is the first document.',
               2: 'This is the second second document.',
               3: 'And the third one.',
               4: 'Is this the first document?'}
df = pandas.DataFrame(list(corpus_dict.items()), columns=['id', 'text'])
print(df.shape)
In [3]:
print(df)
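As an aside, the same DataFrame can be built straight from the dictionary with from_dict(); this is only an illustrative alternative, and the reset_index()/rename() step is an assumption about the desired column names:

df_alt = pandas.DataFrame.from_dict(corpus_dict, orient='index', columns=['text'])
df_alt = df_alt.reset_index().rename(columns={'index': 'id'})  # move the dict keys into an 'id' column
print(df_alt)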
In [4]:
# Get the 'text' column as a plain Python list
list_of_texts = df['text'].tolist()
print(list_of_texts)
In [5]:
vectorizer = CountVectorizer(min_df=1)
term_doc_matrix = vectorizer.fit_transform(list_of_texts)
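fit_transform() returns a SciPy sparse matrix with one row per document and one column per vocabulary term, so it is worth checking its type and shape before converting to a dense array:

print(type(term_doc_matrix))   # scipy.sparse CSR matrix
print(term_doc_matrix.shape)   # (number of documents, vocabulary size)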
In [7]:
vectorizer.get_feature_names_out()  # replaces get_feature_names(), which was removed in scikit-learn 1.2
Out[7]:
In [8]:
print(vectorizer.get_feature_names_out())
print(term_doc_matrix.toarray())
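The fitted vectorizer can also be reused on unseen text with transform(), which applies the existing vocabulary without refitting (out-of-vocabulary words are silently dropped). The sentence below is made up for illustration:

new_docs = ['This is another document.']       # hypothetical unseen text
new_vectors = vectorizer.transform(new_docs)   # reuse the fitted vocabulary, no refit
print(new_vectors.toarray())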
In [9]:
# Put the bag-of-words vectors into a new DataFrame, one column per vocabulary term
df_bow = pandas.DataFrame(term_doc_matrix.toarray(), columns=vectorizer.get_feature_names_out())
In [10]:
print(df_bow)
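Summing the columns of df_bow gives corpus-wide term frequencies, which is a quick sanity check on the vocabulary (a minimal sketch):

term_counts = df_bow.sum(axis=0).sort_values(ascending=False)  # total count of each term across all documents
print(term_counts)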
In [11]:
# Important: align the result to the id index of the original text DataFrame
# (join_axes was removed in pandas 1.0; reindex on df.index is the replacement)
result = pandas.concat([df, df_bow], axis=1).reindex(df.index)
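Because df and df_bow share the same default RangeIndex and have no overlapping column names, an index-aligned join is an equivalent way to attach the bag-of-words columns (sketch):

result_alt = df.join(df_bow)  # index-aligned join, same result as the concat above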
In [12]:
result
Out[12]: