In [1]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
corpus_dict = {1: 'This is the first document.',
               2: 'This is the second second document.',
               3: 'And the third one.',
               4: 'Is this the first document?'}
df = pandas.DataFrame(list(corpus_dict.items()), columns=['id', 'text'])
print(df.shape)
In [3]:
print(df)
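As an aside, the same DataFrame can be built straight from the dictionary with from_dict(); this is only an illustrative alternative, and the reset_index()/rename() step is an assumption about the desired column names:

df_alt = pandas.DataFrame.from_dict(corpus_dict, orient='index', columns=['text'])
df_alt = df_alt.reset_index().rename(columns={'index': 'id'})  # move the dict keys into an 'id' column
print(df_alt)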
In [4]:
# Get the 'text' column as a plain Python list
list_of_texts = df['text'].tolist()
print(list_of_texts)
In [5]:
vectorizer = CountVectorizer(min_df=1)
term_doc_matrix = vectorizer.fit_transform(list_of_texts)
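fit_transform() returns a SciPy sparse matrix with one row per document and one column per vocabulary term, so it is worth checking its type and shape before converting to a dense array:

print(type(term_doc_matrix))   # scipy.sparse CSR matrix
print(term_doc_matrix.shape)   # (number of documents, vocabulary size)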
In [7]:
vectorizer.get_feature_names_out()  # replaces get_feature_names(), which was removed in scikit-learn 1.2
Out[7]:
In [8]:
print(vectorizer.get_feature_names_out())
print(term_doc_matrix.toarray())
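The fitted vectorizer can also be reused on unseen text with transform(), which applies the existing vocabulary without refitting (out-of-vocabulary words are silently dropped). The sentence below is made up for illustration:

new_docs = ['This is another document.']       # hypothetical unseen text
new_vectors = vectorizer.transform(new_docs)   # reuse the fitted vocabulary, no refit
print(new_vectors.toarray())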
In [9]:
# Put the bag-of-words vectors into a new DataFrame, one column per vocabulary term
df_bow = pandas.DataFrame(term_doc_matrix.toarray(), columns=vectorizer.get_feature_names_out())
In [10]:
print(df_bow)
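Summing the columns of df_bow gives corpus-wide term frequencies, which is a quick sanity check on the vocabulary (a minimal sketch):

term_counts = df_bow.sum(axis=0).sort_values(ascending=False)  # total count of each term across all documents
print(term_counts)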
In [11]:
# Important: align the result to the id index of the original text DataFrame
# (join_axes was removed in pandas 1.0; reindex on df.index is the replacement)
result = pandas.concat([df, df_bow], axis=1).reindex(df.index)
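Because df and df_bow share the same default RangeIndex and have no overlapping column names, an index-aligned join is an equivalent way to attach the bag-of-words columns (sketch):

result_alt = df.join(df_bow)  # index-aligned join, same result as the concat above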
In [12]:
result
Out[12]: