Example computations for TF-IDF using pandas data frames


In [1]:
import pandas as pd
from collections import Counter
import numpy as np

# Three tiny example documents; "in" appears twice in d1 on purpose,
# so raw counts and term frequencies differ for that word.
d1 = "in the new york times in" # (repeated in)
d2 = "the new york post"
d3 = "the los angeles times"

docstrs = [d1, d2, d3]
docs = [doc.split() for doc in docstrs]
docs


Out[1]:
[['in', 'the', 'new', 'york', 'times', 'in'],
 ['the', 'new', 'york', 'post'],
 ['the', 'los', 'angeles', 'times']]

In [2]:
# Sorted vocabulary: every distinct token across all documents.
uniq = sorted({w for doc in docs for w in doc})
uniq


Out[2]:
['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york']

Word vectors and term counts


In [25]:
# Start the term-frequency table: one row per vocabulary word, no
# columns yet (per-document count columns get merged in below).
tf = pd.DataFrame({'word': uniq}).set_index('word')
tf


Out[25]:
word
angeles
in
los
new
post
the
times
york

In [26]:
Counter(d1.split()).items()


Out[26]:
dict_items([('in', 2), ('the', 1), ('new', 1), ('york', 1), ('times', 1)])

In [27]:
# Attach one count column per document. Words absent from a document
# come back as NaN from the left merge, so fill those with 0 at the end.
for doc_num, tokens in enumerate(docs, start=1):
    counts = Counter(tokens)
    col = pd.DataFrame.from_records(list(counts.items()),
                                    columns=['word', f'd{doc_num}'])
    tf = tf.merge(col.set_index('word'), on='word', how='left')

tf = tf.fillna(0).astype('int')
print(tf)
print(tf.T)


         d1  d2  d3
word               
angeles   0   0   1
in        2   0   0
los       0   0   1
new       1   1   0
post      0   1   0
the       1   1   1
times     1   0   1
york      1   1   0
word  angeles  in  los  new  post  the  times  york
d1          0   2    0    1     0    1      1     1
d2          0   0    0    1     1    1      0     1
d3          1   0    1    0     0    1      1     0

Document frequencies


In [6]:
# Document frequency: in how many documents does each word occur?
df = pd.DataFrame(index=pd.Index(uniq, name='word'))
df['doc count'] = [sum(w in doc for doc in docs) for w in uniq]
df


Out[6]:
doc count
word
angeles 1
in 1
los 1
new 2
post 1
the 3
times 2
york 2

In [7]:
# df = fraction of documents containing the word; idf is its inverse,
# and "log idf" (base 10) is the usual dampened weight — it is 0 for
# words like "the" that appear in every document.
df['df'] = df['doc count'] / len(docs)
df['idf'] = 1.0 / df['df']
df['log idf'] = np.log10(df['idf'])
df


Out[7]:
doc count df idf log idf
word
angeles 1 0.333333 3.0 0.477121
in 1 0.333333 3.0 0.477121
los 1 0.333333 3.0 0.477121
new 2 0.666667 1.5 0.176091
post 1 0.333333 3.0 0.477121
the 3 1.000000 1.0 0.000000
times 2 0.666667 1.5 0.176091
york 2 0.666667 1.5 0.176091

Term frequencies


In [8]:
# Side-by-side view: document-frequency stats next to the raw term
# counts. Both frames share the same 'word' index, so an index join
# lines the rows up exactly.
tfidf = df.join(tf)
tfidf


Out[8]:
doc count df idf log idf d1 d2 d3
word
angeles 1 0.333333 3.0 0.477121 0 0 1
in 1 0.333333 3.0 0.477121 2 0 0
los 1 0.333333 3.0 0.477121 0 0 1
new 2 0.666667 1.5 0.176091 1 1 0
post 1 0.333333 3.0 0.477121 0 1 0
the 3 1.000000 1.0 0.000000 1 1 1
times 2 0.666667 1.5 0.176091 1 0 1
york 2 0.666667 1.5 0.176091 1 1 0

In [9]:
# Term frequency: normalize each document's raw counts by the document
# length. A loop replaces the three copy-pasted assignments; it produces
# the same 'd1 tf', 'd2 tf', 'd3 tf' columns in the same order, and
# generalizes to any number of documents.
for i, doc in enumerate(docs, start=1):
    tfidf[f'd{i} tf'] = tfidf[f'd{i}'] / len(doc)
tfidf


Out[9]:
doc count df idf log idf d1 d2 d3 d1 tf d2 tf d3 tf
word
angeles 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25
in 1 0.333333 3.0 0.477121 2 0 0 0.333333 0.00 0.00
los 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25
new 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00
post 1 0.333333 3.0 0.477121 0 1 0 0.000000 0.25 0.00
the 3 1.000000 1.0 0.000000 1 1 1 0.166667 0.25 0.25
times 2 0.666667 1.5 0.176091 1 0 1 0.166667 0.00 0.25
york 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00

TF-IDF


In [10]:
# TF-IDF weight per document: tf * log10(idf). Words appearing in every
# document ("the") get weight 0; the rarest words weigh the most.
# Loop replaces three copy-pasted assignments and scales to any number
# of documents while producing the same columns in the same order.
for i in range(1, len(docs) + 1):
    tfidf[f'd{i} tfidf'] = tfidf[f'd{i} tf'] * tfidf['log idf']
tfidf


Out[10]:
doc count df idf log idf d1 d2 d3 d1 tf d2 tf d3 tf d1 tfidf d2 tfidf d3 tfidf
word
angeles 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25 0.000000 0.000000 0.119280
in 1 0.333333 3.0 0.477121 2 0 0 0.333333 0.00 0.00 0.159040 0.000000 0.000000
los 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25 0.000000 0.000000 0.119280
new 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00 0.029349 0.044023 0.000000
post 1 0.333333 3.0 0.477121 0 1 0 0.000000 0.25 0.00 0.000000 0.119280 0.000000
the 3 1.000000 1.0 0.000000 1 1 1 0.166667 0.25 0.25 0.000000 0.000000 0.000000
times 2 0.666667 1.5 0.176091 1 0 1 0.166667 0.00 0.25 0.029349 0.000000 0.044023
york 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00 0.029349 0.044023 0.000000

CountVectorizer

Ok, now do it the easy way.


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Let scikit-learn build the document-term matrix in one shot;
# X is a sparse docs-by-vocabulary count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docstrs)
X


Out[11]:
<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [12]:
vectorizer.get_feature_names()


Out[12]:
['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york']

In [13]:
X.toarray()


Out[13]:
array([[0, 2, 0, 1, 0, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 0, 1],
       [1, 0, 1, 0, 0, 1, 1, 0]], dtype=int64)

In [14]:
# Same table as the hand-built one: words as rows, documents as columns.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() instead (accepted by pandas as column labels).
D = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())
D = D.T
D.columns = ['d1','d2','d3']
D


Out[14]:
d1 d2 d3
angeles 0 0 1
in 2 0 0
los 0 0 1
new 1 1 0
post 0 1 0
the 1 1 1
times 1 0 1
york 1 1 0