Example computations for TF-IDF using pandas data frames


In [1]:
import pandas as pd
from collections import Counter
import numpy as np

# Three tiny example documents; "in" appears twice in d1 on purpose,
# so raw counts and term frequencies differ for that word.
d1 = "in the new york times in" # (repeated in)
d2 = "the new york post"
d3 = "the los angeles times"

docstrs = [d1, d2, d3]
docs = [doc.split() for doc in docstrs]
docs


Out[1]:
[['in', 'the', 'new', 'york', 'times', 'in'],
 ['the', 'new', 'york', 'post'],
 ['the', 'los', 'angeles', 'times']]

In [2]:
# Sorted vocabulary: every distinct token across all documents.
uniq = sorted({w for doc in docs for w in doc})
uniq


Out[2]:
['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york']

Word vectors and term counts


In [25]:
# Start the term-frequency table: one row per vocabulary word, no
# columns yet (per-document count columns get merged in below).
tf = pd.DataFrame({'word': uniq}).set_index('word')
tf


Out[25]:
word
angeles
in
los
new
post
the
times
york

In [26]:
Counter(d1.split()).items()


Out[26]:
dict_items([('in', 2), ('the', 1), ('new', 1), ('york', 1), ('times', 1)])

In [27]:
# Attach one count column per document. Words absent from a document
# come back as NaN from the left merge, so fill those with 0 at the end.
for doc_num, tokens in enumerate(docs, start=1):
    counts = Counter(tokens)
    col = pd.DataFrame.from_records(list(counts.items()),
                                    columns=['word', f'd{doc_num}'])
    tf = tf.merge(col.set_index('word'), on='word', how='left')

tf = tf.fillna(0).astype('int')
print(tf)
print(tf.T)


         d1  d2  d3
word               
angeles   0   0   1
in        2   0   0
los       0   0   1
new       1   1   0
post      0   1   0
the       1   1   1
times     1   0   1
york      1   1   0
word  angeles  in  los  new  post  the  times  york
d1          0   2    0    1     0    1      1     1
d2          0   0    0    1     1    1      0     1
d3          1   0    1    0     0    1      1     0

Document frequencies


In [6]:
# Document frequency: in how many documents does each word occur?
df = pd.DataFrame(index=pd.Index(uniq, name='word'))
df['doc count'] = [sum(w in doc for doc in docs) for w in uniq]
df


Out[6]:
doc count
word
angeles 1
in 1
los 1
new 2
post 1
the 3
times 2
york 2

In [7]:
# df = fraction of documents containing the word; idf is its inverse,
# and "log idf" (base 10) is the usual dampened weight — it is 0 for
# words like "the" that appear in every document.
df['df'] = df['doc count'] / len(docs)
df['idf'] = 1.0 / df['df']
df['log idf'] = np.log10(df['idf'])
df


Out[7]:
doc count df idf log idf
word
angeles 1 0.333333 3.0 0.477121
in 1 0.333333 3.0 0.477121
los 1 0.333333 3.0 0.477121
new 2 0.666667 1.5 0.176091
post 1 0.333333 3.0 0.477121
the 3 1.000000 1.0 0.000000
times 2 0.666667 1.5 0.176091
york 2 0.666667 1.5 0.176091

Term frequencies


In [8]:
# Side-by-side view: document-frequency stats next to the raw term
# counts. Both frames share the same 'word' index, so an index join
# lines the rows up exactly.
tfidf = df.join(tf)
tfidf


Out[8]:
doc count df idf log idf d1 d2 d3
word
angeles 1 0.333333 3.0 0.477121 0 0 1
in 1 0.333333 3.0 0.477121 2 0 0
los 1 0.333333 3.0 0.477121 0 0 1
new 2 0.666667 1.5 0.176091 1 1 0
post 1 0.333333 3.0 0.477121 0 1 0
the 3 1.000000 1.0 0.000000 1 1 1
times 2 0.666667 1.5 0.176091 1 0 1
york 2 0.666667 1.5 0.176091 1 1 0

In [9]:
# Term frequency: normalize each document's raw counts by the document
# length. A loop replaces the three copy-pasted assignments; it produces
# the same 'd1 tf', 'd2 tf', 'd3 tf' columns in the same order, and
# generalizes to any number of documents.
for i, doc in enumerate(docs, start=1):
    tfidf[f'd{i} tf'] = tfidf[f'd{i}'] / len(doc)
tfidf


Out[9]:
doc count df idf log idf d1 d2 d3 d1 tf d2 tf d3 tf
word
angeles 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25
in 1 0.333333 3.0 0.477121 2 0 0 0.333333 0.00 0.00
los 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25
new 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00
post 1 0.333333 3.0 0.477121 0 1 0 0.000000 0.25 0.00
the 3 1.000000 1.0 0.000000 1 1 1 0.166667 0.25 0.25
times 2 0.666667 1.5 0.176091 1 0 1 0.166667 0.00 0.25
york 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00

TF-IDF


In [10]:
# TF-IDF weight per document: tf * log10(idf). Words appearing in every
# document ("the") get weight 0; the rarest words weigh the most.
# Loop replaces three copy-pasted assignments and scales to any number
# of documents while producing the same columns in the same order.
for i in range(1, len(docs) + 1):
    tfidf[f'd{i} tfidf'] = tfidf[f'd{i} tf'] * tfidf['log idf']
tfidf


Out[10]:
doc count df idf log idf d1 d2 d3 d1 tf d2 tf d3 tf d1 tfidf d2 tfidf d3 tfidf
word
angeles 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25 0.000000 0.000000 0.119280
in 1 0.333333 3.0 0.477121 2 0 0 0.333333 0.00 0.00 0.159040 0.000000 0.000000
los 1 0.333333 3.0 0.477121 0 0 1 0.000000 0.00 0.25 0.000000 0.000000 0.119280
new 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00 0.029349 0.044023 0.000000
post 1 0.333333 3.0 0.477121 0 1 0 0.000000 0.25 0.00 0.000000 0.119280 0.000000
the 3 1.000000 1.0 0.000000 1 1 1 0.166667 0.25 0.25 0.000000 0.000000 0.000000
times 2 0.666667 1.5 0.176091 1 0 1 0.166667 0.00 0.25 0.029349 0.000000 0.044023
york 2 0.666667 1.5 0.176091 1 1 0 0.166667 0.25 0.00 0.029349 0.044023 0.000000

CountVectorizer

Ok, now do it the easy way.


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Let scikit-learn build the document-term matrix in one shot;
# X is a sparse docs-by-vocabulary count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docstrs)
X


Out[11]:
<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [12]:
vectorizer.get_feature_names()


Out[12]:
['angeles', 'in', 'los', 'new', 'post', 'the', 'times', 'york']

In [13]:
X.toarray()


Out[13]:
array([[0, 2, 0, 1, 0, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 0, 1],
       [1, 0, 1, 0, 0, 1, 1, 0]], dtype=int64)

In [14]:
# Same table as the hand-built one: words as rows, documents as columns.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() instead (accepted by pandas as column labels).
D = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())
D = D.T
D.columns = ['d1','d2','d3']
D


Out[14]:
d1 d2 d3
angeles 0 0 1
in 2 0 0
los 0 0 1
new 1 1 0
post 0 1 0
the 1 1 1
times 1 0 1
york 1 1 0