In [54]:
# Two toy documents used throughout the TF-IDF walkthrough.
docA, docB = "The cat sat on my face", "The dog sat on my bed"
In [55]:
# Bag of words per document: tokenise by splitting on single space characters.
bowA, bowB = (doc.split(" ") for doc in (docA, docB))
In [56]:
# Display the token list produced for document B.
bowB
Out[56]:
In [57]:
# Vocabulary: every distinct token that occurs in either document.
wordSet = set(bowA) | set(bowB)
In [58]:
# Display the combined vocabulary of both documents.
wordSet
Out[58]:
In [59]:
# One counter per document, keyed by the shared vocabulary and starting at 0.
# Each document gets its own dict object so the tallies stay independent.
wordDictA = {term: 0 for term in wordSet}
wordDictB = {term: 0 for term in wordSet}
In [60]:
# Display document A's counter as it stands before any tokens are tallied.
wordDictA
Out[60]:
In [61]:
# Tally how often each vocabulary term occurs in each document.
# The counters are zeroed in place first: without the reset, re-running this
# cell would add to the previous tallies and inflate every count.
wordDictA.update(dict.fromkeys(wordDictA, 0))
for word in bowA:
    wordDictA[word] += 1

wordDictB.update(dict.fromkeys(wordDictB, 0))
for word in bowB:
    wordDictB[word] += 1
In [62]:
# Display document A's counter again, now that the counting loop has run.
wordDictA
Out[62]:
In [63]:
import pandas as pd
# Tabulate the raw term counts: one row per document (A, then B) and one
# column per vocabulary term.
pd.DataFrame([wordDictA, wordDictB])
Out[63]:
In [64]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every vocabulary word for one document.

    Args:
        wordDict: mapping of word -> raw count of that word in the document.
        bow: the document's token list. Only its length is used, as the
            normalising denominator.

    Returns:
        dict mapping each key of ``wordDict`` to ``count / len(bow)``.
        If ``bow`` is empty, every frequency is 0.0 instead of the call
        raising ZeroDivisionError.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # No tokens to normalise by; avoid dividing by zero.
        return {word: 0.0 for word in wordDict}
    # float() keeps the division a true division regardless of count type.
    total = float(bowCount)
    return {word: count / total for word, count in wordDict.items()}
In [65]:
# Term frequencies for each document, normalised by that document's own length.
tfBowA, tfBowB = computeTF(wordDictA, bowA), computeTF(wordDictB, bowB)
In [66]:
# Display the term frequencies computed for document A.
tfBowA
Out[66]:
In [67]:
# Display the term frequencies computed for document B.
tfBowB
Out[67]:
In [75]:
def computeIDF(docList):
    """Compute the inverse document frequency of each term.

    Args:
        docList: list of dicts, one per document, each mapping
            word -> raw count of that word in the document.

    Returns:
        dict mapping each term to ``log10(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents in which
        the term's count is greater than zero. Terms that occur in no
        document get 0.0 (the ratio is undefined there). An empty
        ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)
    if N == 0:
        # Nothing to score, and docList[0] below would raise IndexError.
        return {}

    # Seed with the first document's vocabulary so that terms with a zero
    # count everywhere still appear in the result.
    docFreq = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                # .get() tolerates terms missing from the first document's keys.
                docFreq[word] = docFreq.get(word, 0) + 1

    idfDict = {}
    for word, df in docFreq.items():
        # Guard df == 0 to avoid ZeroDivisionError for never-seen terms.
        idfDict[word] = math.log10(N / float(df)) if df > 0 else 0.0
    return idfDict
In [79]:
# Inverse document frequencies computed from both documents' term counts.
idfs = computeIDF([wordDictA, wordDictB])
In [78]:
def computeTFIDF(tfBow, idfs):
    """Weight a document's term frequencies by inverse document frequency.

    Args:
        tfBow: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency.

    Returns:
        dict mapping each word in ``tfBow`` to ``tf * idf``.

    Raises:
        KeyError: if a word in ``tfBow`` has no entry in ``idfs``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}
In [80]:
# TF-IDF weights for each document, using the shared IDF table.
tfidfBowA, tfidfBowB = computeTFIDF(tfBowA, idfs), computeTFIDF(tfBowB, idfs)
In [83]:
import pandas as pd
# Tabulate the TF-IDF weights: one row per document (A, then B) and one
# column per vocabulary term.
pd.DataFrame([tfidfBowA, tfidfBowB])
Out[83]: