notebook.community

Edit and run



In [54]:

    
# Two toy documents; together they are the whole corpus for this walkthrough.
docA, docB = "The cat sat on my face", "The dog sat on my bed"



In [55]:

    
# Bag of words per document: split on single spaces only
# (no lower-casing, no punctuation handling).
bowA, bowB = docA.split(" "), docB.split(" ")



In [56]:

    
# Inspect the token list produced for document B.
bowB









    Out[56]:





['The', 'dog', 'sat', 'on', 'my', 'bed']



In [57]:

    
# Vocabulary: every distinct token that occurs in either document.
wordSet = set(bowA) | set(bowB)



In [58]:

    
# Inspect the combined vocabulary.
wordSet









    Out[58]:





{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}



In [59]:

    
# One counter per document, keyed by the full vocabulary and starting at zero,
# so both documents share the same set of keys.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}



In [60]:

    
# Inspect document A's counter before any counting has been done.
wordDictA









    Out[60]:





{'The': 0, 'cat': 0, 'bed': 0, 'dog': 0, 'my': 0, 'face': 0, 'sat': 0, 'on': 0}



In [61]:

    
# Count how often each vocabulary word occurs in each document.
# The counters are zeroed first so that re-running this cell yields the same
# result instead of adding every document's counts a second time.
for word in wordDictA:
    wordDictA[word] = 0
for word in bowA:
    wordDictA[word] += 1

for word in wordDictB:
    wordDictB[word] = 0
for word in bowB:
    wordDictB[word] += 1



In [62]:

    
# Inspect document A's counter after counting.
wordDictA









    Out[62]:





{'The': 1, 'cat': 1, 'bed': 0, 'dog': 0, 'my': 1, 'face': 1, 'sat': 1, 'on': 1}



In [63]:

    
# Render the two raw-count dicts as a table built from a list of dicts.
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])



In [64]:

    
def computeTF(wordDict, bow):
    """Compute the term frequency of every word of ``wordDict`` for one document.

    Args:
        wordDict: mapping of word -> raw count of that word in the document.
        bow: the document's token list; only its length is used, as the
            normalising denominator.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(bow)``
        as a float. When ``bow`` is empty every frequency is ``0.0`` rather
        than raising ``ZeroDivisionError``.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}



In [65]:

    
# Term-frequency dict for each document, from its counts and its token list.
tfBowA, tfBowB = computeTF(wordDictA, bowA), computeTF(wordDictB, bowB)



In [66]:

    
# Inspect the term frequencies of document A.
tfBowA









    Out[66]:





{'The': 0.16666666666666666,
 'cat': 0.16666666666666666,
 'bed': 0.0,
 'dog': 0.0,
 'my': 0.16666666666666666,
 'face': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'on': 0.16666666666666666}



In [67]:

    
# Inspect the term frequencies of document B.
tfBowB









    Out[67]:





{'The': 0.16666666666666666,
 'cat': 0.0,
 'bed': 0.16666666666666666,
 'dog': 0.16666666666666666,
 'my': 0.16666666666666666,
 'face': 0.0,
 'sat': 0.16666666666666666,
 'on': 0.16666666666666666}



In [75]:

    
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: list of dicts, one per document, each mapping
            word -> raw count of that word in that document.

    Returns:
        A dict mapping every word that is a key of any document to
        ``log10(N / df)``, where ``N`` is ``len(docList)`` and ``df`` is the
        number of documents whose count for that word is greater than zero.
        A word whose ``df`` is zero gets ``0.0`` (the ratio is undefined and
        would otherwise raise ``ZeroDivisionError``). An empty ``docList``
        yields an empty dict.
    """
    import math

    N = len(docList)

    # Collect document frequencies over the keys of *every* document, not
    # just the first one, so a word that only a later document knows about
    # cannot raise KeyError. First-seen order is kept.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            if word not in docFreq:
                docFreq[word] = 0
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, df in docFreq.items():
        # Guard df == 0: the word is in the vocabulary but in no document.
        idfDict[word] = math.log10(N / float(df)) if df > 0 else 0.0

    return idfDict



In [79]:

    
# IDF weights computed over the two-document corpus of raw counts.
idfs = computeIDF(docList=[wordDictA, wordDictB])



In [78]:

    
def computeTFIDF(tfBow, idfs):
    """Scale each term frequency by the matching IDF weight.

    Args:
        tfBow: mapping of word -> term frequency for one document.
        idfs: mapping of word -> inverse document frequency; it is indexed
            with every key of ``tfBow``.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idfs[word]``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}



In [80]:

    
# TF-IDF dict for each document, using the shared IDF weights.
tfidfBowA, tfidfBowB = computeTFIDF(tfBowA, idfs), computeTFIDF(tfBowB, idfs)



In [83]:

    
# Render the two TF-IDF dicts as a table built from a list of dicts.
import pandas as pd
pd.DataFrame([tfidfBowA, tfidfBowB])

	The	bed	cat	dog	face	my	on	sat
0	0.0	0.000000	0.050172	0.000000	0.050172	0.0	0.0	0.0
1	0.0	0.050172	0.000000	0.050172	0.000000	0.0	0.0	0.0