In [ ]:
###a: quote is the list of quotes that question (a) asks for
# Build `quote` from quotes.txt. The file is consumed two lines at a time:
# each odd-numbered line is remembered in `a`, and each even-numbered line is
# joined to it as "<odd line>-<even line>" and appended to `quote`.
# A trailing unpaired line (odd line count) is not added.
quote = []
n = 0
# The context manager closes the file even if reading fails part-way;
# the original left the handle open.
with open('quotes.txt') as fhand:
    for line in fhand:
        line = line.rstrip()
        n += 1
        if n % 2 != 0:
            a = line
        else:
            quote.append(a + '-' + line)
print(quote[0])  # test quote[0] result
In [ ]:
###b: split every quote into words
import re
def quote2word(a):
    """Split the string *a* into a list of lower-cased words.

    Words are the runs of word characters (``\\w``) in *a*; everything else
    is treated as a separator.

    :param a: text to split.
    :return: list of lower-cased words in order of appearance; empty list
        when *a* contains no word characters.
    """
    # re.split yields '' at the start/end when the text begins or ends with a
    # separator (e.g. a trailing period); drop those so that the empty string
    # is never reported as a word.  The raw string keeps \W intact for the
    # regex engine.
    return [x.lower() for x in re.split(r'\W+', a) if x]
print(quote2word(quote[0]))  # sanity check: words of the first quote
In [ ]:
###c: function postinglist counts each word in a quote and returns a dictionary with the word as key and its count as value.
from collections import Counter
def postinglist(a):
    """Return a plain dict mapping each word of quote *a* to its count.

    The words are those produced by ``quote2word(a)``.
    """
    return dict(Counter(quote2word(a)))
print(postinglist(quote[0]))  # sanity check: word counts of the first quote
In [ ]:
###d: Repostlist is the reverse posting-list dictionary
# Reverse posting list; reverse_postinglist fills it as word -> {quote: count}.
Repostlist = {}
# Starts empty; nothing in the visible code reads or writes it.
valuelist = {}
def reverse_postinglist(a):
    """Record quote *a* in the global ``Repostlist`` under each of its words.

    For every (word, count) pair in ``postinglist(a)``, sets
    ``Repostlist[word][a] = count``, creating the inner dict the first time
    a word is seen.  Returns nothing; the result is the mutation.
    """
    for term, count in postinglist(a).items():
        Repostlist.setdefault(term, {})[a] = count
# Index every quote into Repostlist.  The loop variable is deliberately not
# named `list`: at module level that rebinds the builtin for all code that
# runs afterwards.
for q in quote:
    reverse_postinglist(q)
print(Repostlist)
In [ ]:
###e:
def TF(w, a):
    """Return the term frequency of word *w* in quote *a*.

    Computed as (count of *w* in *a*) / (count of the most frequent word in
    *a*), using the counts from ``postinglist(a)``.

    :param w: word to look up; must be a key of ``postinglist(a)``.
    :param a: quote text.
    :return: float ratio of the two counts.
    :raises KeyError: if *w* does not occur in *a*.
    """
    pl = postinglist(a)
    # float() forces true division.  Under Python 2, int / int truncates, so
    # every word other than the most frequent one scored 0.
    return float(pl[w]) / max(pl.values())
import math
def IDF(w, total_quotes=895):
    """Return the inverse document frequency of word *w*.

    Computed as ``log(total_quotes / number of quotes containing w)``, where
    the number of quotes containing *w* is ``len(Repostlist[w])``.

    :param w: word to look up; must be a key of ``Repostlist``.
    :param total_quotes: size of the collection.  The default 895 is the
        value the original code hard-coded -- presumably the number of
        quotes in quotes.txt; TODO confirm against ``len(quote)``.
    :return: natural logarithm of the ratio, as a float.
    :raises KeyError: if *w* is not in ``Repostlist``.
    """
    # float() forces true division; Python 2 int / int would truncate the
    # ratio before the logarithm is taken.
    return math.log(float(total_quotes) / len(Repostlist[w]))
def TF_IDF(w, a):
    """Return the TF-IDF weight of word *w* in quote *a*: TF(w, a) * IDF(w)."""
    term_frequency = TF(w, a)
    inverse_doc_frequency = IDF(w)
    return term_frequency * inverse_doc_frequency
print(TF_IDF('we', quote[0]))  # sanity check: weight of 'we' in the first quote
In [ ]:
###f:
def Quote_search_single(w):
    """Return ``{quote: TF_IDF(w, quote)}`` for every quote containing *w*.

    The candidate quotes are the keys of ``Repostlist[w]``; a KeyError
    propagates when *w* is not in ``Repostlist``.
    """
    return {q: TF_IDF(w, q) for q in Repostlist[w]}
print(Quote_search_single('we'))  # sanity check: scores of all quotes containing 'we'
In [8]:
###g
def Quote_search_multiple(list):
    """Combine the single-word search results for every word in *list*.

    Each word is looked up with ``Quote_search_single``.  A quote's first
    score is stored as is; every further score for the same quote is
    multiplied into the stored value.  Quotes matched by only one of the
    words therefore keep that word's score unchanged.

    :param list: iterable of query words (the name shadows the builtin
        inside this function; kept because it is the existing signature).
    :return: dict mapping each matched quote to its combined score.
    """
    scores = {}
    for term in list:
        for q, weight in Quote_search_single(term).items():
            scores[q] = scores[q] * weight if q in scores else weight
    return scores
print(len(Quote_search_multiple(['we', 'he'])))  # number of quotes matching 'we' or 'he'
In [ ]: