In [ ]:
#Problem: The Google of Quotes
#Group members: S. Wang, Y. Ye, M. Staton, W. Wang
#Version 1.1
In [ ]:
#to access math.log(), and regex functions
import math, re
#to access collections.Counter() function which counts the occurence of items in a list
from collections import Counter
#to facilitate timing the script
from time import time
In [ ]:
#start timing
start_time0 = time()
In [ ]:
#make a handle on the file
quotes_file = open("quotes.txt", "r")
In [ ]:
#make an empty list to store the quotes
quote_list = []
#read in one line each time
new_line = quotes_file.readline()
#join the adjacent two lines to make a full quote
while len(new_line) != 0:
str1, str2 = new_line.strip("\r\n"), quotes_file.readline().strip("\r\n")
quote_list.extend([" - ".join([str1, str2])])
new_line = quotes_file.readline()
In [ ]:
#for safety, close the handle. This can also be done with "with open() as handle"
quotes_file.close()
In [ ]:
#this function take one quote and make a list of the unique words in that quote
def words_in_quote(quote):
pattern = "\W+"
word_list = [word.lower() for word in re.split(pattern, quote)]
while "" in word_list:
word_list.remove("")
return word_list
In [ ]:
#define a function to create a posting list given a list of full quotes
def postings_list(quotes):
start_time = time()
if type(quotes) != list:
print "This function only takes list arguments."
return {}
postings_list_dict = {}
for pair in [(quote, dict(Counter(words_in_quote(quote)))) for quote in quotes]:
postings_list_dict[pair[0]] = pair[1]
end_time = time()
print end_time - start_time
return postings_list_dict
In [ ]:
postings_list_dict = postings_list(quote_list)
In [ ]:
#define a function to create a reverse posting list given a list of full quotes
def rev_postings_list(quotes):
start_time = time()
if type(quotes) != list:
print "This function only takes list arguments."
return {}
rev_postings_list_dict = {}
forward_dict = postings_list(quotes)
for quote, word_count in forward_dict.items():
for word, count in word_count.items():
if word not in rev_postings_list_dict:
rev_postings_list_dict[word] = {}
rev_postings_list_dict[word][quote] = count
else:
rev_postings_list_dict[word][quote] = count
end_time = time()
print end_time - start_time
return rev_postings_list_dict
In [ ]:
#warning: trying printing reverse_postings_list_dict foreshadows disaster!
reverse_postings_list_dict = rev_postings_list(quote_list)
In [ ]:
#this function take a tuple of (quote, word) and returns the TF_IDF index of word in quote
def tf_idf((quote, word)):
word = str(word).lower()
if quote not in postings_list_dict:
print "The quote is not found."
return 0
if word not in reverse_postings_list_dict:
print "The word is not found."
return 0
max_ocur_in_quote = max([value for (key, value) in postings_list_dict[quote].items() ])
TF = float(count_ocur(quote, word)) / max_ocur_in_quote
#print "TF=", TF
N = len(quote_list)
n = len([(key, value) for (key, value) in reverse_postings_list_dict[word].items() if value > 0])
IDF = math.log(float(N)/n)
#print "IDF=", IDF
TF_IDF = TF * IDF
return TF_IDF
In [ ]:
tf_idf(("We are all worms, but I do believe I am a glow-worm. - Winston Churchill", "we"))
In [ ]:
#print TF_IDF index of "entertainer" in the Marlon Brando quote to verify the result
print tf_idf((quote_list[0], "we"))
In [ ]:
#define a function to search a single word in all quotes
def sg_word_search(default_word=""):
if len(default_word) == 0:
word = ""
while len(word) == 0:
word = str(raw_input("Type in a word to search, or \"!!!\" to quit: ")).lower().strip()
if word == "!!!":
print "See you next time."
return {}
else:
word = default_word
search_dict = {}
if word not in reverse_postings_list_dict:
print word, "is not found."
return {}
retrieved_quotes = [key for (key, value) in reverse_postings_list_dict[word].items() if value > 0]
for quote in retrieved_quotes:
search_dict[quote] = tf_idf((quote, word))
#print word, " in ", quote, word in quote
return search_dict
In [ ]:
#this function takes a list of words and return all the quotes containing any of the words
#and shows the sum of the TF_IDF index of every word in every quote
def mul_word_search():
words = []
while True:
if len(words) == 0:
text = raw_input("Type one word and Enter\n\"!!!\" >>> quit\n:").lower().strip()
else:
text = raw_input("Type one word and Enter\n\"???\" >>> start searching\n\"!!!\" >>> quit\n:").lower().strip()
if text == "!!!":
print "See you next time."
return {}
if text == "???" and len(words) > 0:
break
else:
words.extend([text])
continue
while "" in words:
words.remove("")
words = [s.lower() for s in words]
search_dict2 = {}
retrieved_quotes2 = map(sg_word_search, words)
_quote_list = []
for quote_ifidf_dict in retrieved_quotes2:
_quote_list.extend(quote_ifidf_dict.keys())
retrieved_quotes2_unique = Counter(_quote_list).keys()
for quote in retrieved_quotes2_unique:
new_params = [(quote, word) for word in words]
search_dict2[quote] = sum(map(tf_idf, new_params))
return search_dict2
In [ ]:
#print out a sample result of calling sg_word_search on keyword "and"
#check the length of the returned list to verify the result
result = sg_word_search()
print len(result)
print result
In [ ]:
#print out the result of mul_word_search
result = mul_word_search()
print len(result)
print result
In [ ]:
#stop timing and print out total running time
end_time0 = time()
print "Total running time", end_time0 - start_time0, "seconds"