In [ ]:
def print_lines(filename, num_lines):
    count = 0
    with open(filename, 'r') as f:
        for line in f:
            for word in line.split():
                print(word)
            count += 1
            if count >= num_lines:
                break
In [ ]:
print_lines('leaves-of-grass.txt', 4)
In [ ]:
import re
with open('leaves-of-grass.txt', 'r') as f:
    words = [word.lower() for line in f for word in re.findall(r"[\w']+|[.,!?;]", line)]
In [ ]:
print "number of words =", len(words)
print words[0]
print words[12305]
print words[148185]
print words[148186]
In [ ]:
print('leaves' in words)
In [ ]:
print('leaVes' in words)
In [ ]:
print('captain' in words)
In [ ]:
print('internet' in words)
In [ ]:
def contains(wordlist, word):
    if len(wordlist) == 0:
        # Wordlist is empty
        return False
    for i in range(len(wordlist)):
        element = wordlist[i]
        if word == element:
            return True
    # No match in wordlist
    return False
In [ ]:
print(contains(words, 'captain'))
In [ ]:
print(contains(words, 'internet'))
In [ ]:
def contains(wordlist, word):
    if len(wordlist) == 0:
        # Wordlist is empty
        print("Checked 1 word(s)")
        return False
    for i in range(len(wordlist)):
        element = wordlist[i]
        if word == element:
            print("Checked", i + 1, "word(s)")
            return True
    # No match in wordlist
    print("Checked", i + 1, "word(s)")
    return False
In [ ]:
print(contains(words, 'leaves'))
In [ ]:
print(contains(words, 'captain'))
In the worst case, we have to check all words.
In [ ]:
print(contains(words, 'internet'))
What if each comparison took 1 second?
In [ ]:
print "Worst case is %.1f days to query 'words'" % (len(words)/60./60/24)
This is less than a MB of text! What if we wanted to search through every book in the Library of Congress?
Determining average and worst-case runtimes can be complicated, but for a simple for loop it is easy. Since the worst-case runtime is bounded by the number of words ($n$) in the list, increasing the number of words increases the runtime by a linear factor: for a list of $2n$ words, the worst case is $2n$ comparisons.
Other algorithms scale in different ways, for example $n^2$ or even $2^n$. There are more formal details, but these growth rates are referred to as "Big O" notation, so a linear algorithm is $O(n)$.
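To see this scaling concretely, here is a minimal sketch (the helper count_comparisons is ours, not part of the original notebook) that counts comparisons for a missing word, first on the list and then on a doubled copy of it:
In [ ]:
def count_comparisons(wordlist, word):
    # Count how many elements we examine before matching or giving up
    checks = 0
    for element in wordlist:
        checks += 1
        if word == element:
            break
    return checks

# 'internet' is absent, so the whole list is scanned each time
print(count_comparisons(words, 'internet'))
print(count_comparisons(words + words, 'internet'))  # twice the words, twice the checks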
Let's say that instead of strings, our "words" are just integers randomly chosen from 0 to 1 million.
In [ ]:
import random
In [ ]:
print(random.randint(0, 10**6))
print(random.randint(0, 10**6))
print(random.randint(0, 10**6))
print(random.randint(0, 10**6))
In [ ]:
n = len(words)
print(n)
In [ ]:
words = [random.randint(0, 10**6 - 1) for i in range(n)]
In [ ]:
words[0:10]
We can still use our contains function to check membership, because Python is forgiving that way: it doesn't care whether the list holds strings or integers.
In [ ]:
contains(words, 1)
In [ ]:
contains(words, 9999)
In [ ]:
contains(words, 100000)
In [ ]:
contains(words, 413351)
We can afford to preprocess the data for 1 day, but queries should run in 1 second. How can we do this for arbitrary lists?
Since we know that the integers are bounded by $[0, 10^6)$, we can build a custom data structure to represent our unsorted set of words: a list where the index is the integer in question and the value is a boolean (True if the integer is in the set, False if not).
This type of data structure is sometimes called a bitmap: it maps each integer to a single "bit" (yes or no).
In [ ]:
# Start with the empty set of ints from 0 to 999,999
words_map = [False for i in range(1000000)]
In [ ]:
words_map[0:10]
If we want to say the integer 2 is in our word set, we set words_map[2] = True
In [ ]:
words_map[2] = True
In [ ]:
words_map[0:10]
Starting from our empty set, let's load our word list in.
In [ ]:
words_map = [False for i in range(1000000)]
for i in words:
    words_map[i] = True
In [ ]:
contains(words, 219943)
In [ ]:
words_map[219943]
We did that lookup checking only one entry!
Now we can write a contains function that does the same thing!
In [ ]:
def contains_map(word_bitmap, word):
    # Caution: only works on integers from 0 to len(word_bitmap) - 1!
    print("Checked 1 word(s)")
    return word_bitmap[word]
In [ ]:
contains(words, 219943)
In [ ]:
contains_map(words_map, 219943)
In [ ]:
contains(words, 99149)
In [ ]:
contains_map(words_map, 129343)
Could we build a bitmap for strings? A bitmap needs one slot for every possible key, and even short words have far too many combinations:
In [ ]:
for i in range(2, 10):
    print("Word of length %d has %d possible combinations" % (i, 26**i))
So to use a bitmap-like structure, we need to squeeze a huge range of keys into a small array. Integer division and the modulo (remainder) operator % are the tools:
In [ ]:
4 // 2
In [ ]:
4 % 2
In [ ]:
5 // 2
In [ ]:
5 % 2
In [ ]:
2000 // 1000
In [ ]:
2999 // 1000
In [ ]:
2999 % 1000
In [ ]:
3000 // 1000
If we take $N \bmod k$ (written N % k in Python), the result will always be between $0$ and $k - 1$.
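A quick sanity check of that claim, a small sketch drawing random numbers and confirming every remainder lands in [0, m):
In [ ]:
m = 1000
print(all(0 <= random.randint(0, 10**9) % m < m for _ in range(10000)))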
In [ ]:
k = 100000
In [ ]:
words_map = [False for i in range(k)]
In [ ]:
words_map
In [ ]:
len(words_map)
Populate our words_map:
In [ ]:
for i in words:
    words_map[i % k] = True
In [ ]:
words_map
In [ ]:
words_map[5]
In [ ]:
5 % k
In [ ]:
100005 % k
This is called a collision. In a basic hashing bitmap, we can only say with certainty that a number is NOT in the map.
In [ ]:
def contains_modmap(word_modmap, k, word):
    print("Checked 1 word(s)")
    if word_modmap[word % k]:
        return "Maybe"
    else:
        return False
In [ ]:
contains_modmap(words_map, 100000, 219943)
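The flip side: any number that collides with a stored one also reports "Maybe", even if it was never inserted. For example, 319943 lands in the same slot as 219943 (both leave remainder 19943 mod 100000):
In [ ]:
contains_modmap(words_map, 100000, 219943 + 100000)
A True slot only tells us that some inserted number mapped there, not which one.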
Instead of a bitmap of True/False values, let's store the data itself: one list per slot in the map.
In [ ]:
k = 100000
In [ ]:
words_map = [list() for i in range(k)]
In [ ]:
words_map
How do we add a number to words_map?
In [ ]:
number = 3
In [ ]:
word_list = words_map[number % k]
In [ ]:
word_list.append(3)
In [ ]:
words_map
In [ ]:
number = 100003
In [ ]:
word_list = words_map[number % k]
word_list.append(number)
In [ ]:
words_map
Query: is a number in words_map?
In [ ]:
def contains_modmap(word_modmap, k, word):
    word_list = word_modmap[word % k]
    print(contains(word_list, word))
In [ ]:
contains_modmap(words_map, 100000, 100003)
In [ ]:
words_map[3 % 100000]
In [ ]:
def our_hash(n):
    k = 100000
    return n % k
We set k equal to the length of our array; by changing the modulus k we can use an array of any size.
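Putting the pieces together, here is a minimal sketch of that idea as a class (ModSet is our name for it, not something from the original notebook); the array size k is chosen once, at construction time:
In [ ]:
class ModSet:
    def __init__(self, k):
        self.k = k
        self.buckets = [list() for i in range(k)]

    def add(self, item):
        # Hash the item to a bucket, then store the item itself
        self.buckets[item % self.k].append(item)

    def contains(self, item):
        # Only one bucket is scanned, however many items are stored
        return item in self.buckets[item % self.k]

s = ModSet(1000)
s.add(3)
s.add(100003)
print(s.contains(100003), s.contains(7))
A larger k uses more memory but keeps each bucket shorter, so it trades space for query speed.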
Python has a built-in hash function that maps almost any value, including strings, to an integer:
In [ ]:
hash(0)
In [ ]:
hash(1)
In [ ]:
hash("Ryan")
In [ ]:
hash("Ryan") % 10
In [ ]:
# Load the book in again
In [ ]:
with open('leaves-of-grass.txt', 'r') as f:
    words = [word.lower() for line in f for word in re.findall(r"[\w']+|[.,!?;]", line)]
In [ ]:
k = 100000
words_hashtable = [list() for i in range(k)]
for word in words:
    word_list = words_hashtable[hash(word) % k]
    word_list.append(word)
In [ ]:
words_hashtable
In [ ]:
def contains_hashtable(words_hashtable, k, word):
    word_list = words_hashtable[hash(word) % k]
    print(contains(word_list, word))
In [ ]:
print(contains(words, 'captain'))
In [ ]:
contains_hashtable(words_hashtable, 100000, 'lincoln')
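Why is scanning one bucket fast? Because hash(word) % k spreads the words across many buckets, each bucket stays short. A quick look at the bucket sizes (a small sketch, not from the original notebook):
In [ ]:
lengths = [len(bucket) for bucket in words_hashtable]
nonempty = [l for l in lengths if l > 0]
print("longest bucket:", max(lengths))
print("average nonempty bucket: %.1f" % (sum(nonempty) / len(nonempty)))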
Strings compare in dictionary order, which means lists of strings can be sorted:
In [ ]:
print "leaves" < "zebra"
In [ ]:
print "leaves" < "abc"
In [ ]:
words.sort()
In [ ]:
print(words[0])
In [ ]:
print(words[30000])
In [ ]:
print(words[90000])
In [ ]:
print(words[148186])
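Sorting buys us speed too: on a sorted list, each comparison lets us discard half of the remaining range, so we only need about $\log_2 n$ checks (roughly 18 for our 148,187 words). A minimal binary-search sketch, assuming words is sorted as above:
In [ ]:
def contains_sorted(sorted_words, word):
    lo, hi = 0, len(sorted_words)
    checks = 0
    while lo < hi:
        mid = (lo + hi) // 2
        checks += 1
        if sorted_words[mid] == word:
            print("Checked", checks, "word(s)")
            return True
        elif sorted_words[mid] < word:
            lo = mid + 1   # discard the lower half
        else:
            hi = mid       # discard the upper half
    print("Checked", checks, "word(s)")
    return False

print(contains_sorted(words, 'captain'))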
By the way, the in operator we have been using is just a call to the __contains__ method under the hood:
In [ ]:
words.__contains__('captain')