In [1]:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
def get_words(line):
    '''
    Split a piece of text into a list of lowercase words.

    All ASCII digits are deleted first, the text is then split on
    whitespace, and ASCII punctuation is stripped from both ends of
    every token. Tokens that end up empty are dropped.
    >>> get_words('Here are severall1000 words!\\n Blub')
    ['here', 'are', 'severall', 'words', 'blub']
    >>> get_words('"Where are they? 10 \t 1000!?"')
    ['where', 'are', 'they']
    '''
    import string
    without_digits = line.lower().translate(
        str.maketrans('', '', string.digits))
    result = []
    for token in without_digits.split():
        # Only leading/trailing punctuation is removed; inner marks stay.
        core = token.strip(string.punctuation)
        if core:
            result.append(core)
    return result
def word_counter(f):
    '''
    Count how often each word occurs in a text file.

    Parameters:
        f: forwarded unchanged to fileinput.input(). Callers in this
           file pass a path string or None; per the fileinput
           documentation, None makes it read the files named on the
           command line, or standard input if there are none.

    Returns:
        A collections.Counter mapping every word produced by
        get_words() to its number of occurrences.
    '''
    import fileinput
    from collections import Counter
    word_counts = Counter()
    # The context manager closes the input even when reading or
    # get_words() raises; otherwise the module-global fileinput state
    # stays active and the next fileinput.input() call fails.
    with fileinput.input(f) as lines:
        for line in lines:
            # update() counts in place. `+=` would build a temporary
            # Counter per line and rescan the whole accumulated
            # counter every time.
            word_counts.update(get_words(line))
    return word_counts
def main():
    '''
    Print every counted word with its frequency, most frequent first.

    The input is whatever word_counter(None) reads.
    '''
    wordc = word_counter(None)
    # most_common() sorts by descending count and keeps first-seen
    # order among ties.
    for word, occurrences in wordc.most_common():
        print(word, occurrences)
# Executed only when this code runs as the main module.
if __name__ == '__main__':
    # Run the examples embedded in the docstrings above as tests.
    import doctest
    doctest.testmod()
    # The call to main() is commented out, so only the doctests run.
    #main()
In [2]:
# Word frequencies of the text in 'sherlock.txt', as a plain dict.
counts = dict(word_counter('sherlock.txt'))
# Keep only the words whose first letter is 'o'.
d = {word: total for word, total in counts.items() if word[0] == 'o'}
# List them from the most to the least frequent.
for w in sorted(d.keys(), key=lambda word: d.get(word), reverse=True):
    print(w, d[w])
In [3]:
def normalize_string(text):
    '''
    Return text in lowercase with every character listed in
    string.punctuation or string.whitespace removed.
    '''
    import string
    dropped = set(string.punctuation) | set(string.whitespace)
    lowered = text.lower()
    return ''.join(ch for ch in lowered if ch not in dropped)
def edit_distance(str1, str2):
    '''
    Compute the edit distance between two strings with the
    Wagner-Fischer dynamic-programming algorithm.

    Both inputs are passed through normalize_string() before the
    comparison. Insertions, deletions and substitutions each cost 1.
    >>> edit_distance('bear', 'bar')
    1
    >>> edit_distance('"computer"', 'comm"uter')
    1
    >>> edit_distance('ea t', 'feat')
    1
    >>> edit_distance('bee!', 'beer')
    1
    >>> edit_distance('science', 'seance')
    3
    >>> edit_distance('laboratory', 'lobotomy')
    4
    '''
    source = normalize_string(str1)
    target = normalize_string(str2)
    rows = len(source) + 1
    cols = len(target) + 1

    # table[r][c] holds the distance between source[:r] and target[:c].
    # Row 0 and column 0 are the distances from/to the empty prefix.
    table = [list(range(cols))]
    for r in range(1, rows):
        table.append([r] + [0] * (cols - 1))

    for r in range(1, rows):
        above = table[r - 1]
        current = table[r]
        for c in range(1, cols):
            if source[r - 1] == target[c - 1]:
                # Matching characters: carry the diagonal value over.
                current[c] = above[c - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                current[c] = 1 + min(above[c], current[c - 1], above[c - 1])

    return table[-1][-1]
# Executed only when this code runs as the main module.
if __name__ == '__main__':
    # Run the examples embedded in the docstrings as tests.
    import doctest
    doctest.testmod()
In [4]:
# Two similar sentences used to try out edit_distance() on longer input.
str1 = ('Source code is hard to write, nevertheless '
        'Python makes it easy to read!')
str2 = ('Sauce code was heard to writhe, nevertheleast '
        'Python makes it supercalifragilisticexpialidocious to reap!')
edit_distance(str1, str2)
Out[4]: