Match all occurrences of the given pattern in the given text.
In [1]:
import re
In [2]:
# Path of the text file that approach1 and approach2 are run against.
path_to_file = "data/do.txt"
In [3]:
# Pattern layout: 7 digits, "-", 2 digits, ".", 4 digits, ".", 1-2 digits,
# ".", 1-2 digits, ".", 4 digits  (e.g. 0001040-18.2017.8.26.0100).
# The middle part (2017.8.26) looks like a date, so the two fields that follow
# the 4-digit one accept one or two digits each (month and day).
# NOTE(review): the pattern has no boundary assertions, so it can also match
# inside a longer run of digits -- confirm whether that is acceptable.
process_pat = re.compile(r'\d{7}-\d{2}\.\d{4}\.\d{1,2}\.\d{1,2}\.\d{4}')
In [4]:
# Reference example taken from the task instructions.
test1 = "0001040-18.2017.8.26.0100"
process_pat.findall(test1)
Out[4]:
In [5]:
# Two-digit month (10) followed by a one-digit day (1): each of the two
# middle fields accepts either one or two digits.
test2 = "0001040-18.2017.10.1.0100"
process_pat.findall(test2)
Out[5]:
In [6]:
# Two occurrences on the same line; findall returns every non-overlapping match.
test3 = "0001040-18.2017.8.1.0100 foo bar baz 0001040-18.2017.8.23.0100"
process_pat.findall(test3)
Out[6]:
In [7]:
# Should NOT match: the field "222" has three digits, but the pattern allows
# at most two digits in that position.
test4 = "0001040-18.2017.8.222.0100"
process_pat.findall(test4)
Out[7]:
In [8]:
# A number may be split across two lines.
test5 = """
0001040-18.2017.8
.26.0100
"""
# The line break between "8" and ".26" prevents a match on this raw text;
# approach2 handles the case by joining adjacent lines before searching.
process_pat.findall(test5)
Out[8]:
In [9]:
def approach1(file, pattern):
    """Collect the distinct pattern matches found on individual lines of a file.

    Every line is searched on its own, so an occurrence that is broken
    across two lines is not detected by this function.

    Args:
        file: Path of the text file to scan (handed to ``open``).
        pattern: Compiled regular expression; ``pattern.findall`` is called
            once per line.

    Returns:
        A ``set`` holding every distinct value returned by ``findall``.
    """
    found = set()
    with open(file, "r") as f:
        for line in f:
            # A line may hold several occurrences, hence findall; adding
            # straight into a set drops duplicates as we go.
            found.update(pattern.findall(line))
    return found
# Run the line-by-line scan on the input file and report each distinct match.
matches = approach1(path_to_file, process_pat)
summary = "{} distinct matches found: \n".format(len(matches))
print(summary)
for found_item in matches:
    print(found_item)
In [10]:
def approach2(file, pattern):
    """Collect distinct pattern matches, including ones split across two lines.

    Each line is joined with the line that follows it (line breaks removed)
    and the pattern is searched in the joined text; the last line is also
    searched on its own. The file is read in a single pass by remembering the
    previous line, instead of rewinding the file after every read.

    Limitations: only occurrences broken by a single line break are
    recovered, and because adjacent lines are glued together without a
    separator, text at the end of one line and the start of the next can
    combine into a match.

    Args:
        file: Path of the text file to scan (handed to ``open``).
        pattern: Compiled regular expression; ``pattern.findall`` is called
            on every pair of adjacent lines.

    Returns:
        A ``set`` holding every distinct value returned by ``findall``.
    """
    found = set()
    previous = None
    with open(file, "r") as f:
        for line in f:
            current = line.rstrip("\n")
            if previous is not None:
                # Search the previous line glued to the current one so that a
                # number broken by the line break becomes contiguous again.
                found.update(pattern.findall(previous + current))
            previous = current
    if previous is not None:
        # The final line has no successor; search it alone (this also covers
        # a file consisting of a single line).
        found.update(pattern.findall(previous))
    return found
# Run the adjacent-line scan on the input file and report each distinct match.
matches = approach2(path_to_file, process_pat)
total = len(matches)
print("{} distinct matches found: \n".format(total))
for found_item in matches:
    print(found_item)