Match all occurrences of the given pattern in the given text.


In [1]:
import re

In [2]:
# text file that approach1 and approach2 are run on (path relative to the working directory)
path_to_file = "data/do.txt"

approach 1: search each line separately (does not match numbers split across line breaks)


In [3]:
# Pattern for numbers such as 0001040-18.2017.8.26.0100:
#   7 digits "-" 2 digits "." 4 digits "." 1-2 digits "." 1-2 digits "." 4 digits
# In the middle we have 2017.8.26, which looks like a date, so the two fields
# after the 4-digit one accept one- or two-digit numbers (month and day).
# NOTE(review): this layout also resembles the Brazilian CNJ unified case number
# (NNNNNNN-DD.AAAA.J.TR.OOOO), where those two fields would be a court-segment
# digit and a two-digit tribunal code rather than a date -- confirm against the
# instructions before relying on the "date" reading.
# The pattern has no anchors or word boundaries, so it also matches inside a
# longer run of digits.
process_pat = re.compile(r'\d{7}-\d{2}\.\d{4}\.\d{1,2}\.\d{1,2}\.\d{4}')

In [4]:
test1 = "0001040-18.2017.8.26.0100" # the sample number from the instructions
# bare last expression: the list of matches is shown as the cell output
process_pat.findall(test1)


Out[4]:
['0001040-18.2017.8.26.0100']

In [5]:
test2 = "0001040-18.2017.10.1.0100" # two-digit month and single-digit day match too
process_pat.findall(test2)


Out[5]:
['0001040-18.2017.10.1.0100']

In [6]:
test3 = "0001040-18.2017.8.1.0100 foo bar baz 0001040-18.2017.8.23.0100" # multiple occurrences in one string: findall returns all of them
process_pat.findall(test3)


Out[6]:
['0001040-18.2017.8.1.0100', '0001040-18.2017.8.23.0100']

In [7]:
test4 = "0001040-18.2017.8.222.0100" # three-digit field where at most two digits are allowed: should NOT match
process_pat.findall(test4)


Out[7]:
[]

In [8]:
# numbers may be split across lines
test5 = """
0001040-18.2017.8
.26.0100
"""

# no match here: nothing in the pattern can consume the line break inside the number
# (this will be solved in approach 2)
process_pat.findall(test5)


Out[8]:
[]

In [9]:
def approach1(file,pattern):
    """Collect the distinct pattern matches in a text file, one line at a time.

    Every line is searched on its own, so an occurrence that is broken by a
    line break is not found.

    Parameters
    ----------
    file : path of the text file to scan (opened in text read mode).
    pattern : compiled regular expression; its ``findall`` is applied to each line.

    Returns
    -------
    set
        The distinct items returned by ``pattern.findall`` over all lines
        (empty if nothing matched).
    """
    with open(file,"r") as handle:
        # findall rather than search: a single line may contain several
        # occurrences. Building a set directly drops the duplicates.
        return {hit for text in handle for hit in pattern.findall(text)}

# running approach 1 on the input file
matches = approach1(path_to_file,process_pat)

# header line with the number of distinct matches, followed by a blank line
print("{} distinct matches found: \n".format(len(matches)))

# matches is a set, so the order in which the numbers are printed is not guaranteed
for match in matches:
    print(match)


8 distinct matches found: 

0033887-73.2017.8.26.0100
0047167-14.2017.8.26.0100
0001040-18.2017.8.26.0100
0028605-54.2017.8.26.0100
0032840-64.2017.8.26.0100
0030677-14.2017.8.26.0100
0026982-52.2017.8.26.0100
0047169-81.2017.8.26.0100

approach 2: match even across line breaks

  • read two lines at a time, so that we capture text which may have been split across lines

  • basic strategy: read two lines, then rewind the file pointer by one line (otherwise we would only check lines 1+2, 3+4, … and miss a number split between lines 2 and 3)


In [10]:
def approach2(file,pattern):
    """Collect the distinct pattern matches in a text file, even across one line break.

    The file is examined as overlapping pairs of consecutive lines (1+2, 2+3,
    ...). Each pair is joined with the line breaks removed before it is
    searched, so an occurrence split over two lines becomes contiguous. The
    last line is also searched on its own. An occurrence spread over more
    than two lines is still missed.

    Parameters
    ----------
    file : path of the text file to scan (opened in text read mode).
    pattern : compiled regular expression; its ``findall`` is applied to each
        joined pair of lines.

    Returns
    -------
    set
        The distinct items returned by ``pattern.findall`` over all pairs
        (empty if nothing matched).
    """
    found = set()

    with open(file,"r") as handle:
        first = handle.readline()

        # readline returns an empty string only at end of file
        while first:
            # remember where the second line starts so it can be read again
            resume_at = handle.tell()
            second = handle.readline()

            # drop the line breaks so a split occurrence becomes one piece of text
            joined = re.sub(r'\n','',first+second)

            # findall rather than search: the pair may hold several occurrences;
            # adding to a set drops the duplicates seen in overlapping pairs
            found.update(pattern.findall(joined))

            # rewind one line: the second line becomes the first of the next pair
            handle.seek(resume_at)
            first = handle.readline()

    return found
            
# running approach 2 on the same input file (this rebinds the notebook-level name matches)
matches = approach2(path_to_file,process_pat)

# header line with the number of distinct matches, followed by a blank line
print("{} distinct matches found: \n".format(len(matches)))

# matches is a set, so the order in which the numbers are printed is not guaranteed
for match in matches:
    print(match)


16 distinct matches found: 

0033887-73.2017.8.26.0100
0047167-14.2017.8.26.0100
0001040-18.2017.8.26.0100
1021197-29.2016.8.26.0100
0028605-54.2017.8.26.0100
1056022-04.2013.8.26.0100
1071666-79.2016.8.26.0100
0155113-55.2011.8.26.0100
0030677-14.2017.8.26.0100
0032840-64.2017.8.26.0100
1055991-47.2014.8.26.0100
0026982-52.2017.8.26.0100
1128654-91.2014.8.26.0100
0047169-81.2017.8.26.0100
1058515-46.2016.8.26.0100
1042408-29.2013.8.26.0100