Re module

Syntax of regular expressions


In [ ]:
# DOTALL == 16
from re import search, DOTALL

# Every print() below shows True when all of its checks hold.
# Patterns containing a backslash are raw strings, so the backslash reaches
# the regex engine unchanged; the '\n' subjects stay ordinary strings because
# they must contain a real newline character.

# . matches any single character except a newline, unless DOTALL is given
print(search('.', 'c') is not None and
      search('.', 'D') is not None and
      search('.', '_') is not None and
      search('.', '5') is not None and
      search('.', ' ') is not None and
      search('.', '@') is not None and
      search('.', '\n') is None and
      search('.', '\n', DOTALL) is not None)
print()

# w for "word": letters, digits and the underscore
print(search(r'\w', 'c') is not None and
      search(r'\w', 'D') is not None and
      search(r'\w', '_') is not None and
      search(r'\w', '5') is not None and
      search(r'\w', ' ') is None and
      search(r'\w', '@') is None and
      search(r'\w', '\n') is None)

# W is the complement of w
print(search(r'\W', 'c') is None and
      search(r'\W', 'D') is None and
      search(r'\W', '_') is None and
      search(r'\W', '5') is None and
      search(r'\W', ' ') is not None and
      search(r'\W', '@') is not None and
      search(r'\W', '\n') is not None)
print()

# d for "digit"
print(search(r'\d', 'c') is None and
      search(r'\d', 'D') is None and
      search(r'\d', '_') is None and
      search(r'\d', '5') is not None and
      search(r'\d', ' ') is None and
      search(r'\d', '@') is None and
      search(r'\d', '\n') is None)

# D is the complement of d
print(search(r'\D', 'c') is not None and
      search(r'\D', 'D') is not None and
      search(r'\D', '_') is not None and
      search(r'\D', '5') is None and
      search(r'\D', ' ') is not None and
      search(r'\D', '@') is not None and
      search(r'\D', '\n') is not None)
print()

# s for "space": whitespace, which includes the newline
print(search(r'\s', 'c') is None and
      search(r'\s', 'D') is None and
      search(r'\s', '_') is None and
      search(r'\s', '5') is None and
      search(r'\s', ' ') is not None and
      search(r'\s', '@') is None and
      search(r'\s', '\n') is not None)

# S is the complement of s
print(search(r'\S', 'c') is not None and
      search(r'\S', 'D') is not None and
      search(r'\S', '_') is not None and
      search(r'\S', '5') is not None and
      search(r'\S', ' ') is None and
      search(r'\S', '@') is not None and
      search(r'\S', '\n') is None)

In [ ]:
from re import search

# Character classes
# Inside [...], a literal ] has to be escaped, and ^ is literal unless it is
# the first character (where it negates the class); writing it as \^ makes it
# literal at any position. Raw strings keep those backslashes intact.
print(search(r'[2-5d-h[\]^]', '3') is not None and
      search(r'[2-5d-h[\]^]', 'e') is not None and
      search(r'[\^2-5d-h[\]]', '[') is not None and
      search(r'[\^2-5d-h[\]]', ']') is not None and
      search(r'[\^2-5d-h[\]]', '^') is not None)
print(search(r'[2-5d-h^]', '6') is None and search(r'[\^2-5d-h]', 'c') is None)
print()

# A leading ^ negates the class; the second ^ is then an ordinary member
print(search(r'[^^2-5d-h[\]]', '3') is None and
      search(r'[^^2-5d-h[\]]', 'e') is None and
      search(r'[^^2-5d-h[\]]', '[') is None and
      search(r'[^^2-5d-h[\]]', ']') is None and
      search(r'[^^2-5d-h[\]]', '^') is None)
print(search(r'[^^2-5d-h[\]]', '6') is not None and
      search(r'[^^2-5d-h[\]]', 'c') is not None)

In [ ]:
from re import search, escape

# escape() backslash-escapes the special characters of its argument, so the
# text is searched for literally instead of being parsed as a pattern.
print(search(escape('[^ab](*'), '0[^ab](*A') is not None)

In [ ]:
from re import search

# Alternation
# A literal | or \ has to be escaped in the pattern. The third check spells
# the pattern as an ordinary string (every backslash doubled), the fourth as
# the equivalent raw string.
print(search(r'123|abc|\|@', '123') is not None and
      search(r'123|abc|\|@', 'abc') is not None and
      search('123|abc|\\|\\\\@', '|\\@') is not None and
      search(r'123|abc|\|\\@', r'|\@') is not None)

In [ ]:
from re import search

# Quantifiers
# ? is "zero or one", * is "zero or more", + is "one or more"
print(search('a?', '') is not None and
      search('a?', 'a') is not None)
print(search('a*', '') is not None and
      search('a*', 'a') is not None and
      search('a*', 'aa') is not None and
      search('a*', 'aaa') is not None)
print(search('a+', 'a') is not None and
      search('a+', 'aa') is not None and
      search('a+', 'aaa') is not None)
print()

# {m} is "exactly m", {m,n} is "between m and n"; either bound may be omitted
print(search('a{3}', 'aaa') is not None)
# No space before or after the comma!
print(search('a{2,4}', 'aa') is not None and
      search('a{2,4}', 'aaa') is not None and
      search('a{2,4}', 'aaaa') is not None)
print(search('a{,3}', '') is not None and
      search('a{,3}', 'a') is not None and
      search('a{,3}', 'aa') is not None and
      search('a{,3}', 'aaa') is not None)
print(search('a{3,}', 'aaa') is not None and
      search('a{3,}', 'aaaa') is not None and
      search('a{3,}', 'aaaaa') is not None)

In [ ]:
from re import search

# Quantifiers are greedy by default; appending ? makes them reluctant.
# Each row pairs the greedy form with its reluctant counterpart, and the
# resulting tuple of Match objects is the cell's displayed value.
(
    search('a?', 'aaaa'), search('a??', 'aaaa'),
    search('a*', 'aaaaa'), search('a*?', 'aaaaa'),
    search('a+', 'aaaaa'), search('a+?', 'aaaaa'),
    search('a{2,3}', 'aaaaa'), search('a{2,3}?', 'aaaaa'),
    search('a{2,}', 'aaaaa'), search('a{2,}?', 'aaaaa'),
)

In [ ]:
# Alternatively, replace MULTILINE by M
# MULTILINE == M == 8
from re import search, MULTILINE

# \b is a word boundary and \B a non-boundary. Raw strings are essential
# here: in an ordinary string '\b' would be the backspace character.
print(search(r'\bab', ' ab') is not None and search(r'ab\b', 'ab!') is not None)
print(search(r'\Bab', '_ab') is not None and search(r'ab\B', 'ab2') is not None)

# ^ and $ anchor at the start and end of the string; with MULTILINE they also
# anchor at the start and end of each line
print(search('^ab', 'abc') is not None and
      search('^ab', 'cab') is None and
      search('^ab', 'c\nab') is None and
      search('^ab', 'c\nab', MULTILINE) is not None)
print(search('ab$', 'cab') is not None and
      search('ab$', 'abc') is None and
      search('ab$', 'ab\nc') is None and
      search('ab$', 'ab\nc', MULTILINE) is not None)
print()

# \A and \Z anchor at the start and end of the whole string only
print(search(r'\Aab', 'abc') is not None and
      search(r'\Aab', 'cab') is None and
      search(r'\Aab', 'c\nab') is None)
print(search(r'ab\Z', 'cab') is not None and
      search(r'ab\Z', 'abc') is None and
      search(r'ab\Z', 'ab\nc') is None)

In [ ]:
# Alternatively, replace IGNORECASE by I, ASCII by A
# IGNORECASE == I == 2
# ASCII == A == 256
from re import search, IGNORECASE, ASCII

# IGNORECASE makes letters match regardless of case
print(search('aBcD', 'AbcD') is None and search('aBcD', 'AbcD', IGNORECASE) is not None)
# ASCII restricts \w, \b and \B to ASCII characters, so accented letters stop
# counting as word characters
print(search(r'\w', 'î') is not None and search(r'\w', 'î', ASCII) is None)
print(search(r'\bab', 'éab') is None and search(r'\bab', 'éab', ASCII) is not None)
print(search(r'ab\B', 'abü') is not None and search(r'ab\B', 'abü', ASCII) is None)
print()

# Flags are combined with the bitwise OR operator
print(search(r'\beF', 'àEf', IGNORECASE | ASCII) is not None)

In [ ]:
from re import search

# \N matches the same text that capturing group number N matched
print(search(r'(\w+) (\w+) \1 (\w+) \1 \3', 'abc def abc ghi abc ghi') is not None)
print(search(r'(\d)\1(2)', '332') is not None)
# Capturing and noncapturing parentheses: (?:...) groups are not numbered
print(search(r'(?:\w+) (\w+) \1 (\w+) (?:\w+) (\w+) \3 \2',
             'abc def def ghi jkl mno mno ghi') is not None)

In [ ]:
from re import findall

# What findall returns depends on the number of capturing groups in the
# pattern: whole matches (no group), the group's text (one group), or a
# tuple of group texts (several groups). One result list is printed per line.
print(*(findall(pattern, 'abababa')
        for pattern in ('aba', '(ab)a', '(ab)(a)', '((ab)a)')),
      sep='\n')
print(*(findall(pattern, 'abacaba')
        for pattern in ('(a|b)+', '(?:a|b)+', '((a|b)+)')),
      sep='\n')
print()

print(*(findall(pattern, 'aaa') for pattern in ('a+', 'a*')), sep='\n')

In [ ]:
from re import findall

# Lookahead and lookbehind assertions restrict where 'a' may match without
# consuming the neighbouring character. One result list is printed per line.
print(*(findall(pattern, 'aaabaab') for pattern in (
            'a',          # every a
            'a(?=b)',     # a followed by b
            'a(?!b)',     # a not followed by b
            '(?<=b)a',    # a preceded by b
            '(?<!b)a',    # a not preceded by b
        )),
      sep='\n')

In [ ]:
from re import search

# If group (here, 2) matches then match this, (optionally) else match that
# The group is referred to by number in the first print, by name in the second.
print(search(r'_(abc)(\d{3})?(?(2)aaa)', '_abc345aaa') is not None and
      search(r'_(abc)(\d{3})?(?(2)aaa|BBB)', '_abc345aaa') is not None and
      search(r'_(abc)(\d{3})?(?(2)aaa|BBB)', '_abcBBB') is not None)
print(search(r'_(abc)(?P<word_2>\d{3})?(?(word_2)aaa)', '_abc345aaa') is not None and
      search(r'_(abc)(?P<word_2>\d{3})?(?(word_2)aaa|BBB)', '_abc345aaa') is not None and
      search(r'_(abc)(?P<word_2>\d{3})?(?(word_2)aaa|BBB)', '_abcBBB') is not None)

In [ ]:
# Alternatively, replace VERBOSE by X
# VERBOSE == X == 64
# DEBUG == 128
from re import search, VERBOSE, DEBUG

# Under VERBOSE, whitespace and #-comments inside the pattern are ignored, so
# a literal space has to be escaped (or placed in a character class); an
# unescaped space inside the group would silently be dropped.
print(search(r'''
             \d{3}   # Comments
             (\ |-)  # and spaces
             [a-z]   # are ignored
             ''', '845-f', VERBOSE) is not None)
print()

# DEBUG displays debug information about the compiled pattern; the Match
# object itself is then printed as usual
print(search(r'\d{3}( |-)[a-z]', '845-f', DEBUG))

Functions

All occurrences of f(pattern, ...) below can be replaced by compile(pattern).f(...)

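No occurrence of compile(pattern).f(...) below can be replaced by f(pattern, ...), because those calls pass the pos / endpos arguments, which only the methods of a compiled pattern accept.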

Functions that return a Match object


In [ ]:
from re import compile, search

# search() scans forward for the first match. The compiled pattern's method
# additionally takes pos and endpos, which bound the region that is scanned.
# One result (a Match object or None) is printed per line.
print(search('cd', 'abcde'))
print(*(compile('cd').search('abcde', *bounds)
        for bounds in ((2,), (3,), (2, 5), (2, 3))),
      sep='\n')

In [ ]:
from re import compile, match

# match() succeeds only if the pattern matches at the starting position:
# index 0 for the module-level function, pos for the compiled pattern's method.
# One result (a Match object or None) is printed per line.
print(*(match('cd', subject) for subject in ('cde', 'abcde')), sep='\n')
print(*(compile('cd').match('abcde', *bounds)
        for bounds in ((2,), (3,), (2, 5), (2, 3))),
      sep='\n')

In [ ]:
from re import fullmatch

# fullmatch() succeeds only when the pattern matches the entire string.
print(*(fullmatch(pattern, 'cde') for pattern in ('cd', 'cde')), sep='\n')

In [ ]:
from re import compile, findall

# findall over the whole string, then over the region starting at pos, then
# over the region between pos and endpos; all three lists go on one line.
print(findall('.', 'abcdef'),
      *(compile('.').findall('abcdef', *bounds) for bounds in ((2,), (2, 5))))

In [ ]:
from re import compile, finditer

# finditer returns an iterator that yields one Match object per match.
# The loop variable is deliberately not called `match`: that name would
# overwrite the re.match function imported by another cell of this notebook
# and leave the kernel in a state that depends on execution order.
matches = finditer('.', 'abcdef')
for found in matches:
    print(found)
print()

# Same, restricted to the region starting at pos
matches = compile('.').finditer('abcdef', 2)
for found in matches:
    print(found)
print()

# Same, restricted to the region between pos and endpos
matches = compile('.').finditer('abcdef', 2, 5)
for found in matches:
    print(found)

Functions applied to a Match object


In [ ]:
from re import search

# Search once and query the resulting Match object several times.
# group() and group(0) give the whole match, group(N) the text of group N,
# group(N, M, ...) a tuple, and groups() the tuple of all groups.
found = search(r'\w+ (\w+) (\w+ (\w+))', 'ab cd ef gh ij')
print(found.group())
print(found.group(0))
print(found.group(1))
print(found.group(2))
print(found.group(3))
print(found.group(1, 3))
print(found.group(0, 2, 1))
print(found.groups())
print()

# Named groups can be retrieved by name; groupdict() maps names to texts
found_named = search(r'\w+ (?P<word_1>\w+) (?P<word_2>\w+ (?P<word_3>\w+))',
                     'ab cd ef gh ij')
print(found_named.group('word_2'))
print(found_named.group('word_1', 'word_3'))
print(found_named.groupdict())

In [ ]:
from re import search

# Search once and query the resulting Match object several times.
# With no argument (or 0) the positions are those of the whole match;
# with N they are those of group N.
found = search(r'\w+ (\w+) (\w+ (\w+))', 'ab cd ef gh ij')

# start(): index of the first character
print(found.start())
print(found.start(0))
print(found.start(1))
print(found.start(2))
print()

# end(): index just past the last character
print(found.end())
print(found.end(0))
print(found.end(1))
print(found.end(2))
print()

# span(): the (start, end) pair
print(found.span())
print(found.span(0))
print(found.span(1))
print(found.span(2))

Functions that do not return a Match object


In [ ]:
from re import split

# Splitting on a pattern; when the pattern has a capturing group, the
# separators are kept in the result
print(split(':!', 'ab:!cd:ef!gh:!ij'))
print(split('(:!)', 'ab:!cd:ef!gh:!ij'))
print()

# maxsplit caps the number of splits (0, the default, means no limit).
# It is passed by keyword: passing it positionally is deprecated.
print(split(' ', 'ab cd ef gh ij'))
print(split(' ', 'ab cd ef gh ij', maxsplit=0))
print(split(' ', 'ab cd ef gh ij', maxsplit=6))
print(split(' ', 'ab cd ef gh ij', maxsplit=3))
print(split(' ', 'ab cd ef gh ij', maxsplit=2))

In [ ]:
from re import sub, subn

# sub() replaces non-overlapping matches, left to right. count caps the
# number of replacements (0, the default, means no limit). It is passed by
# keyword: passing it positionally is deprecated.
print(sub('aba', '*', 'ababababababa'))
print(sub('aba', '*', 'ababababababa', count=0))
print(sub('aba', '*', 'ababababababa', count=4))
print(sub('aba', '*', 'ababababababa', count=2))
print(sub('aba', '*', 'ababababababa', count=1))
print()

# subn() returns the pair (new string, number of replacements made)
print(subn('aba', '*', 'ababababababa'))
print(subn('aba', '*', 'ababababababa', count=0))
print(subn('aba', '*', 'ababababababa', count=4))
print(subn('aba', '*', 'ababababababa', count=2))
print(subn('aba', '*', 'ababababababa', count=1))

In [ ]:
from re import sub, subn

# In the replacement string, \N and \g<N> insert group N and \g<name> inserts
# a named group. \g<1>2 is group 1 followed by a literal 2, which \12 could
# not express (it would be read as group 12).
print(sub(r'(?:\w+) (\w+) \1 (\w+) (?:\w+) (\w+) \3 \2', r'*\2--\g<1>2!\3',
          'abc def def ghi jkl mno mno ghi ABC DEF DEF GHI JKL MNO MNO GHI'))
print(sub(r'\w+ (?P<word_1>\w+) (?P<word_2>\w+ (?P<word_3>\w+))', r'\g<word_2>--\g<word_1>2\3',
          'ab cd ef gh ij AB CD EF GH IJ'))
print()

# Same substitutions with subn(), which also reports how many were made
print(subn(r'(?:\w+) (\w+) \1 (\w+) (?:\w+) (\w+) \3 \2', r'*\2--\g<1>2!\3',
           'abc def def ghi jkl mno mno ghi ABC DEF DEF GHI JKL MNO MNO GHI'))
print(subn(r'\w+ (?P<word_1>\w+) (?P<word_2>\w+ (?P<word_3>\w+))', r'\g<word_2>--\g<word_1>2\3',
           'ab cd ef gh ij AB CD EF GH IJ'))