1 Finding Patterns in Text


In [4]:
import re
# Look for the first occurrence of a literal pattern and report both the
# pattern/text pair and the offsets at which the match was found.
text = 'Does this text match the pattern'
pattern = 'this'
match = re.search(pattern, text)
# span() hands back the (start, end) offsets of the whole match together.
s, e = match.span()
print(
    'Found "{}" \n in "{}" from {} to {} ("{}")'.format(
        match.re.pattern, match.string, s, e, text[s:e]
    )
)


Found "this" 
 in "Does this text match the pattern" from 5 to 9 ("this")

2 Compiling Expressions


In [6]:
import re
# Compile each expression once up front; the compiled objects are then
# reused for every search against the text.
regexes = list(map(re.compile, ('this', 'that')))
text = 'Does this text match the pattern'
print('Text: {!r}\n'.format(text))
for regex in regexes:
    # Emit the label first (no newline), then finish the line with the verdict.
    print('Seeking "{}"->'.format(regex.pattern), end='')
    print('match!' if regex.search(text) else 'no match')


Text: 'Does this text match the pattern'

Seeking "this"->match!
Seeking "that"->no match

3 Multiple Matches


In [7]:
import re
pattern = 'ab'
text = 'abbaaabbbbaaaaa'
# findall() yields the matched substrings themselves (no position info).
found = re.findall(pattern, text)
for match in found:
    print('Found {!r}'.format(match))


Found 'ab'
Found 'ab'

In [8]:
# finditer() produces match objects, so the offsets of each hit are available.
# `pattern` and `text` are the values defined by the preceding cell.
for match in re.finditer(pattern, text):
    s, e = match.span()
    print('Found at {:d}:{:d}'.format(s, e))


Found at 0:2
Found at 5:7

4 Repetition


In [10]:
import re
def test_patterns(text, patterns):
    """Given source text and a list of patterns, look for
    matches for each pattern within the text and print
    them to stdout.
    """
    # Look for each pattern in the text and print the results
    for pattern, desc in patterns:
        print("'{}' ({})\n".format(pattern, desc))
        print("  '{}'".format(text))
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            substr = text[s:e]
            n_backslashes = text[:s].count('\\')
            prefix = '.' * (s + n_backslashes)
            print("  {}'{}'".format(prefix, substr))
        print()
    return

# Greedy repetition: each quantifier takes as many b's as it is allowed to.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('ab*', 'a followdd by zero or more b'),
        ('ab+', 'a followed by one or more b'),
        ('ab?', 'a followed by zero or one b'),
        ('ab{3}', 'a followed by three b'),
        ('ab{2,3}', 'a followed by two or three b'),
    ],
)


'ab*' (a followdd by zero or more b)

  'abbaabbba'
  'abb'
  ...'a'
  ....'abbb'
  ........'a'

'ab+' (a followed by one or more b)

  'abbaabbba'
  'abb'
  ....'abbb'

'ab?' (a followed by zero or one b)

  'abbaabbba'
  'ab'
  ...'a'
  ....'ab'
  ........'a'

'ab{3}' (a followed by three b)

  'abbaabbba'
  ....'abbb'

'ab{2,3}' (a followed by two or three b)

  'abbaabbba'
  'abb'
  ....'abbb'

When processing a repetition instruction, re will usually consume as much of the input as possible while matching the pattern. This so-called greedy behavior may result in fewer individual matches, or the matches may include more of the input text than intended. Greediness can be turned off by following the repetition instruction with ?.


In [11]:
# Same quantifiers as the greedy run, each followed by `?` to turn
# greediness off so the shortest acceptable match is taken.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('ab*?', 'a followdd by zero or more b'),
        ('ab+?', 'a followed by one or more b'),
        ('ab??', 'a followed by zero or one b'),
        ('ab{3}?', 'a followed by three b'),
        ('ab{2,3}?', 'a followed by two or three b'),
    ],
)


'ab*?' (a followdd by zero or more b)

  'abbaabbba'
  'a'
  ...'a'
  ....'a'
  ........'a'

'ab+?' (a followed by one or more b)

  'abbaabbba'
  'ab'
  ....'ab'

'ab??' (a followed by zero or one b)

  'abbaabbba'
  'a'
  ...'a'
  ....'a'
  ........'a'

'ab{3}?' (a followed by three b)

  'abbaabbba'
  ....'abbb'

'ab{2,3}?' (a followed by two or three b)

  'abbaabbba'
  'abb'
  ....'abb'

5 Character Sets


In [12]:
# Character sets: [ab] matches a single character that is either a or b.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('[ab]', 'either a or b'),
        ('a[ab]+', 'a followed by one or more a or b'),
        ('a[ab]+?', 'a followed by one or more a or b, not greedy'),
    ],
)


'[ab]' (either a or b)

  'abbaabbba'
  'a'
  .'b'
  ..'b'
  ...'a'
  ....'a'
  .....'b'
  ......'b'
  .......'b'
  ........'a'

'a[ab]+' (a followed by one or more a or b)

  'abbaabbba'
  'abbaabbba'

'a[ab]+?' (a followed by one or more a or b, not greedy)

  'abbaabbba'
  'ab'
  ...'aa'


In [13]:
# A leading ^ inside the brackets negates the set: match runs of characters
# that are none of '-', '.', or space.
test_patterns(
    text='This is some text -- with punctuation',
    patterns=[('[^-. ]+', 'sequence withouct -, ., or space')],
)


'[^-. ]+' (sequence withouct -, ., or space)

  'This is some text -- with punctuation'
  'This'
  .....'is'
  ........'some'
  .............'text'
  .....................'with'
  ..........................'punctuation'


In [14]:
# Character ranges: a-z / A-Z stand for every letter between the endpoints.
test_patterns(
    text='This is some text -- with punctuation',
    patterns=[
        ('[a-z]+', 'sequence of lowercase letters'),
        ('[A-Z]+', 'sequecne of uppercase letters'),
        ('[a-zA-Z]+', 'sequecne of letters of either case'),
        ('[A-Z][a-z]+', 'one uppercase followed by lowercase'),
    ],
)


'[a-z]+' (sequence of lowercase letters)

  'This is some text -- with punctuation'
  .'his'
  .....'is'
  ........'some'
  .............'text'
  .....................'with'
  ..........................'punctuation'

'[A-Z]+' (sequecne of uppercase letters)

  'This is some text -- with punctuation'
  'T'

'[a-zA-Z]+' (sequecne of letters of either case)

  'This is some text -- with punctuation'
  'This'
  .....'is'
  ........'some'
  .............'text'
  .....................'with'
  ..........................'punctuation'

'[A-Z][a-z]+' (one uppercase followed by lowercase)

  'This is some text -- with punctuation'
  'This'


In [15]:
# The dot stands for any single character; combined with * it can span
# long stretches of the input unless made non-greedy with ?.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('a.', 'a followed by any one character'),
        ('b.', 'b followed by any one character'),
        ('a.*b', 'a followed by anything, end in b'),
        ('a.*?b', 'a followed by anythin, end in b'),
    ],
)


'a.' (a followed by any one character)

  'abbaabbba'
  'ab'
  ...'aa'

'b.' (b followed by any one character)

  'abbaabbba'
  .'bb'
  .....'bb'
  .......'ba'

'a.*b' (a followed by anything, end in b)

  'abbaabbba'
  'abbaabbb'

'a.*?b' (a followed by anythin, end in b)

  'abbaabbba'
  'ab'
  ...'aab'

5 Escape Codes

code Meaning
\d a digit
\D a non-digit
\s whitespace (tab, space, newline, etc.)
\S non-whitespace
\w alphanumeric
\W non-alphanumeric

In [16]:
# Exercise each predefined escape code (\d \D \s \S \w \W) against one
# sample string.  Raw strings keep the backslashes intact for the re module.
test_patterns(
    text='A prime #1 example!',
    patterns=[
        (r'\d+', 'sequece of digits'),
        (r'\D+', 'sequence of non-digits'),
        (r'\s+', 'sequence of whitespace'),
        (r'\S+', 'sequence of non-whitespace'),
        (r'\w+', 'alphanumeric characters'),
        (r'\W+', 'non-alphanumeric'),
    ],
)


'\d+' (sequece of digits)

  'A prime #1 example!'
  .........'1'

'\D+' (sequence of non-digits)

  'A prime #1 example!'
  'A prime #'
  ..........' example!'

'\s+' (sequence of whitespace)

  'A prime #1 example!'
  .' '
  .......' '
  ..........' '

'\S+' (sequence of non-whitespace)

  'A prime #1 example!'
  'A'
  ..'prime'
  ........'#1'
  ...........'example!'

'\w+' (alphanumeric characters)

  'A prime #1 example!'
  'A'
  ..'prime'
  .........'1'
  ...........'example'

'\W+' (non-alphanumeric)

  'A prime #1 example!'
  .' '
  .......' #'
  ..........' '
  ..................'!'

6 Anchoring

code Meaning
^ start of string, or line
$ end of string, or line
\A start of string
\Z end of string
\b empty string at beginning or end of a word
\B empty string not at beginning or end of a word

In [19]:
# Anchoring: ^ / \A pin a match to the start, $ / \Z to the end, and
# \b / \B test for (or against) a word boundary without consuming text.
test_patterns(
    'This is some text -- with punctuation.',
    [(r'^\w+', 'word at start of string'),
     (r'\A\w+', 'word at start of string'),
     (r'\w+\S*$', 'word near end of string'),
     (r'\w+\S*\Z', 'word near end of string'),
     (r'\w*t\w*', 'word containing t'),
     (r'\bt\w+', 't at start of word'),
     (r'\w+t\b', 't at end of word'),
     (r'\Bt\B', 't, not start or end of word')],
)


'^\w+' (word at start of string)

  'This is some text -- with punctuation.'
  'This'

'\A\w+' (word at start of string)

  'This is some text -- with punctuation.'
  'This'

'\w+\S*$' (word near end of string)

  'This is some text -- with punctuation.'
  ..........................'punctuation.'

'\w+\S*\Z' (word near end of string)

  'This is some text -- with punctuation.'
  ..........................'punctuation.'

'\w*t\w*' (word containing t)

  'This is some text -- with punctuation.'
  .............'text'
  .....................'with'
  ..........................'punctuation'

'\bt\w+' (t at start of word)

  'This is some text -- with punctuation.'
  .............'text'

'\w+t\b' (t at end of word)

  'This is some text -- with punctuation.'
  .............'text'

'\Bt\B' (t, not start or end of word)

  'This is some text -- with punctuation.'
  .......................'t'
  ..............................'t'
  .................................'t'

7 Constraining the Search


In [20]:
import re
pattern = 'is'
text = 'This is some text --with punctuation.'
print('Text   :', text)
print('pattern:', pattern)

# match() only tries the pattern at the very beginning of the text, while
# search() scans forward until it finds a hit anywhere in the text.
m = re.match(pattern, text)
s = re.search(pattern, text)
print('Match', m)
print('Search', s)


Text   : This is some text --with punctuation.
pattern: is
Match None
Search <_sre.SRE_Match object; span=(2, 4), match='is'>

8 Dissecting Matches with groups


In [21]:
# Parentheses turn part of a pattern into a group; repetition modifiers
# placed after the closing parenthesis apply to the whole group.
test_patterns(
    text='abbaaabbbbaaaaa',
    patterns=[
        ('a(ab)', 'a followed by literal ab'),
        ('a(a*b*)', 'a followed by 0-n a and 0-b b'),
        ('a(ab)*', 'a followed by 0-n ab'),
        ('a(ab)+', 'a followed by 1-n ab'),
    ],
)


'a(ab)' (a followed by literal ab)

  'abbaaabbbbaaaaa'
  ....'aab'

'a(a*b*)' (a followed by 0-n a and 0-b b)

  'abbaaabbbbaaaaa'
  'abb'
  ...'aaabbbb'
  ..........'aaaaa'

'a(ab)*' (a followed by 0-n ab)

  'abbaaabbbbaaaaa'
  'a'
  ...'a'
  ....'aab'
  ..........'a'
  ...........'a'
  ............'a'
  .............'a'
  ..............'a'

'a(ab)+' (a followed by 1-n ab)

  'abbaaabbbbaaaaa'
  ....'aab'


In [25]:
import re
text = 'This is some text -- with punctuation'
# Show the text followed by one blank line.
print(text, end='\n\n')

# Each entry pairs a pattern containing capture groups with a description.
patterns = [
    (r'^(\w+)', 'word at start of string'),
    (r'(\w+)\S*$', 'word at end, with optional punctuation'),
    (r'(\bt\w+)\W+(\w+)', 'word starting with t, another word'),
    (r'(\w+t)\b', 'word ending with t'),
]
for pattern, desc in patterns:
    regex = re.compile(pattern)
    match = regex.search(text)
    heading = "'{}' ({})\n".format(pattern, desc)
    print(heading)
    # groups() returns the captured substrings as a tuple, in pattern order.
    print('  ', match.groups(), end='\n\n')


This is some text -- with punctuation

'^(\w+)' (word at start of string)

   ('This',)

'(\w+)\S*$' (word at end, with optional punctuation)

   ('punctuation',)

'(\bt\w+)\W+(\w+)' (word starting with t, another word)

   ('text', 'with')

'(\w+t)\b' (word ending with t)

   ('text',)


In [30]:
import re
text = 'This is some text -- with punctuation'
# Show the text followed by one blank line.
print(text, end='\n\n')

# (?P<name>...) gives a group a name in addition to its position.
patterns = [
    r'(?P<first_word>\w+)',
    r'(?P<last_word>\w+)\S*$',
    r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
    r'(?P<ends_with_t>\w+t)\b',
]

for pattern in patterns:
    regex = re.compile(pattern)
    match = regex.search(text)
    print("'{}'".format(pattern))
    # Positional view first (tuple), then the name -> value mapping.
    for captured in (match.groups(), match.groupdict()):
        print('  ', captured)
    print()


This is some text -- with punctuation

'(?P<first_word>\w+)'
   ('This',)
   {'first_word': 'This'}

'(?P<last_word>\w+)\S*$'
   ('punctuation',)
   {'last_word': 'punctuation'}

'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)'
   ('text', 'with')
   {'t_word': 'text', 'other_word': 'with'}

'(?P<ends_with_t>\w+t)\b'
   ('text',)
   {'ends_with_t': 'text'}

8 Search Options

8.1 Case-insensitive match


In [31]:
import re

text = 'This is some text -- with punctuation.'
pattern = r'\bT\w+'
# Same expression compiled twice: once as-is, once ignoring letter case.
without_case = re.compile(pattern, flags=re.IGNORECASE)
with_case = re.compile(pattern)

print('Text:\n  {!r}'.format(text))
print('Pattern:\n  {}'.format(pattern))
# Report the matches of each compiled variant under its own heading.
for heading, regex in (
    ('Case-sensitive:', with_case),
    ('Case-insensitive:', without_case),
):
    print(heading)
    for match in regex.findall(text):
        print('  {!r}'.format(match))


Text:
  'This is some text -- with punctuation.'
Pattern:
  \bT\w+
Case-sensitive:
  'This'
Case-insensitive:
  'This'
  'text'

8.2 Input with multiple lines


In [32]:
import re

text = 'This is some text -- with punctuation.\nA second line.'
pattern = r'(^\w+)|(\w+\S*$)'
# With MULTILINE, ^ and $ also anchor at the start and end of each line
# rather than only at the start and end of the whole string.
multiline = re.compile(pattern, flags=re.MULTILINE)
single_line = re.compile(pattern)

print('Text:\n  {!r}'.format(text))
print('Pattern:\n  {}'.format(pattern))
# Report the matches of each compiled variant under its own heading.
for heading, regex in (
    ('Single Line :', single_line),
    ('Multline    :', multiline),
):
    print(heading)
    for match in regex.findall(text):
        print('  {!r}'.format(match))


Text:
  'This is some text -- with punctuation.\nA second line.'
Pattern:
  (^\w+)|(\w+\S*$)
Single Line :
  ('This', '')
  ('', 'line.')
Multline    :
  ('This', '')
  ('', 'punctuation.')
  ('A', '')
  ('', 'line.')

9 Unicode


In [35]:
import re

text = u'Français złoty Österreich 中国矿业大学'
pattern = r'\w+'
# Default compilation treats \w as Unicode-aware for str patterns; the
# ASCII flag restricts it to ASCII word characters only.
unicode_pattern = re.compile(pattern)
ascii_pattern = re.compile(pattern, flags=re.ASCII)

print('Text    :', text)
print('Pattern :', pattern)
# findall() already returns a list, so it can be printed directly.
print('ASCII   :', ascii_pattern.findall(text))
print('Unicode :', unicode_pattern.findall(text))


Text    : Français złoty Österreich 中国矿业大学
Pattern : \w+
ASCII   : ['Fran', 'ais', 'z', 'oty', 'sterreich']
Unicode : ['Français', 'złoty', 'Österreich', '中国矿业大学']

10 Verbose Expression Syntax


In [36]:
import re

# E-mail matcher written in verbose syntax: whitespace inside the pattern is
# ignored and `#` starts a comment, so each piece can be annotated in place.
# The pattern is a raw string so that \w, \d and \. reach the re module
# untouched instead of being treated as (invalid) string escape sequences.
address = re.compile(
    r'''
    [\w\d.+-]+       # username
    @
    ([\w\d.]+\.)+    # domain name prefix
    (com|org|edu)    # TODO: support more top-level domains
    ''',
    re.VERBOSE)

candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',
]

# Report, left-aligned in a 30-column field, whether each candidate matches.
for candidate in candidates:
    match = address.search(candidate)
    print('{:<30}  {}'.format(
        candidate, 'Matches' if match else 'No match'),
    )


first.last@example.com          Matches
first.last+category@gmail.com   Matches
valid-address@mail.example.com  Matches
not-valid@example.foo           No match

11 Modifying Strings with Patterns


In [38]:
import re

text = 'Make this **bold**.  This **too**.'

# Capture (non-greedily) whatever sits between a pair of double asterisks.
bold = re.compile(r'\*{2}(.*?)\*{2}')

print('Text:', text)
# \1 in the replacement refers back to the first captured group.
print('Bold:', bold.sub(repl=r'<b>\1</b>', string=text))


Text: Make this **bold**.  This **too**.
Bold: Make this <b>bold</b>.  This <b>too</b>.

In [39]:
import re

text = 'Make this **bold**.  This **too**.'

# Same expression as the numbered version, but the captured text is named.
bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}')

print('Text:', text)
# \g<name> in the replacement refers back to the named group.
print('Bold:', bold.sub(repl=r'<b>\g<bold_text></b>', string=text))


Text: Make this **bold**.  This **too**.
Bold: Make this <b>bold</b>.  This <b>too</b>.

12 Splitting with Patterns


In [40]:
import re

text = '''Paragraph one
on two lines.

Paragraph two.


Paragraph three.'''

# findall() with two groups returns (paragraph, separator) tuples; DOTALL
# lets `.` cross the single newline inside the first paragraph.
print('With findall:')
paragraph_matches = re.compile(r'(.+?)(\n{2,}|$)', re.DOTALL).findall(text)
for num, para in enumerate(paragraph_matches):
    print(num, repr(para), end='\n\n')

print()
# split() cuts on runs of two or more newlines and returns just the pieces.
print('With split:')
for num, para in enumerate(re.split(r'\n{2,}', text)):
    print(num, repr(para), end='\n\n')


With findall:
0 ('Paragraph one\non two lines.', '\n\n')

1 ('Paragraph two.', '\n\n\n')

2 ('Paragraph three.', '')


With split:
0 'Paragraph one\non two lines.'

1 'Paragraph two.'

2 'Paragraph three.'