1 Finding Patterns in Text



In [4]:

    
import re
# Locate the first occurrence of a literal pattern and report where it sits.
pattern = 'this'
text = 'Does this text match the pattern'
match = re.search(pattern, text)
s, e = match.span()  # start (inclusive) and end (exclusive) offsets
print('Found "{}" \n in "{}" from {} to {} ("{}")'.format(
    match.re.pattern, match.string, s, e, match.group(0)))









    



Found "this" 
 in "Does this text match the pattern" from 5 to 9 ("this")

2 Compiling Expressions



In [6]:

    
import re
# Compile each expression once up front; the compiled objects are then
# reused for every search below.
regexes = list(map(re.compile, ['this', 'that']))
text = 'Does this text match the pattern'
print('Text: {!r}\n'.format(text))
for regex in regexes:
    outcome = 'match!' if regex.search(text) else 'no match'
    print('Seeking "{}"->'.format(regex.pattern), end='')
    print(outcome)









    



Text: 'Does this text match the pattern'

Seeking "this"->match!
Seeking "that"->no match

3 Multiple Matches



In [7]:

    
import re
# findall() hands back the matched substrings themselves (no positions).
text = 'abbaaabbbbaaaaa'
pattern = 'ab'
for match in re.findall(pattern, text):
    print('Found {0!r}'.format(match))









    



Found 'ab'
Found 'ab'



In [8]:

    
# finditer() yields match objects, so the location of every hit is
# available.  Uses the `pattern` and `text` defined in the previous cell.
for match in re.finditer(pattern, text):
    s, e = match.span()
    print('Found at {:d}:{:d}'.format(s, e))









    



Found at 0:2
Found at 5:7

4 Repetition



In [10]:

    
import re
def test_patterns(text, patterns):
    """Given source text and a list of patterns, look for
    matches for each pattern within the text and print
    them to stdout.
    """
    # Look for each pattern in the text and print the results
    for pattern, desc in patterns:
        print("'{}' ({})\n".format(pattern, desc))
        print("  '{}'".format(text))
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            substr = text[s:e]
            n_backslashes = text[:s].count('\\')
            prefix = '.' * (s + n_backslashes)
            print("  {}'{}'".format(prefix, substr))
        print()
    return

# Greedy repetition: every quantifier applies to the single character 'b'.
test_patterns(
    'abbaabbba',
    [
        (r'ab*', 'a followdd by zero or more b'),
        (r'ab+', 'a followed by one or more b'),
        (r'ab?', 'a followed by zero or one b'),
        (r'ab{3}', 'a followed by three b'),
        (r'ab{2,3}', 'a followed by two or three b'),
    ],
)









    



'ab*' (a followdd by zero or more b)

  'abbaabbba'
  'abb'
  ...'a'
  ....'abbb'
  ........'a'

'ab+' (a followed by one or more b)

  'abbaabbba'
  'abb'
  ....'abbb'

'ab?' (a followed by zero or one b)

  'abbaabbba'
  'ab'
  ...'a'
  ....'ab'
  ........'a'

'ab{3}' (a followed by three b)

  'abbaabbba'
  ....'abbb'

'ab{2,3}' (a followed by two or three b)

  'abbaabbba'
  'abb'
  ....'abbb'

When processing a repetition instruction, re will usually consume as much of the input as possible while matching the pattern. This so-called greedy behavior may result in fewer individual matches, or the matches may include more of the input text than intended. Greediness can be turned off by following the repetition instruction with ?.



In [11]:

    
# Same quantifiers as before, each followed by '?' to make it non-greedy.
test_patterns(
    'abbaabbba',
    [
        (r'ab*?', 'a followdd by zero or more b'),
        (r'ab+?', 'a followed by one or more b'),
        (r'ab??', 'a followed by zero or one b'),
        (r'ab{3}?', 'a followed by three b'),
        (r'ab{2,3}?', 'a followed by two or three b'),
    ],
)









    



'ab*?' (a followdd by zero or more b)

  'abbaabbba'
  'a'
  ...'a'
  ....'a'
  ........'a'

'ab+?' (a followed by one or more b)

  'abbaabbba'
  'ab'
  ....'ab'

'ab??' (a followed by zero or one b)

  'abbaabbba'
  'a'
  ...'a'
  ....'a'
  ........'a'

'ab{3}?' (a followed by three b)

  'abbaabbba'
  ....'abbb'

'ab{2,3}?' (a followed by two or three b)

  'abbaabbba'
  'abb'
  ....'abb'

5 Character Sets



In [12]:

    
# A character set ([...]) matches any single character listed inside it.
test_patterns(
    'abbaabbba',
    [
        (r'[ab]', 'either a or b'),
        (r'a[ab]+', 'a followed by one or more a or b'),
        (r'a[ab]+?', 'a followed by one or more a or b, not greedy'),
    ],
)









    



'[ab]' (either a or b)

  'abbaabbba'
  'a'
  .'b'
  ..'b'
  ...'a'
  ....'a'
  .....'b'
  ......'b'
  .......'b'
  ........'a'

'a[ab]+' (a followed by one or more a or b)

  'abbaabbba'
  'abbaabbba'

'a[ab]+?' (a followed by one or more a or b, not greedy)

  'abbaabbba'
  'ab'
  ...'aa'



In [13]:

    
# A leading '^' inside the brackets inverts the set: match anything that is
# NOT one of the listed characters.
test_patterns(
    'This is some text -- with punctuation',
    [(r'[^-. ]+', 'sequence withouct -, ., or space')],
)









    



'[^-. ]+' (sequence withouct -, ., or space)

  'This is some text -- with punctuation'
  'This'
  .....'is'
  ........'some'
  .............'text'
  .....................'with'
  ..........................'punctuation'



In [14]:

    
# Ranges such as a-z are shorthand for listing every character between the
# two endpoints.
test_patterns(
    'This is some text -- with punctuation',
    [
        (r'[a-z]+', 'sequence of lowercase letters'),
        (r'[A-Z]+', 'sequecne of uppercase letters'),
        (r'[a-zA-Z]+', 'sequecne of letters of either case'),
        (r'[A-Z][a-z]+', 'one uppercase followed by lowercase'),
    ],
)









    



'[a-z]+' (sequence of lowercase letters)

  'This is some text -- with punctuation'
  .'his'
  .....'is'
  ........'some'
  .............'text'
  .....................'with'
  ..........................'punctuation'

'[A-Z]+' (sequecne of uppercase letters)

  'This is some text -- with punctuation'
  'T'

'[a-zA-Z]+' (sequecne of letters of either case)

  'This is some text -- with punctuation'
  'This'
  .....'is'
  ........'some'
  .............'text'
  .....................'with'
  ..........................'punctuation'

'[A-Z][a-z]+' (one uppercase followed by lowercase)

  'This is some text -- with punctuation'
  'This'



In [15]:

    
# '.' stands for any single character; combined with '*' it can swallow
# long runs unless made non-greedy with '?'.
test_patterns(
    'abbaabbba',
    [
        (r'a.', 'a followed by any one character'),
        (r'b.', 'b followed by any one character'),
        (r'a.*b', 'a followed by anything, end in b'),
        (r'a.*?b', 'a followed by anythin, end in b'),
    ],
)









    



'a.' (a followed by any one character)

  'abbaabbba'
  'ab'
  ...'aa'

'b.' (b followed by any one character)

  'abbaabbba'
  .'bb'
  .....'bb'
  .......'ba'

'a.*b' (a followed by anything, end in b)

  'abbaabbba'
  'abbaabbb'

'a.*?b' (a followed by anythin, end in b)

  'abbaabbba'
  'ab'
  ...'aab'

5 Escape Codes

code	Meaning
\d	a digit
\D	a non-digit
\s	whitespace (tab, space, newline, etc.)
\S	non-whitespace
\w	alphanumeric
\W	non-alphanumeric



In [16]:

    
# Run each of the predefined escape codes (\d, \D, \s, \S, \w, \W) against
# one sample string.  Raw strings keep the backslashes intact for `re`.
test_patterns(
    'A prime #1 example!',
    [
        (r'\d+', 'sequece of digits'),
        (r'\D+', 'sequence of non-digits'),
        (r'\s+', 'sequence of whitespace'),
        (r'\S+', 'sequence of non-whitespace'),
        (r'\w+', 'alphanumeric characters'),
        (r'\W+', 'non-alphanumeric')
    ]
)









    



'\d+' (sequece of digits)

  'A prime #1 example!'
  .........'1'

'\D+' (sequence of non-digits)

  'A prime #1 example!'
  'A prime #'
  ..........' example!'

'\s+' (sequence of whitespace)

  'A prime #1 example!'
  .' '
  .......' '
  ..........' '

'\S+' (sequence of non-whitespace)

  'A prime #1 example!'
  'A'
  ..'prime'
  ........'#1'
  ...........'example!'

'\w+' (alphanumeric characters)

  'A prime #1 example!'
  'A'
  ..'prime'
  .........'1'
  ...........'example'

'\W+' (non-alphanumeric)

  'A prime #1 example!'
  .' '
  .......' #'
  ..........' '
  ..................'!'

6 Anchoring

code	Meaning
^	start of string, or line
$	end of string, or line
\A	start of string
\Z	end of string
\b	empty string at beginning or end of a word
\B	empty string not at beginning or end of a word



In [19]:

    
# Anchoring codes constrain WHERE in the input a match may occur, in
# addition to what it must contain.
test_patterns(
    'This is some text -- with punctuation.',
    [
        (r'^\w+', 'word at start of string'),
        (r'\A\w+', 'word at start of string'),
        (r'\w+\S*$', 'word near end of string'),
        (r'\w+\S*\Z', 'word near end of string'),
        (r'\w*t\w*', 'word containing t'),
        (r'\bt\w+', 't at start of word'),
        (r'\w+t\b', 't at end of word'),
        (r'\Bt\B', 't, not start or end of word'),
    ],
)









    



'^\w+' (word at start of string)

  'This is some text -- with punctuation.'
  'This'

'\A\w+' (word at start of string)

  'This is some text -- with punctuation.'
  'This'

'\w+\S*$' (word near end of string)

  'This is some text -- with punctuation.'
  ..........................'punctuation.'

'\w+\S*\Z' (word near end of string)

  'This is some text -- with punctuation.'
  ..........................'punctuation.'

'\w*t\w*' (word containing t)

  'This is some text -- with punctuation.'
  .............'text'
  .....................'with'
  ..........................'punctuation'

'\bt\w+' (t at start of word)

  'This is some text -- with punctuation.'
  .............'text'

'\w+t\b' (t at end of word)

  'This is some text -- with punctuation.'
  .............'text'

'\Bt\B' (t, not start or end of word)

  'This is some text -- with punctuation.'
  .......................'t'
  ..............................'t'
  .................................'t'

7 Constraining the Search



In [20]:

    
import re
# match() only succeeds when the pattern is found at the very start of the
# input, whereas search() scans forward for the first occurrence.
text = 'This is some text --with punctuation.'
pattern = 'is'
print('Text   :', text)
print('pattern:', pattern)

m, s = re.match(pattern, text), re.search(pattern, text)
print('Match', m)
print('Search', s)









    



Text   : This is some text --with punctuation.
pattern: is
Match None
Search <_sre.SRE_Match object; span=(2, 4), match='is'>

8 Dissecting Matches with groups



In [21]:

    
# Parentheses turn part of a pattern into a group, which can then take a
# repetition modifier as a unit.
test_patterns(
    'abbaaabbbbaaaaa',
    [
        (r'a(ab)', 'a followed by literal ab'),
        (r'a(a*b*)', 'a followed by 0-n a and 0-b b'),
        (r'a(ab)*', 'a followed by 0-n ab'),
        (r'a(ab)+', 'a followed by 1-n ab'),
    ],
)









    



'a(ab)' (a followed by literal ab)

  'abbaaabbbbaaaaa'
  ....'aab'

'a(a*b*)' (a followed by 0-n a and 0-b b)

  'abbaaabbbbaaaaa'
  'abb'
  ...'aaabbbb'
  ..........'aaaaa'

'a(ab)*' (a followed by 0-n ab)

  'abbaaabbbbaaaaa'
  'a'
  ...'a'
  ....'aab'
  ..........'a'
  ...........'a'
  ............'a'
  .............'a'
  ..............'a'

'a(ab)+' (a followed by 1-n ab)

  'abbaaabbbbaaaaa'
  ....'aab'



In [25]:

    
import re
# Show the substrings captured by each group via Match.groups().
text = 'This is some text -- with punctuation'
print(text)
print()

patterns = [
    (r'^(\w+)', 'word at start of string'),
    (r'(\w+)\S*$', 'word at end, with optional punctuation'),
    (r'(\bt\w+)\W+(\w+)', 'word starting with t, another word'),
    (r'(\w+t)\b', 'word ending with t'),
]
for pattern, desc in patterns:
    match = re.compile(pattern).search(text)
    header = "'{}' ({})\n".format(pattern, desc)
    print(header)
    print('  ', match.groups())
    print()









    



This is some text -- with punctuation

'^(\w+)' (word at start of string)

   ('This',)

'(\w+)\S*$' (word at end, with optional punctuation)

   ('punctuation',)

'(\bt\w+)\W+(\w+)' (word starting with t, another word)

   ('text', 'with')

'(\w+t)\b' (word ending with t)

   ('text',)



In [30]:

    
import re
# Named groups, (?P<name>...), are reachable both by position through
# groups() and by name through groupdict().
text = 'This is some text -- with punctuation'
print(text)
print()

patterns = [
    r'(?P<first_word>\w+)',
    r'(?P<last_word>\w+)\S*$',
    r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
    r'(?P<ends_with_t>\w+t)\b',
]

for pattern in patterns:
    match = re.compile(pattern).search(text)
    print("'{}'".format(pattern))
    for captured in (match.groups(), match.groupdict()):
        print('  ', captured)
    print()









    



This is some text -- with punctuation

'(?P<first_word>\w+)'
   ('This',)
   {'first_word': 'This'}

'(?P<last_word>\w+)\S*$'
   ('punctuation',)
   {'last_word': 'punctuation'}

'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)'
   ('text', 'with')
   {'t_word': 'text', 'other_word': 'with'}

'(?P<ends_with_t>\w+t)\b'
   ('text',)
   {'ends_with_t': 'text'}

8 Search Options

8.1 Case-insensitive match



In [31]:

    
import re

# Compile the same pattern with and without re.IGNORECASE and compare what
# each version finds.
text = 'This is some text -- with punctuation.'
pattern = r'\bT\w+'
with_case = re.compile(pattern)
without_case = re.compile(pattern, re.IGNORECASE)

print('Text:\n  {!r}'.format(text))
print('Pattern:\n  {}'.format(pattern))
for label, compiled in (('Case-sensitive:', with_case),
                        ('Case-insensitive:', without_case)):
    print(label)
    for match in compiled.findall(text):
        print('  {!r}'.format(match))









    



Text:
  'This is some text -- with punctuation.'
Pattern:
  \bT\w+
Case-sensitive:
  'This'
Case-insensitive:
  'This'
  'text'

8.2 Input with Multiple Lines



In [32]:

    
import re

# With re.MULTILINE, ^ and $ also anchor at the start and end of each line
# rather than only at the boundaries of the whole string.
text = 'This is some text -- with punctuation.\nA second line.'
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
multiline = re.compile(pattern, re.MULTILINE)

print('Text:\n  {!r}'.format(text))
print('Pattern:\n  {}'.format(pattern))
for label, compiled in (('Single Line :', single_line),
                        ('Multline    :', multiline)):
    print(label)
    for match in compiled.findall(text):
        print('  {!r}'.format(match))









    



Text:
  'This is some text -- with punctuation.\nA second line.'
Pattern:
  (^\w+)|(\w+\S*$)
Single Line :
  ('This', '')
  ('', 'line.')
Multline    :
  ('This', '')
  ('', 'punctuation.')
  ('A', '')
  ('', 'line.')

9 Unicode



In [35]:

    
import re

# By default \w matches Unicode word characters for str patterns; re.ASCII
# narrows it to ASCII letters, digits and underscore.
text = 'Français złoty Österreich 中国矿业大学'
pattern = r'\w+'
ascii_pattern = re.compile(pattern, re.ASCII)
unicode_pattern = re.compile(pattern)

print('Text    :', text)
print('Pattern :', pattern)
# findall() already returns a list, so it can be printed directly.
print('ASCII   :', ascii_pattern.findall(text))
print('Unicode :', unicode_pattern.findall(text))









    



Text    : Français złoty Österreich 中国矿业大学
Pattern : \w+
ASCII   : ['Fran', 'ais', 'z', 'oty', 'sterreich']
Unicode : ['Français', 'złoty', 'Österreich', '中国矿业大学']

10 Verbose Expression Syntax



In [36]:

    
import re

# re.VERBOSE allows the expression to span several lines with embedded
# comments; whitespace inside the pattern is ignored.  The pattern is a raw
# string so the regex escapes (\w, \d, \.) reach the re module untouched
# instead of being parsed as (invalid) string-literal escapes.
address = re.compile(
    r'''
    [\w\d.+-]+       # username
    @
    ([\w\d.]+\.)+    # domain name prefix
    (com|org|edu)    # TODO: support more top-level domains
    ''',
    re.VERBOSE)

candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',
]

# search() is used, so the expression may match anywhere in the candidate.
for candidate in candidates:
    match = address.search(candidate)
    print('{:<30}  {}'.format(
        candidate, 'Matches' if match else 'No match'),
    )









    



first.last@example.com          Matches
first.last+category@gmail.com   Matches
valid-address@mail.example.com  Matches
not-valid@example.foo           No match

11 Modifying Strings with Patterns



In [38]:

    
import re

# Replace **text** markers with <b> tags; \1 in the replacement refers to
# the text captured by the first group.
bold = re.compile(r'\*{2}(.*?)\*{2}')

text = 'Make this **bold**.  This **too**.'
converted = bold.sub(r'<b>\1</b>', text)

print('Text:', text)
print('Bold:', converted)









    



Text: Make this **bold**.  This **too**.
Bold: Make this <b>bold</b>.  This <b>too</b>.



In [39]:

    
import re

# Same substitution as with a numbered group, but the captured text is
# referenced by name with \g<name> in the replacement.
bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}')

text = 'Make this **bold**.  This **too**.'
converted = bold.sub(r'<b>\g<bold_text></b>', text)

print('Text:', text)
print('Bold:', converted)









    



Text: Make this **bold**.  This **too**.
Bold: Make this <b>bold</b>.  This <b>too</b>.

12 Splitting with Patterns



In [40]:

    
import re

# Paragraphs are separated by two or more newlines.
text = '''Paragraph one
on two lines.

Paragraph two.


Paragraph three.'''

# findall() needs a second group to consume the separator (or the end of
# the input), so every result is a (paragraph, separator) tuple.
print('With findall:')
found = re.findall(r'(.+?)(\n{2,}|$)', text, flags=re.DOTALL)
for num, para in enumerate(found):
    print('{} {!r}'.format(num, para))
    print()

# split() expresses the same idea directly and returns just the paragraphs.
print()
print('With split:')
pieces = re.split(r'\n{2,}', text)
for num, para in enumerate(pieces):
    print('{} {!r}'.format(num, para))
    print()









    



With findall:
0 ('Paragraph one\non two lines.', '\n\n')

1 ('Paragraph two.', '\n\n\n')

2 ('Paragraph three.', '')


With split:
0 'Paragraph one\non two lines.'

1 'Paragraph two.'

2 'Paragraph three.'