In [4]:
import re
# Search for a literal word and report where in the text it was found.
pattern = 'this'
text = 'Does this text match the pattern'

match = re.search(pattern, text)
s, e = match.span()  # (start, end) offsets of the match within ``text``
print('Found "{}" \n in "{}" from {} to {} ("{}")'.format(
    match.re.pattern, match.string, s, e, text[s:e]))
In [6]:
import re
# Compile each expression once up front so it can be reused for every search.
regexes = list(map(re.compile, ['this', 'that']))
text = 'Does this text match the pattern'

print('Text: {!r}\n'.format(text))
for regex in regexes:
    # Keep the verdict on the same output line as the "Seeking" label.
    print('Seeking "{}"->'.format(regex.pattern), end='')
    print('match!' if regex.search(text) else 'no match')
In [7]:
import re
# findall() returns every non-overlapping match as a plain string.
text = 'abbaaabbbbaaaaa'
pattern = 'ab'

for match in re.compile(pattern).findall(text):
    print('Found {!r}'.format(match))
In [8]:
# finditer() yields Match objects, so the position of each hit is available.
# Uses the `pattern` and `text` defined by the preceding cell.
for match in re.finditer(pattern, text):
    s, e = match.span()
    print('Found at {:d}:{:d}'.format(s, e))
In [10]:
import re
def test_patterns(text, patterns):
"""Given source text and a list of patterns, look for
matches for each pattern within the text and print
them to stdout.
"""
# Look for each pattern in the text and print the results
for pattern, desc in patterns:
print("'{}' ({})\n".format(pattern, desc))
print(" '{}'".format(text))
for match in re.finditer(pattern, text):
s = match.start()
e = match.end()
substr = text[s:e]
n_backslashes = text[:s].count('\\')
prefix = '.' * (s + n_backslashes)
print(" {}'{}'".format(prefix, substr))
print()
return
# Repetition: how each quantifier changes what may follow the literal 'a'.
test_patterns(
    'abbaabbba',
    [
        ('ab*', 'a followed by zero or more b'),
        ('ab+', 'a followed by one or more b'),
        ('ab?', 'a followed by zero or one b'),
        ('ab{3}', 'a followed by three b'),
        ('ab{2,3}', 'a followed by two or three b'),
    ],
)
When processing a repetition instruction, re will usually consume as much of the input as possible while matching the pattern. This so-called greedy behavior may result in fewer individual matches, or the matches may include more of the input text than intended. Greediness can be turned off by following the repetition instruction with ?.
In [11]:
# Same quantifiers as the greedy example, each followed by '?' so it
# consumes as little of the input as possible.
test_patterns(
    'abbaabbba',
    [
        ('ab*?', 'a followed by zero or more b'),
        ('ab+?', 'a followed by one or more b'),
        ('ab??', 'a followed by zero or one b'),
        ('ab{3}?', 'a followed by three b'),
        ('ab{2,3}?', 'a followed by two or three b'),
    ],
)
In [12]:
# Character sets: any single character listed inside [...] may match.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('[ab]', 'either a or b'),
        ('a[ab]+', 'a followed by one or more a or b'),
        ('a[ab]+?', 'a followed by one or more a or b, not greedy'),
    ],
)
In [13]:
# Negated character set: a leading ^ inside [...] excludes the listed characters.
test_patterns(
    'This is some text -- with punctuation',
    [
        ('[^-. ]+', 'sequence without -, ., or space'),
    ],
)
In [14]:
# Character ranges: a-z / A-Z stand for every letter between the endpoints.
test_patterns(
    'This is some text -- with punctuation',
    [
        ('[a-z]+', 'sequence of lowercase letters'),
        ('[A-Z]+', 'sequence of uppercase letters'),
        ('[a-zA-Z]+', 'sequence of letters of either case'),
        ('[A-Z][a-z]+', 'one uppercase followed by lowercase'),
    ],
)
In [15]:
# The dot metacharacter matches any single character; combined with a
# quantifier it can span long stretches of the input.
test_patterns(
    'abbaabbba',
    [
        ('a.', 'a followed by any one character'),
        ('b.', 'b followed by any one character'),
        ('a.*b', 'a followed by anything, end in b'),
        ('a.*?b', 'a followed by anything, end in b'),
    ],
)
In [16]:
# Escape codes for predefined character classes; raw strings keep the
# backslashes intact for the regex engine.
test_patterns(
    'A prime #1 example!',
    [
        (r'\d+', 'sequence of digits'),
        (r'\D+', 'sequence of non-digits'),
        (r'\s+', 'sequence of whitespace'),
        (r'\S+', 'sequence of non-whitespace'),
        (r'\w+', 'alphanumeric characters'),
        (r'\W+', 'non-alphanumeric'),
    ],
)
In [19]:
# Anchoring: restrict where in the input a pattern is allowed to match.
test_patterns(
    'This is some text -- with punctuation.',
    [
        (r'^\w+', 'word at start of string'),
        (r'\A\w+', 'word at start of string'),
        (r'\w+\S*$', 'word near end of string'),
        (r'\w+\S*\Z', 'word near end of string'),
        (r'\w*t\w*', 'word containing t'),
        (r'\bt\w+', 't at start of word'),
        (r'\w+t\b', 't at end of word'),
        (r'\Bt\B', 't, not start or end of word'),
    ],
)

# Constraining the Search
In [20]:
import re
# match() only succeeds at the very start of the text; search() scans ahead.
text = 'This is some text --with punctuation.'
pattern = 'is'

print('Text :', text)
print('pattern:', pattern)

m = re.compile(pattern).match(text)
print('Match', m)

s = re.compile(pattern).search(text)
print('Search', s)
In [21]:
# Grouping: parentheses turn a sub-pattern into a unit that quantifiers
# apply to as a whole.
test_patterns(
    'abbaaabbbbaaaaa',
    [
        ('a(ab)', 'a followed by literal ab'),
        ('a(a*b*)', 'a followed by 0-n a and 0-n b'),
        ('a(ab)*', 'a followed by 0-n ab'),
        ('a(ab)+', 'a followed by 1-n ab'),
    ],
)
In [25]:
import re
# Match.groups() returns the captured sub-strings in pattern order.
text = 'This is some text -- with punctuation'
print(text)
print()

patterns = [
    (r'^(\w+)', 'word at start of string'),
    (r'(\w+)\S*$', 'word at end, with optional punctuation'),
    (r'(\bt\w+)\W+(\w+)', 'word starting with t, another word'),
    (r'(\w+t)\b', 'word ending with t'),
]

for pattern, desc in patterns:
    match = re.search(pattern, text)
    print("'{}' ({})\n".format(pattern, desc))
    print(' ', match.groups())
    print()
In [30]:
import re
# Named groups: (?P<name>...) makes captures available by name through
# Match.groupdict() as well as by position through Match.groups().
text = 'This is some text -- with punctuation'
print(text)
print()

patterns = [
    r'(?P<first_word>\w+)',
    r'(?P<last_word>\w+)\S*$',
    r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
    r'(?P<ends_with_t>\w+t)\b',
]

for pattern in patterns:
    regex = re.compile(pattern)
    match = regex.search(text)
    print("'{}'".format(pattern))
    for captured in (match.groups(), match.groupdict()):
        print(' ', captured)
    print()
In [31]:
import re
# Compare the same pattern compiled with and without re.IGNORECASE.
text = 'This is some text -- with punctuation.'
pattern = r'\bT\w+'
with_case = re.compile(pattern)
without_case = re.compile(pattern, flags=re.IGNORECASE)

print('Text:\n {!r}'.format(text))
print('Pattern:\n {}'.format(pattern))
for heading, compiled in (('Case-sensitive:', with_case),
                          ('Case-insensitive:', without_case)):
    print(heading)
    for match in compiled.findall(text):
        print(' {!r}'.format(match))
In [32]:
import re
# Compare the same pattern with and without re.MULTILINE: with the flag,
# ^ and $ also anchor at the start and end of every line, not just the
# start and end of the whole string.
text = 'This is some text -- with punctuation.\nA second line.'
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
multiline = re.compile(pattern, re.MULTILINE)

print('Text:\n {!r}'.format(text))
print('Pattern:\n {}'.format(pattern))

print('Single Line :')
for match in single_line.findall(text):
    # Two groups in the pattern, so each result is a (start, end) tuple.
    print(' {!r}'.format(match))

print('Multiline :')
for match in multiline.findall(text):
    print(' {!r}'.format(match))
In [35]:
import re
# \w is Unicode-aware for str patterns by default; re.ASCII restricts it
# to ASCII letters, digits and underscore.
text = 'Français złoty Österreich 中国矿业大学'
pattern = r'\w+'
ascii_pattern = re.compile(pattern, flags=re.ASCII)
unicode_pattern = re.compile(pattern)

print('Text :', text)
print('Pattern :', pattern)
print('ASCII :', ascii_pattern.findall(text))
print('Unicode :', unicode_pattern.findall(text))
In [36]:
import re
# Verbose-mode pattern for a simple e-mail address. The pattern must be a
# raw string: in a normal string literal \w, \d and \. are invalid string
# escapes, which Python warns about. With re.VERBOSE, whitespace and
# '#' comments inside the pattern are ignored.
address = re.compile(
    r'''
    [\w\d.+-]+       # username
    @
    ([\w\d.]+\.)+    # domain name prefix
    (com|org|edu)    # TODO: support more top-level domains
    ''',
    re.VERBOSE)

candidates = [
    'first.last@example.com',
    'first.last+category@gmail.com',
    'valid-address@mail.example.com',
    'not-valid@example.foo',
]

for candidate in candidates:
    match = address.search(candidate)
    print('{:<30} {}'.format(
        candidate, 'Matches' if match else 'No match'))
In [38]:
import re
# Substitution with a numbered back-reference: \1 is the text captured
# between the pairs of asterisks.
bold = re.compile(r'\*{2}(.*?)\*{2}')
text = 'Make this **bold**. This **too**.'

print('Text:', text)
replaced = bold.sub(r'<b>\1</b>', text)
print('Bold:', replaced)
In [39]:
import re
# Substitution with a named back-reference: \g<name> inserts the text
# captured by the group of that name.
bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}')
text = 'Make this **bold**. This **too**.'

print('Text:', text)
replaced = bold.sub(r'<b>\g<bold_text></b>', text)
print('Bold:', replaced)
In [40]:
import re
# Sample text with three paragraphs. Paragraphs are separated by two or
# more newlines; the separators are written as explicit escapes so they
# cannot be lost when blank lines are stripped from the source.
text = (
    'Paragraph one\non two lines.\n\n'
    'Paragraph two.\n\n\n'
    'Paragraph three.'
)

print('With findall:')
# Two groups in the pattern, so each result is a (paragraph, separator)
# tuple; DOTALL lets '.' cross the single newline inside paragraph one.
for num, para in enumerate(re.findall(r'(.+?)(\n{2,}|$)',
                                      text,
                                      flags=re.DOTALL)):
    print(num, repr(para))
    print()

print()
print('With split:')
# split() discards the separators and returns just the paragraphs.
for num, para in enumerate(re.split(r'\n{2,}', text)):
    print(num, repr(para))
    print()