Regular Expressions in Python

Notebook by Max Schwartz, edits by Rachel Rakov


In [33]:
import re

Search


In [34]:
# Sample sentence that the search / match / findall / groups cells operate on.
text_to_search = "How do you do, fellow kids?"

In [36]:
regex_search = r'do'
regmine = r'how'  # lowercase 'h': matching is case-sensitive by default, so this finds nothing
searched = re.search(regex_search, text_to_search)
also_searched = re.search(regmine, text_to_search)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(type(searched))
print(type(also_searched))
print('')

## If you forget to print the type, printing the result itself still shows
## whether you got a match object or None.
print(searched)
print(also_searched)


<type '_sre.SRE_Match'>
<type 'NoneType'>

<_sre.SRE_Match object at 0x0338AA30>
None

In [37]:
# re.search returns None when nothing matches, so test the result before
# calling .group() on it.
if searched:
    print(searched.group(0))  # group(0) is the entire matched text
else:
    print('Nothing found')


do

Match


In [42]:
# 'fellow' occurs in the text, but not at the start. re.match only matches at
# the beginning of the string, so the result here is None.
regex_match_fail = r'fellow'
not_matched = re.match(regex_match_fail, text_to_search)
print(type(not_matched))


<type 'NoneType'>

In [39]:
# not_matched is None (falsy), so the else branch runs.
if not_matched:
    print(not_matched.group(0))
else:
    print('Nothing matched')


Nothing matched

In [40]:
# 'How' is at the very beginning of the text, so re.match succeeds.
regex_match = r'How'
matched = re.match(regex_match, text_to_search)
print(type(matched))


<type '_sre.SRE_Match'>

In [41]:
# A match object is truthy, so this prints the matched text.
if matched:
    print(matched.group(0))
else:
    print('Nothing matched')


How

Find All


In [43]:
regex_findall = r'\wo\w?'
## A breakdown of this RE:
## \w == one word character (alphanumeric or underscore)
## o  == a literal 'o'
## \w == one word character (alphanumeric or underscore)
## ?  == makes the preceding \w optional

# findall returns every non-overlapping match as a list of strings.
found = re.findall(regex_findall, text_to_search)
print(type(found))


<type 'list'>

In [44]:
print(found)
## Note that this returns "low", which is not a word in our corpus!
## It is a piece of "fellow": the pattern does not require word boundaries.


['How', 'do', 'you', 'do', 'low']

Sub


In [46]:
# Base input for the re.sub demonstration.
text_to_change = 'frog bog log cog nog fog'
# Same words as above, with three extra ones appended:
##   schlog
##   nschlog
##   grog
more_text_to_change = 'frog bog log cog nog fog schlog nschlog grog'

In [47]:
regex_sub = r'[^og ]{1,2}'
## [^og ] == any single character that is not 'o', not 'g', and not a space
## {1,2}  == one or two of those characters in a row
# Each matched run (at most two characters long) is replaced by a single 'd',
# so a longer run of such characters yields more than one 'd'.
subbed = re.sub(regex_sub, 'd', text_to_change)
subbed_again = re.sub(regex_sub, 'd', more_text_to_change)
print(type(subbed))


<type 'str'>

In [48]:
# Show both substitution results.
print(subbed)
print(subbed_again)


dog dog dog dog dog dog
dog dog dog dog dog dog ddog dddog gdog

Split


In [ ]:
# Words separated by runs of digits; input for the re.split demonstration.
text_to_split = "This1is2some34text567to89split"

In [ ]:
# \d+ matches a run of one or more digits; the text is split at every such run.
split_text = re.split(r'\d+', text_to_split)
print(split_text)

Groups


In [50]:
regex_groups = r'How(.*(fellow).*)\?'
groups = re.search(regex_groups, text_to_search)
# group(0) is the whole match. Capturing groups are numbered by the position
# of their opening parenthesis, so the outer group is 1 and the inner is 2.
print('Group 0: ' + groups.group(0))
print('Group 1: ' + groups.group(1))
print('Group 2: ' + groups.group(2))


Group 0: How do you do, fellow kids?
Group 1:  do you do, fellow kids
Group 2: fellow

In [54]:
newtext = "How do you do fellow kids How"
badtext = "fish cat fish cat"
# \1 must repeat exactly the text that group 1 captured ('How' or 'do').
regex_backreference = r'(How|do).*\1'
backreference_found = re.search(regex_backreference, text_to_search)
backreference_also_found = re.search(regex_backreference, newtext)
backreference_not_found = re.search(regex_backreference, badtext)

print(backreference_found.group(0))
print(backreference_also_found.group(0))

# badtext contains neither 'How' nor 'do', so backreference_not_found is None.
# Calling backreference_not_found.group(0) would raise an AttributeError.


do you do
How do you do fellow kids How

Greed


In [55]:
# Small HTML snippet with two tag pairs; input for the greedy / non-greedy cells.
html = "<h1>Header!</h1> <p>Paragraph!</p>"

In [56]:
regex_greedy = r'<.*>'
## .* == greedy: it takes as much as it can, so the match runs from the
## first '<' all the way to the last '>' in the string
greedy = re.findall(regex_greedy, html)
print(greedy)


['<h1>Header!</h1> <p>Paragraph!</p>']

In [57]:
regex_not_greedy = r'<.*?>'
regex_not_greedy_words = r'<.*?>(.*?)<.*?>'
## *? == not greedy! It takes as little as it can, so each tag matches on its own

not_greedy = re.findall(regex_not_greedy, html)
not_greedy_words = re.findall(regex_not_greedy_words, html)

print(not_greedy)
print(not_greedy_words)

## NOTE: when the pattern contains exactly one capturing group, findall returns
## the text of that group for each match instead of the whole match. With more
## than one group it returns a list of tuples, one tuple per match.


['<h1>', '</h1>', '<p>', '</p>']
['Header!', 'Paragraph!']