Regular Expressions in Python

Notebook by Max Schwartz, edits by Rachel Rakov



In [33]:

    
import re

Search



In [34]:

    
# Sample sentence that the search, match, findall and group cells below all reuse.
text_to_search = "How do you do, fellow kids?"



In [36]:

    
# re.search scans the whole string and returns a match object for the first
# hit, or None when the pattern occurs nowhere in the text.
regex_search = r'do'
# Patterns are case-sensitive by default, so 'how' does not match 'How'.
regmine = r'how'
searched = re.search(regex_search, text_to_search)
also_searched = re.search(regmine, text_to_search)
# Single-argument, parenthesised print runs unchanged on Python 2 and Python 3.
print(type(searched))
print(type(also_searched))
print('')

## If you forget to print the type, printing the result itself still shows
## whether you got a match object or None.
print(searched)
print(also_searched)









    



<type '_sre.SRE_Match'>
<type 'NoneType'>

<_sre.SRE_Match object at 0x0338AA30>
None



In [37]:

    
# A failed search yields None, so test the result before calling .group();
# group(0) is the full text matched by the pattern.
if searched:
    print(searched.group(0))
else:
    print('Nothing found')

do

Match



In [42]:

    
# re.match only tries the pattern at the very start of the string, so a word
# that appears later in the sentence ('fellow') produces None.
regex_match_fail = r'fellow'
not_matched = re.match(regex_match_fail, text_to_search)
print(type(not_matched))









    



<type 'NoneType'>



In [39]:

    
# not_matched is None here, so the else branch runs.
if not_matched:
    print(not_matched.group(0))
else:
    print('Nothing matched')









    



Nothing matched



In [40]:

    
# 'How' sits at the start of the sentence, so re.match succeeds this time.
regex_match = r'How'
matched = re.match(regex_match, text_to_search)
print(type(matched))









    



<type '_sre.SRE_Match'>



In [41]:

    
# Guard against None before asking the match object for its text.
if matched:
    print(matched.group(0))
else:
    print('Nothing matched')

How

Find All



In [43]:

    
regex_findall = r'\wo\w?'
## A breakdown of this RE:
## \w == one word character (letter, digit or underscore)
## o  == a literal 'o'
## \w == one more word character ...
## ?  == ... which is optional (zero or one occurrence)

# re.findall returns every non-overlapping match as a list of strings.
found = re.findall(regex_findall, text_to_search)
print(type(found))









    



<type 'list'>



In [44]:

    
print(found)
## Note that this returns "low", which is only a fragment of "fellow" and
## not a word in our corpus: the pattern matches substrings, not whole words.









    



['How', 'do', 'you', 'do', 'low']

Sub



In [46]:

    
# Base sentence for the substitution demo: every word ends in "og".
text_to_change = "frog bog log cog nog fog"
# Same sentence extended with longer words to stress the {1,2} quantifier:
##   schlog, nschlog and grog
more_text_to_change = text_to_change + " schlog nschlog grog"



In [47]:

    
regex_sub = r'[^og ]{1,2}'
# [^og ] == any character that is not 'o', not 'g' and not a space
# {1,2}  == one or two of those characters in a row
# re.sub replaces each match with 'd' and returns a new string.
subbed = re.sub(regex_sub, 'd', text_to_change)
subbed_again = re.sub(regex_sub, 'd', more_text_to_change)
print(type(subbed))









    



<type 'str'>



In [48]:

    
# Runs longer than two characters are replaced in chunks, so the longer words
# pick up more than one 'd'.
print(subbed)
print(subbed_again)









    



dog dog dog dog dog dog
dog dog dog dog dog dog ddog dddog gdog

Split



In [ ]:

    
# Words glued together by runs of one to three digits; input for re.split.
text_to_split = "This1is2some34text567to89split"



In [ ]:

    
# \d+ == one or more digits; each run of digits acts as a single separator.
split_text = re.split(r'\d+', text_to_split)
print(split_text)

Groups



In [50]:

    
# Parentheses create capturing groups, numbered by their opening parenthesis:
# group 1 is the outer (.*(fellow).*), group 2 is the nested (fellow).
regex_groups = r'How(.*(fellow).*)\?'
groups = re.search(regex_groups, text_to_search)
# group(0) is always the entire match; group(n) is the n-th capturing group.
print('Group 0: ' + groups.group(0))
print('Group 1: ' + groups.group(1))
print('Group 2: ' + groups.group(2))









    



Group 0: How do you do, fellow kids?
Group 1:  do you do, fellow kids
Group 2: fellow



In [54]:

    
newtext = "How do you do fellow kids How"
badtext = "fish cat fish cat"
# \1 must repeat exactly the text captured by group 1 ('How' or 'do').
regex_backreference = r'(How|do).*\1'
backreference_found = re.search(regex_backreference, text_to_search)
backreference_also_found = re.search(regex_backreference, newtext)
# badtext contains neither 'How' nor 'do', so this search returns None.
backreference_not_found = re.search(regex_backreference, badtext)

print(backreference_found.group(0))
print(backreference_also_found.group(0))

# print(backreference_not_found.group(0))  ---> raises AttributeError, because None has no .group()









    



do you do
How do you do fellow kids How

Greed



In [55]:

    
# Tiny HTML snippet with two elements, used to contrast greedy and lazy matching.
html = "<h1>Header!</h1> <p>Paragraph!</p>"



In [56]:

    
# .* is greedy: it grabs as much as it can, so the match runs from the first
# '<' all the way to the last '>' in the string.
regex_greedy = r'<.*>'
greedy = re.findall(regex_greedy, html)
print(greedy)









    



['<h1>Header!</h1> <p>Paragraph!</p>']



In [57]:

    
regex_not_greedy = r'<.*?>'
regex_not_greedy_words = r'<.*?>(.*?)<.*?>'
## *? == not greedy! It matches as little as possible.

not_greedy = re.findall(regex_not_greedy, html)
not_greedy_words = re.findall(regex_not_greedy_words, html)

print(not_greedy)
print(not_greedy_words)

## NOTE: when the pattern contains exactly one capturing group, findall returns
## the text of that group rather than the whole match. With several groups it
## returns a list of tuples, one entry per group.









    



['<h1>', '</h1>', '<p>', '</p>']
['Header!', 'Paragraph!']