Notebook by Max Schwartz, edits by Rachel Rakov
In [33]:
import re
In [34]:
# Sample sentence that the search/match/findall/group examples run against.
text_to_search = "How do you do, fellow kids?"
In [36]:
# re.search scans the whole string and returns a match object for the first
# place the pattern occurs, or None if it does not occur at all.
regex_search = r'do'
# Lowercase 'how' does not occur in the text ('How' is capitalised and no
# flags are passed), so this search comes back as None.
regmine = r'how'
searched = re.search(regex_search, text_to_search)
also_searched = re.search(regmine, text_to_search)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(type(searched))
print(type(also_searched))
print('')  # blank separator line
## Even if you forget to print the type, printing the result itself shows
## whether you got a match object or None.
print(searched)
print(also_searched)
In [37]:
# A match object is truthy and None is falsy, so the result of re.search can
# be tested directly before calling .group() on it.
if searched:
    print(searched.group(0))  # group(0) is the entire matched text
else:
    print('Nothing found')
In [42]:
# re.match only tries the pattern at the very start of the string. 'fellow'
# appears later in the text, not at position 0, so the result is None.
regex_match_fail = r'fellow'
not_matched = re.match(regex_match_fail, text_to_search)
print(type(not_matched))
In [39]:
# Guard against a None result before asking for the matched text.
if not_matched:
    print(not_matched.group(0))
else:
    print('Nothing matched')
In [40]:
# 'How' is the first thing in the text, so re.match (which is anchored to the
# start of the string) succeeds here.
regex_match = r'How'
matched = re.match(regex_match, text_to_search)
print(type(matched))
In [41]:
# Same None-guard as before; print the matched text when there is a match.
if matched:
    print(matched.group(0))
else:
    print('Nothing matched')
In [43]:
regex_findall = r'\wo\w?'
## A breakdown of this RE:
## \w == one word character (alphanumeric or underscore)
## o  == a literal "o"
## \w == another word character...
## ?  == ...which is optional (zero or one of them)
# re.findall returns a list of every non-overlapping match, not a match object.
found = re.findall(regex_findall, text_to_search)
print(type(found))
In [44]:
print(found)
## Note that this returns "low" (taken from inside "fellow"), which is not a
## word in our corpus! The pattern has no word-boundary anchors, so it is free
## to match in the middle of a word.
In [46]:
# Input strings for the substitution example.
text_to_change = 'frog bog log cog nog fog'
# The same words, extended with three more: schlog, nschlog and grog.
more_text_to_change = text_to_change + ' schlog nschlog grog'
In [47]:
regex_sub = r'[^og ]{1,2}'
# [^og ] == any single character that is not "o", not "g" and not a space
# {1,2}  == one or two of those characters in a row
# re.sub replaces each such run with a single 'd' and returns a new string.
subbed = re.sub(regex_sub, 'd', text_to_change)
subbed_again = re.sub(regex_sub, 'd', more_text_to_change)
print(type(subbed))
In [48]:
# Show the substituted strings; runs longer than two characters are replaced
# in chunks of at most two, so they yield more than one 'd'.
print(subbed)
print(subbed_again)
In [ ]:
# Words glued together by runs of digits; used by the re.split example.
text_to_split = "This1is2some34text567to89split"
In [ ]:
# \d+ treats every run of one or more digits as a single separator, so
# "34" and "567" each split the text only once.
split_text = re.split(r'\d+', text_to_split)
print(split_text)
In [50]:
regex_groups = r'How(.*(fellow).*)\?'
# Groups are numbered by the position of their opening parenthesis:
#   group 0 == the whole match
#   group 1 == the outer parentheses: the text between "How" and the "?"
#   group 2 == the inner parentheses: "fellow"
groups = re.search(regex_groups, text_to_search)
print('Group 0: ' + groups.group(0))
print('Group 1: ' + groups.group(1))
print('Group 2: ' + groups.group(2))
In [54]:
newtext = "How do you do fellow kids How"
badtext = "fish cat fish cat"
# \1 is a backreference: it must repeat exactly the text that group 1
# captured (either "How" or "do"), not merely match the same pattern again.
regex_backreference = r'(How|do).*\1'
backreference_found = re.search(regex_backreference, text_to_search)
backreference_also_found = re.search(regex_backreference, newtext)
backreference_not_found = re.search(regex_backreference, badtext)
print(backreference_found.group(0))
print(backreference_also_found.group(0))
# badtext contains neither "How" nor "do", so re.search returns None there;
# calling backreference_not_found.group(0) would raise an AttributeError.
In [55]:
# Small HTML snippet used to contrast greedy and non-greedy quantifiers.
html = "<h1>Header!</h1> <p>Paragraph!</p>"
In [56]:
# .* is greedy: it consumes as much as it can, so a match runs from the
# first "<" all the way to the last ">" rather than stopping at one tag.
regex_greedy = r'<.*>'
greedy = re.findall(regex_greedy, html)
print(greedy)
In [57]:
regex_not_greedy = r'<.*?>'
regex_not_greedy_words = r'<.*?>(.*?)<.*?>'
## *? == not greedy! It matches as little as possible, so each tag is
## matched on its own.
not_greedy = re.findall(regex_not_greedy, html)
not_greedy_words = re.findall(regex_not_greedy_words, html)
print(not_greedy)
print(not_greedy_words)
## NOTE: when the pattern contains exactly one capturing group, findall
## returns the text of that group instead of the whole match (which is why
## only the words between the tags are listed). With more than one group it
## returns a list of tuples, one item per group.