In [ ]:
# regular expression
# pattern matching and regex
# reference
# https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html
In [5]:
answer = raw_input("do you want to come to movies ?")
if answer == 'yes' or answer == 'YES':
print "I will buy you a popcorn."
else:
print "You better buy your own popcorn."
In [6]:
import re
In [7]:
# List the names of all attributes (functions, flags, classes) defined by the re module.
print dir(re)
In [8]:
# Sample target string used by the re.match() examples in the next cells.
my_string="python"
In [9]:
# Show the built-in documentation for re.match().
# help() writes the text itself and returns None, so it is called bare;
# wrapping it in `print` would append a stray "None" line to the output.
help(re.match)
# Note: the pattern does not have to cover the whole string - match()
# succeeds as soon as the pattern matches a leading part (prefix) of it.
In [10]:
# "python" starts with 'py', so match() returns (and this prints) a match object.
print re.match('py',my_string)
In [11]:
# A successful match is an object (not True/False); print its type.
print type(re.match('py',my_string))
In [12]:
# 'yt' occurs inside "python" but not at position 0; match() only tries the
# start of the string, so this prints None.
print re.match('yt',my_string)
In [13]:
# Matching is case-sensitive by default: "Pyth" does not match "pyth" -> None.
print re.match("Pyth",my_string)
In [14]:
# IGNORECASE
# re.I is the short alias of re.IGNORECASE: letters are compared without
# regard to case, so "Pyth" now matches the start of "python".
print re.match("Pyth",my_string,re.I)
In [17]:
import re
answer = raw_input("do you want to come to movies ?")
#if answer == 'yes' or answer == 'YES':
if re.match(answer,'yes',re.I):
print "I will buy you a popcorn."
else:
print "You better buy your own popcorn."
In [18]:
# re.search
# Show the built-in documentation for re.search().  help() prints the text
# itself and returns None, so it is not wrapped in `print` (that would add
# a stray "None" line).
help(re.search)
In [19]:
# Two sample sentences: "python" is the first word of one and appears near
# the end of the other.  Used to contrast match() with search().
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python."
In [20]:
# match() only tries the pattern at position 0; search() scans the whole string.
print re.match("python",my_sentence1) # match object: the sentence starts with "python"
print re.match("python",my_sentence2) # None: "python" is not at the start
print re.search("python",my_sentence1) # match object
print re.search("python",my_sentence2) # match object: found later in the string
In [21]:
# compile
In [22]:
# Show the built-in documentation for re.compile().  help() prints the text
# itself and returns None, so it is not wrapped in `print` (that would add
# a stray "None" line).
help(re.compile)
In [23]:
# Compile the pattern once; the resulting object can be reused for many matches.
reg1 = re.compile("python")
In [25]:
# Inspect the compiled pattern object: its printed form, its type, and the
# names of the attributes/methods it provides.
print reg1
print type(reg1)
print dir(reg1)
In [26]:
# Show the documentation of the compiled pattern's match() method.  help()
# prints the text itself and returns None, so it is not wrapped in `print`
# (that would add a stray "None" line).
help(reg1.match)
In [27]:
# Same comparison as with the module-level functions, now through the
# compiled pattern's own methods (the pattern is no longer passed in).
print reg1.match(my_sentence1)   # anchored at position 0
print reg1.match(my_sentence2)   # anchored at position 0
print reg1.search(my_sentence1)  # scans the whole string
print reg1.search(my_sentence2)  # scans the whole string
In [28]:
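# special characters
# ^ -> caret  -> matches at the beginning of the string (and of every line with re.MULTILINE).
# $ -> dollar -> matches at the end of the string, or just before a trailing newline.
# . -> dot    -> matches any one character except a newline (any character at all with re.DOTALL).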
In [33]:
# ^ - caret
# Sample sentences for the caret demo: one begins with "python", the other does not.
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python."
In [31]:
# The leading ^ pins the pattern to the beginning of the string.
reg1 = re.compile("^python")
In [32]:
# With "^python", search() can no longer succeed further into the string,
# so it gives the same answers as match().
print reg1.match(my_sentence1) # true  - sentence starts with "python"
print reg1.match(my_sentence2) # false - returns None
print reg1.search(my_sentence1) # true
print reg1.search(my_sentence2) # false - "python" is present but not at the start
In [39]:
# $ - dollar
# Sample sentences for the end-of-string demo.  The second sentence has no
# trailing period on purpose: the pattern "python$" requires "python" to be
# the very last thing in the string, so a final "." would prevent it from
# ever matching and every call in the demo would print None.
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python"
In [35]:
# The trailing $ pins the pattern to the end of the string.
reg1 = re.compile("python$")
In [40]:
# Try the $-anchored pattern on both sentences.  match() additionally
# requires the pattern to start at position 0; search() may start anywhere.
# Each call prints a match object on success and None otherwise.
print reg1.match(my_sentence1)
print reg1.match(my_sentence2)
print reg1.search(my_sentence1)
print reg1.search(my_sentence2)
In [41]:
# . - dot
In [42]:
my_string = "python"
# Each dot consumes exactly one character, so '...' matches the first three
# characters of the string; the match object is printed.
print re.match('...',my_string)
In [46]:
# group
print re.match('...',my_string)
In [47]:
# length of words
# Names of varying length; the next cell filters them by length with a regex.
my_students = ['abhi','rakesh','vishu','akshay','anuhya','varun','viraja']
In [51]:
import re
for value in my_students:
if re.match('^.....$',value):
print re.match('^.....$',value).group()
In [52]:
# DOTALL
# Six letters followed by a newline: seven characters in total.
my_string = "django\n"
In [53]:
# Display the string; a newline embedded in it shows up as an extra blank line.
print my_string
In [54]:
# Seven dots need seven characters, but by default '.' does not match a
# newline, so a string whose 7th character is '\n' gives None.
print re.match('.......',my_string)
In [55]:
# With re.DOTALL the dot matches every character including '\n'.
print re.match('.......',my_string,re.DOTALL)
In [56]:
# group() returns the matched text itself (seven characters, newline included).
print re.match('.......',my_string,re.DOTALL).group()
In [57]:
# repetition characters (quantifiers)
# greedy by default: they match as many repetitions as possible
In [58]:
# * -> zero or more repetitions of the preceding element.
# + -> one or more repetitions of the preceding element.
# ? -> zero or one occurrence of the preceding element.
In [59]:
# Test words with one, two, three and zero leading 'a' characters before "shique".
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"
In [60]:
# The * applies only to the single 'a' in front of it, not to the whole word.
reg = re.compile('a*shique') # a* -> a repeated zero or more times.
In [61]:
# Apply the compiled pattern to every sample word, first anchored at the
# start (match) and then anywhere in the string (search).  Each call prints
# a match object on success and None otherwise.
print reg.match(film1)
print reg.match(film2)
print reg.match(film3)
print reg.match(film4)
print reg.search(film1)
print reg.search(film2)
print reg.search(film3)
print reg.search(film4)
In [62]:
# +
# Same sample words, now requiring at least one 'a' before "shique".
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"
reg = re.compile('a+shique') # a+ -> a repeated one or more times.
print reg.match(film1) # true  - one 'a'
print reg.match(film2) # true  - two 'a'
print reg.match(film3) # true  - three 'a'
print reg.match(film4) # false - no 'a' at all, prints None
print reg.search(film1) # true
print reg.search(film2) # true
print reg.search(film3) # true
print reg.search(film4) # false - there is no 'a' anywhere in the word
In [64]:
# ?
# Same sample words, now allowing at most one 'a' before "shique".
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"
reg = re.compile('a?shique') # a repeated zero or one time
print reg.match(film1) # true  - one 'a'
print reg.match(film2) # false - two 'a' at the start is one too many for match()
print reg.match(film3) # false - three 'a' at the start
print reg.match(film4) # true  - zero 'a'
print reg.search(film1) # true
print reg.search(film2) # true  - search() skips ahead and matches the tail "ashique"
print reg.search(film3) # true  - same: matches the tail "ashique"
print reg.search(film4) # true
In [66]:
# example
# Three tagged samples for the greedy / non-greedy demo.  Every closing tag
# carries the same name as its opening tag.
my_string1 = "<movie1>bahubali</movie1>"
my_string2 = "<book2>alchemy</book2>"
my_string3 = "<food3>biriyani</food3>"
In [68]:
# *,+ always go for maximal matching.
# (greedy) '.*' consumes as much as it can, so '<.*>' runs from the first
# '<' to the LAST '>' and each call prints the entire string.
import re
reg = re.compile('<.*>')
print reg.match(my_string1).group()
print reg.match(my_string2).group()
print reg.match(my_string3).group()
In [69]:
# *?,+? always go for minimal matching.
# (non-greedy) '.*?' consumes as little as it can, so '<.*?>' stops at the
# FIRST '>' and each call prints only the opening tag.
import re
reg = re.compile('<.*?>')
print reg.match(my_string1).group()
print reg.match(my_string2).group()
print reg.match(my_string3).group()
In [70]:
# Repetition counts (quantifiers)
# {m}   -> m is a number -> the preceding element repeated exactly m times.
# {m,}  -> m is a number -> the preceding element repeated m or more times.
# {m,n} -> the preceding element repeated between m and n times (inclusive).
In [ ]:
# character sets
# [a-z] -> any one character between a and z.
# [0-9] -> any one digit between 0 and 9.
# [^a-z] -> negation -> any one character that is NOT between a and z.
# ^[a-z] -> a string starting with a character between a and z.
# [.+*?] -> inside a set, special characters lose their meaning and match themselves literally.
In [71]:
# Four lowercase words separated by single spaces and ended by a period.
my_sentence="today its raining hard."
In [76]:
print re.match('[a-z]+\s+',my_sentence).group()
In [78]:
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group()
In [80]:
print re.match('([a-z]+\s+){3}[a-z]+[.]',my_sentence).group()
In [81]:
print re.match('(\w+\s+){3}\w+[.]',my_sentence).group()
In [83]:
# re.VERBOSE
print re.match('''
(\w+\s+) # a word and a space ,ex: ladoo
{3} # similar word as above repeated 3 times.
\w+ # a word without any space
[.] # remember as sentence should have a dot.
''',my_sentence,re.VERBOSE).group()
In [97]:
# grouping
# group() -> states what exactly matched.
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group()
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group(0)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group()
# index based
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(1)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(2)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(1,2)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).groups()
# key based
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group()
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group('barish')
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group('ajtak')
In [98]:
# MULTILINE
# One string holding three lines (separated by '\n'), each starting with "Today".
my_sentence="Today is cloudy.\nToday i want to eat haleem.\nToday i may rain."
In [99]:
# Display the text; each embedded '\n' starts a new output line.
print my_sentence
In [100]:
# match - always looks at the beginning of the string.
# re.I lets the lowercase pattern 'today' match regardless of capitalisation;
# group() returns the text exactly as it appears in the string.
print re.match('today',my_sentence,re.I).group()
In [101]:
# search - scans till it finds a string.
# Only the first occurrence is returned, even if the word appears again later.
print re.search('today',my_sentence,re.I).group()
In [102]:
# findall
# Show the built-in documentation for re.findall().  help() prints the text
# itself and returns None, so it is not wrapped in `print` (that would add
# a stray "None" line).
help(re.findall)
In [103]:
# findall() returns a list with every non-overlapping occurrence in the string.
print re.findall('today',my_sentence,re.I)
In [104]:
# Without re.MULTILINE, ^ matches only at the very start of the whole string,
# so at most one occurrence can be found.
print re.findall('^today',my_sentence,re.I)
In [105]:
# Bare expression: the notebook echoes the string's repr, with newlines shown as \n.
my_sentence
Out[105]:
In [106]:
# Flags are combined with |.  With re.M (re.MULTILINE), ^ also matches right
# after every newline, so a line-initial occurrence is found on each line.
print re.findall('^today',my_sentence,(re.I|re.M))
In [ ]: