In [ ]:
# regular expression
# pattern matching and regex
# reference
# https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html

In [5]:
# Plain string comparison: only the exact spellings 'yes' and 'YES' are accepted;
# any other input (including mixed case such as 'Yes') takes the else branch.
answer = raw_input("do you want to come to movies ?")
if answer == 'yes' or answer == 'YES':
    print "I will buy you a popcorn."
else:
    print "You better buy your own popcorn."


do you want to come to movies ?YES
I will buy you a popcorn.

In [6]:
import re

In [7]:
print dir(re)


['DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '__version__', '_alphanum', '_cache', '_cache_repl', '_compile', '_compile_repl', '_expand', '_pattern_type', '_pickle', '_subx', 'compile', 'copy_reg', 'error', 'escape', 'findall', 'finditer', 'match', 'purge', 'search', 'split', 'sre_compile', 'sre_parse', 'sub', 'subn', 'sys', 'template']

In [8]:
my_string="python"

In [9]:
# help() prints the documentation itself and returns None, so wrapping it in
# print also emits a trailing "None" line.
print help(re.match)
# Note: re.match only tests whether the pattern matches at the START of the
# string; the pattern may cover just a leading part of the string.


Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found.

None

In [10]:
print re.match('py',my_string)


<_sre.SRE_Match object at 0x7f82ac95d2a0>

In [11]:
print type(re.match('py',my_string))


<type '_sre.SRE_Match'>

In [12]:
# 'yt' occurs inside "python" but not at its start, so match() returns None.
print re.match('yt',my_string)


None

In [13]:
# Matching is case-sensitive by default: "Pyth" does not match "python".
print re.match("Pyth",my_string)


None

In [14]:
# IGNORECASE
# re.I (short for re.IGNORECASE) makes the match case-insensitive, so "Pyth"
# now matches "python".
print re.match("Pyth",my_string,re.I)


<_sre.SRE_Match object at 0x7f82ac95d168>

In [17]:
import re
answer = raw_input("do you want to come to movies ?")
#if answer == 'yes' or answer == 'YES':
if re.match(answer,'yes',re.I):
    print "I will buy you a popcorn."
else:
    print "You better buy your own popcorn."


do you want to come to movies ?no
You better buy your own popcorn.

In [18]:
# re.search
# help() prints the text itself and returns None; print then shows that None.
print help(re.search)


Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.

None

In [19]:
# Two sample sentences: the first begins with "python", the second has it
# near the end.
my_sentence1, my_sentence2 = (
    "python is a good language.",
    "one of the good languages is python.",
)

In [20]:
# match() only tests the start of the string; search() scans the whole string.
print re.match("python",my_sentence1)  # match object: sentence starts with "python"
print re.match("python",my_sentence2)  # None: "python" is not at the start
print re.search("python",my_sentence1) # match object: found at the start
print re.search("python",my_sentence2) # match object: found later in the sentence


<_sre.SRE_Match object at 0x7f82ac95db28>
None
<_sre.SRE_Match object at 0x7f82ac95db28>
<_sre.SRE_Match object at 0x7f82ac95db28>

In [21]:
# compile

In [22]:
print help(re.compile)


Help on function compile in module re:

compile(pattern, flags=0)
    Compile a regular expression pattern, returning a pattern object.

None

In [23]:
reg1 = re.compile("python")

In [25]:
print reg1
print type(reg1)
print dir(reg1)


<_sre.SRE_Pattern object at 0x7f82b6e87618>
<type '_sre.SRE_Pattern'>
['__class__', '__copy__', '__deepcopy__', '__delattr__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'findall', 'finditer', 'flags', 'groupindex', 'groups', 'match', 'pattern', 'scanner', 'search', 'split', 'sub', 'subn']

In [26]:
print help(reg1.match)


Help on built-in function match:

match(...)
    match(string[, pos[, endpos]]) --> match object or None.
    Matches zero or more characters at the beginning of the string

None

In [27]:
# Same checks as before, but through the compiled pattern's own methods.
print reg1.match(my_sentence1) # true
print reg1.match(my_sentence2) # false
print reg1.search(my_sentence1) # true
print reg1.search(my_sentence2) # true


<_sre.SRE_Match object at 0x7f82ac95dd30>
None
<_sre.SRE_Match object at 0x7f82ac95dd30>
<_sre.SRE_Match object at 0x7f82ac95dd30>

In [28]:
# special characters
# ^ -> caret -> matches at the beginning of the line/sentence.
# $ -> dollar -> matches at the end of the line/sentence.
# . -> dot -> matches any one character (except a newline, unless re.DOTALL is given).

In [33]:
# ^ - caret
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python."

In [31]:
reg1 = re.compile("^python")

In [32]:
print reg1.match(my_sentence1) # true
print reg1.match(my_sentence2) # false
print reg1.search(my_sentence1) # true
print reg1.search(my_sentence2) # false


<_sre.SRE_Match object at 0x7f82ac0e5100>
None
<_sre.SRE_Match object at 0x7f82ac0e5100>
None

In [39]:
# $ - dollar
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python."

In [35]:
# "python$" requires the text to END with "python". Both sample sentences end
# with a trailing ".", so this pattern matches neither of them; a pattern such
# as "python[.]$" would be needed to match my_sentence2 via search().
reg1 = re.compile("python$")

In [40]:
# Every call returns None: both sentences end with ".", not with "python".
print reg1.match(my_sentence1)
print reg1.match(my_sentence2)
print reg1.search(my_sentence1)
print reg1.search(my_sentence2)


None
None
None
None

In [41]:
# . - dot

In [42]:
my_string = "python"
print re.match('...',my_string)


<_sre.SRE_Match object at 0x7f82ac0e54a8>

In [46]:
# group
# Printing the match object only shows its repr; call .group() on it to get
# the matched text itself.
print re.match('...',my_string)


<_sre.SRE_Match object at 0x7f82ac0c96b0>

In [47]:
# length of words
# Sample names of varying lengths, used to pick out the five-letter ones.
my_students = ['abhi','rakesh','vishu','akshay','anuhya','varun','viraja']

In [51]:
import re
for value in my_students:
    if re.match('^.....$',value):
        print re.match('^.....$',value).group()


vishu
varun

In [52]:
# DOTALL
my_string = "django\n"

In [53]:
print my_string


django


In [54]:
# Seven dots need seven characters, but "." does not match the trailing "\n"
# of "django\n", so only six characters are available and the match fails.
print re.match('.......',my_string)


None

In [55]:
print re.match('.......',my_string,re.DOTALL)


<_sre.SRE_Match object at 0x7f82ac0c9ac0>

In [56]:
print re.match('.......',my_string,re.DOTALL).group()


django


In [57]:
# quantifiers (repetition characters)
# greedy by default: they match as much as possible

In [58]:
# * -> zero or more repetitions of the preceding character.
# + -> one or more repetitions of the preceding character.
# ? -> zero or one occurrence of the preceding character.

In [59]:
# Four spellings of the same title, differing only in the number of leading
# "a" characters (1, 2, 3 and 0).
film1, film2, film3, film4 = "ashique", "aashique", "aaashique", "shique"

In [60]:
reg = re.compile('a*shique') # a* -> a repeated zero or more times.

In [61]:
print reg.match(film1)
print reg.match(film2)
print reg.match(film3)
print reg.match(film4)
print reg.search(film1)
print reg.search(film2)
print reg.search(film3)
print reg.search(film4)


<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>

In [62]:
# +
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"
reg = re.compile('a+shique') # a+ -> a repeated one or more times.
print reg.match(film1) # true
print reg.match(film2) # true
print reg.match(film3) # true
print reg.match(film4) # false
print reg.search(film1) # true
print reg.search(film2) # true
print reg.search(film3) # true
print reg.search(film4) # false


<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
None
<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
None

In [64]:
# ?
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"

reg = re.compile('a?shique') # a repeated zero or one time

# match() is anchored at the start, so a second leading "a" makes it fail;
# search() can begin later in the string and still finds "ashique" inside
# "aashique" and "aaashique".
print reg.match(film1) # true
print reg.match(film2) # false
print reg.match(film3) # false
print reg.match(film4) # true
print reg.search(film1) # true
print reg.search(film2) # true
print reg.search(film3) # true
print reg.search(film4) # true


<_sre.SRE_Match object at 0x7f8297fc3100>
None
None
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>

In [66]:
# example
my_string1 = "<movie1>bahubali</movie1>"
my_string2 = "<book2>alchemy</book1>"
my_string3 = "<food3>biriyani</food1>"

In [68]:
# *,+ always go for maximal matching.
# Greedy: ".*" runs on to the LAST ">" in the string, so the whole tag pair
# (opening tag, text and closing tag) is matched.
import re
reg = re.compile('<.*>')
print reg.match(my_string1).group()
print reg.match(my_string2).group()
print reg.match(my_string3).group()


<movie1>bahubali</movie1>
<book2>alchemy</book1>
<food3>biriyani</food1>

In [69]:
# *?,+? always go for minimal matching.
# Non-greedy: ".*?" stops at the FIRST ">", so only the opening tag is matched.
import re
reg = re.compile('<.*?>')
print reg.match(my_string1).group()
print reg.match(my_string2).group()
print reg.match(my_string3).group()


<movie1>
<book2>
<food3>

In [70]:
# Repetition counts (quantifiers)
# {m} -> m is a number -> the preceding element repeated exactly m times.
# {m,} -> m is a number -> the preceding element repeated m or more times.
# {m,n} -> the preceding element repeated between m and n times.

In [ ]:
# character sets
# [a-z] -> any one character from a to z.
# [0-9] -> any one digit from 0 to 9.
# [^a-z] -> negation -> any one character that is NOT in a to z.
# ^[a-z] -> a character from a to z at the start of the string.
# [.+*?] -> inside [] the special characters . + * ? lose their special meaning and match literally.

In [71]:
my_sentence="today its raining hard."

In [76]:
print re.match('[a-z]+\s+',my_sentence).group()


today 

In [78]:
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group()


today its raining hard.

In [80]:
print re.match('([a-z]+\s+){3}[a-z]+[.]',my_sentence).group()


today its raining hard.

In [81]:
print re.match('(\w+\s+){3}\w+[.]',my_sentence).group()


today its raining hard.

In [83]:
# re.VERBOSE
print re.match('''
(\w+\s+)   # a word and a space ,ex: ladoo
{3}        # similar word as above repeated 3 times.
\w+        # a word without any space
[.]        # remember as sentence should have a dot.
''',my_sentence,re.VERBOSE).group()


today its raining hard.

In [97]:
# grouping
# group() -> states what exactly matched.
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group()
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group(0)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group()
# index based
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(1)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(2)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(1,2)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).groups()
# key based
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group()
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group('barish')
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group('ajtak')


today its raining hard.
today its raining hard.
today its raining hard.
today
raining
('today', 'raining')
('today', 'raining')
today its raining hard.
raining
today

In [98]:
# MULTILINE
my_sentence="Today is cloudy.\nToday i want to eat haleem.\nToday i may rain."

In [99]:
print my_sentence


Today is cloudy.
Today i want to eat haleem.
Today i may rain.

In [100]:
# match - always looks at the beginning of the string.
# re.I lets the lowercase pattern 'today' match the capitalised "Today" that
# starts the text.
print re.match('today',my_sentence,re.I).group()


Today

In [ ]: