notebook.community

Edit and run



In [ ]:

    
# regular expression
# pattern matching and regex
# reference
# https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html



In [5]:

    
answer = raw_input("do you want to come to movies ?")
if answer == 'yes' or answer == 'YES':
    print "I will buy you a popcorn."
else:
    print "You better buy your own popcorn."









    



do you want to come to movies ?YES
I will buy you a popcorn.



In [6]:

    
import re



In [7]:

    
print dir(re)









    



['DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '__version__', '_alphanum', '_cache', '_cache_repl', '_compile', '_compile_repl', '_expand', '_pattern_type', '_pickle', '_subx', 'compile', 'copy_reg', 'error', 'escape', 'findall', 'finditer', 'match', 'purge', 'search', 'split', 'sre_compile', 'sre_parse', 'sub', 'subn', 'sys', 'template']



In [8]:

    
my_string="python"



In [9]:

    
print help(re.match)
# Note: the pattern only needs to match a leading portion of the string (starting at position 0), not the whole string.









    



Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found.

None



In [10]:

    
print re.match('py',my_string)









    



<_sre.SRE_Match object at 0x7f82ac95d2a0>



In [11]:

    
print type(re.match('py',my_string))









    



<type '_sre.SRE_Match'>



In [12]:

    
print re.match('yt',my_string)









    



None



In [13]:

    
print re.match("Pyth",my_string)









    



None



In [14]:

    
# IGNORECASE
print re.match("Pyth",my_string,re.I)









    



<_sre.SRE_Match object at 0x7f82ac95d168>



In [17]:

    
import re
answer = raw_input("do you want to come to movies ?")
#if answer == 'yes' or answer == 'YES':
if re.match(answer,'yes',re.I):
    print "I will buy you a popcorn."
else:
    print "You better buy your own popcorn."









    



do you want to come to movies ?no
You better buy your own popcorn.



In [18]:

    
# re.search
print help(re.search)









    



Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.

None



In [19]:

    
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python."



In [20]:

    
print re.match("python",my_sentence1)  # true
print re.match("python",my_sentence2)  # false
print re.search("python",my_sentence1) # true
print re.search("python",my_sentence2) # true









    



<_sre.SRE_Match object at 0x7f82ac95db28>
None
<_sre.SRE_Match object at 0x7f82ac95db28>
<_sre.SRE_Match object at 0x7f82ac95db28>



In [21]:

    
# compile



In [22]:

    
print help(re.compile)









    



Help on function compile in module re:

compile(pattern, flags=0)
    Compile a regular expression pattern, returning a pattern object.

None



In [23]:

    
reg1 = re.compile("python")



In [25]:

    
print reg1
print type(reg1)
print dir(reg1)









    



<_sre.SRE_Pattern object at 0x7f82b6e87618>
<type '_sre.SRE_Pattern'>
['__class__', '__copy__', '__deepcopy__', '__delattr__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'findall', 'finditer', 'flags', 'groupindex', 'groups', 'match', 'pattern', 'scanner', 'search', 'split', 'sub', 'subn']



In [26]:

    
print help(reg1.match)









    



Help on built-in function match:

match(...)
    match(string[, pos[, endpos]]) --> match object or None.
    Matches zero or more characters at the beginning of the string

None



In [27]:

    
print reg1.match(my_sentence1)
print reg1.match(my_sentence2)
print reg1.search(my_sentence1)
print reg1.search(my_sentence2)









    



<_sre.SRE_Match object at 0x7f82ac95dd30>
None
<_sre.SRE_Match object at 0x7f82ac95dd30>
<_sre.SRE_Match object at 0x7f82ac95dd30>



In [28]:

    
# special characters
# ^ -> caret -> beginning of line/sentence.
# $ -> dollar -> end of the line/sentence.
# . -> dot -> any one character except a newline (unless re.DOTALL is used).



In [33]:

    
# ^ - caret
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python."



In [31]:

    
reg1 = re.compile("^python")



In [32]:

    
print reg1.match(my_sentence1) # true
print reg1.match(my_sentence2) # false
print reg1.search(my_sentence1) # true
print reg1.search(my_sentence2) # false









    



<_sre.SRE_Match object at 0x7f82ac0e5100>
None
<_sre.SRE_Match object at 0x7f82ac0e5100>
None



In [39]:

    
# $ - dollar
my_sentence1 = "python is a good language."
my_sentence2 = "one of the good languages is python."



In [35]:

    
reg1 = re.compile("python$")



In [40]:

    
# reg1 is expected to be the "python$" pattern compiled in the cell above.
# "$" requires "python" to sit right at the end of the string, but both
# sentences end with a "." character, so none of the four calls can match.
print reg1.match(my_sentence1)   # None: does not start with a match and ends in "language."
print reg1.match(my_sentence2)   # None: match() only tries at the start of the string
print reg1.search(my_sentence1)  # None: "python" is at the start, not the end
print reg1.search(my_sentence2)  # None: "python" is followed by ".", not end of string









    



None
None
None
None



In [41]:

    
# . - dot



In [42]:

    
my_string = "python"
print re.match('...',my_string)









    



<_sre.SRE_Match object at 0x7f82ac0e54a8>



In [46]:

    
# group
print re.match('...',my_string)









    



<_sre.SRE_Match object at 0x7f82ac0c96b0>



In [47]:

    
# length of words
my_students = ['abhi','rakesh','vishu','akshay','anuhya','varun','viraja']



In [51]:

    
import re
for value in my_students:
    if re.match('^.....$',value):
        print re.match('^.....$',value).group()









    



vishu
varun



In [52]:

    
# DOTALL
my_string = "django\n"



In [53]:

    
print my_string









    



django



In [54]:

    
print re.match('.......',my_string)









    



None



In [55]:

    
print re.match('.......',my_string,re.DOTALL)









    



<_sre.SRE_Match object at 0x7f82ac0c9ac0>



In [56]:

    
print re.match('.......',my_string,re.DOTALL).group()









    



django



In [57]:

    
# repetition characters (quantifiers)
# greedy: each one matches as much as it can



In [58]:

    
# * -> zero or more repetitions of the preceding element.
# + -> one or more repetitions of the preceding element.
# ? -> zero or one repetition of the preceding element.



In [59]:

    
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"



In [60]:

    
reg = re.compile('a*shique') # a* -> a repeated zero or more times.



In [61]:

    
print reg.match(film1)
print reg.match(film2)
print reg.match(film3)
print reg.match(film4)
print reg.search(film1)
print reg.search(film2)
print reg.search(film3)
print reg.search(film4)









    



<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>
<_sre.SRE_Match object at 0x7f82ac0c9ed0>



In [62]:

    
# +
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"
reg = re.compile('a+shique') # a+ -> a repeated one or more times.
print reg.match(film1) # true
print reg.match(film2) # true
print reg.match(film3) # true
print reg.match(film4) # false
print reg.search(film1) # true
print reg.search(film2) # true
print reg.search(film3) # true
print reg.search(film4) # false









    



<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
None
<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
<_sre.SRE_Match object at 0x7f82ac0c9f38>
None



In [64]:

    
# ?
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"

reg = re.compile('a?shique') # a repeated zero or one time

print reg.match(film1) # true
print reg.match(film2) # false
print reg.match(film3) # false
print reg.match(film4) # true
print reg.search(film1) # true
print reg.search(film2) # true
print reg.search(film3) # true
print reg.search(film4) # true









    



<_sre.SRE_Match object at 0x7f8297fc3100>
None
None
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>
<_sre.SRE_Match object at 0x7f8297fc3100>



In [66]:

    
# example
my_string1 = "<movie1>bahubali</movie1>"
my_string2 = "<book2>alchemy</book1>"
my_string3 = "<food3>biriyani</food1>"



In [68]:

    
# *,+ always go for maximal matching.
import re
reg = re.compile('<.*>')
print reg.match(my_string1).group()
print reg.match(my_string2).group()
print reg.match(my_string3).group()









    



<movie1>bahubali</movie1>
<book2>alchemy</book1>
<food3>biriyani</food1>



In [69]:

    
# *?,+? always go for minimal matching.
import re
reg = re.compile('<.*?>')
print reg.match(my_string1).group()
print reg.match(my_string2).group()
print reg.match(my_string3).group()









    



<movie1>
<book2>
<food3>



In [70]:

    
# Repetition counts (bounded quantifiers)
# {m} -> m is a number -> the preceding element repeated exactly m times.
# {m,} -> m is a number -> the preceding element repeated m or more times.
# {m,n} -> the preceding element repeated between m and n times.



In [ ]:

    
# character sets
# [a-z] -> any one character between a and z.
# [0-9] -> any one digit between 0 and 9.
# [^a-z] -> negation -> any one character that is not between a and z.
# ^[a-z] -> starting with a character between a and z.
# [.+*?] -> inside a set, special characters lose their special meaning and match literally.



In [71]:

    
my_sentence="today its raining hard."



In [76]:

    
print re.match('[a-z]+\s+',my_sentence).group()









    



today



In [78]:

    
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group()









    



today its raining hard.



In [80]:

    
print re.match('([a-z]+\s+){3}[a-z]+[.]',my_sentence).group()









    



today its raining hard.



In [81]:

    
print re.match('(\w+\s+){3}\w+[.]',my_sentence).group()









    



today its raining hard.



In [83]:

    
# re.VERBOSE
print re.match('''
(\w+\s+)   # a word and a space ,ex: ladoo
{3}        # similar word as above repeated 3 times.
\w+        # a word without any space
[.]        # remember as sentence should have a dot.
''',my_sentence,re.VERBOSE).group()









    



today its raining hard.



In [97]:

    
# grouping
# group() -> states what exactly matched.
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group()
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence).group(0)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group()
# index based
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(1)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(2)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).group(1,2)
print re.match('([a-z]+)\s+[a-z]+\s+([a-z]+)\s+[a-z]+[.]',my_sentence).groups()
# key based
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group()
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group('barish')
print re.match('(?P<ajtak>[a-z]+)\s+[a-z]+\s+(?P<barish>[a-z]+)\s+[a-z]+[.]',my_sentence).group('ajtak')









    



today its raining hard.
today its raining hard.
today its raining hard.
today
raining
('today', 'raining')
('today', 'raining')
today its raining hard.
raining
today



In [98]:

    
# MULTILINE
my_sentence="Today is cloudy.\nToday i want to eat haleem.\nToday i may rain."



In [99]:

    
print my_sentence









    



Today is cloudy.
Today i want to eat haleem.
Today i may rain.



In [100]:

    
# match - always looks at the beginning of the string.
print re.match('today',my_sentence,re.I).group()









    



Today



In [101]:

    
# search - scans through the string until it finds the first match.
print re.search('today',my_sentence,re.I).group()









    



Today



In [102]:

    
# findall
print help(re.findall)









    



Help on function findall in module re:

findall(pattern, string, flags=0)
    Return a list of all non-overlapping matches in the string.
    
    If one or more groups are present in the pattern, return a
    list of groups; this will be a list of tuples if the pattern
    has more than one group.
    
    Empty matches are included in the result.

None



In [103]:

    
print re.findall('today',my_sentence,re.I)









    



['Today', 'Today', 'Today']



In [104]:

    
print re.findall('^today',my_sentence,re.I)









    



['Today']



In [105]:

    
my_sentence









    Out[105]:





'Today is cloudy.\nToday i want to eat haleem.\nToday i may rain.'



In [106]:

    
print re.findall('^today',my_sentence,(re.I|re.M))









    



['Today', 'Today', 'Today']



In [ ]: