notebook.community

Edit and run



In [ ]:

    
# What are regular expressions, and why do we need them?
# Regular expressions are available in most programming languages.
# They are also known as: regular expression / regex / pattern matching.



In [1]:

    
# Plain string comparison: only the exact, lower-case text 'yes' is accepted.
answer = raw_input("do you want to come to the movie:")
if answer == 'yes':
    print "you are welcome to the movie"
else:
    print "better luck next time"









    



do you want to come to the movie:yes
you are welcome to the movie



In [2]:

    
# Same comparison as before; the prompt now tells the user which answers are expected.
answer = raw_input("do you want to come to the movie - yes/no:")
if answer == 'yes':
    print "you are welcome to the movie"
else:
    print "better luck next time"









    



do you want to come to the movie - yes/no:no
better luck next time



In [3]:

    
answer = raw_input("do you want to come to the movie:")
# 'yes' has 3 letters and each one can be upper or lower case, so there are
# 2 ** 3 == 8 possible spellings.  == accepts only the all-lower-case one,
# so an answer such as 'Yes' falls through to the else branch.
if answer == 'yes':    # 2 ** 3 == 8 upper/lower-case spellings; only this one is accepted
    print "you are welcome to the movie"
else:
    print "better luck next time"









    



do you want to come to the movie:Yes
better luck next time



In [4]:

    
import re



In [9]:

    
my_string = "python"



In [5]:

    
print dir(re)









    



['DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '__version__', '_alphanum', '_cache', '_cache_repl', '_compile', '_compile_repl', '_expand', '_locale', '_pattern_type', '_pickle', '_subx', 'compile', 'copy_reg', 'error', 'escape', 'findall', 'finditer', 'match', 'purge', 'search', 'split', 'sre_compile', 'sre_parse', 'sub', 'subn', 'sys', 'template']



In [6]:

    
# match



In [7]:

    
# help() prints the documentation itself and returns None; wrapping it in
# print is why an extra "None" appears after the help text.
print help(re.match)









    



Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found.

None



In [ ]:

    
# In the simplest case a pattern is a literal piece (substring) of your string.
# string - python
# a few patterns taken from the start of that string - e.g.: p, py, pyt, pyth



In [10]:

    
print re.match('p',my_string) # 'p' is found at the start of the string, so a match object is returned









    



<_sre.SRE_Match object at 0x7ff0ffff8510>



In [11]:

    
print re.match('yth',my_string) # fails: re.match only tries the pattern at the start of the string









    



None



In [13]:

    
print re.match('pyth',my_string)
print type(re.match('pyth',my_string))









    



<_sre.SRE_Match object at 0x7ff0ffff85e0>
<type '_sre.SRE_Match'>



In [14]:

    
print re.match('Pyth',my_string)  # fails: matching is case-sensitive by default and the pattern starts with an upper-case 'P'









    



None



In [15]:

    
# flags - optional modifiers that change how a pattern is matched
# re.I / re.IGNORECASE - ignore upper/lower case while matching

print re.match('Pyth',my_string,re.I)









    



<_sre.SRE_Match object at 0x7ff0ffff8718>



In [16]:

    
# Applying the regular expression
import re

answer = raw_input("do you want to come to the movie:")
# re.match(pattern, string, flags): the fixed pattern goes first and the text
# typed by the user is the string being tested.  With the two swapped, the
# user's input was compiled as the pattern, so an empty answer, 'y' or '.*'
# all counted as "yes".
#   re.I     -> accepts every upper/lower-case spelling (yes, Yes, YeS, ...)
#   $        -> rejects longer words that merely start with yes ('yesterday')
#   strip()  -> ignores accidental spaces around the answer
if re.match('yes$', answer.strip(), re.I):
    print("you are welcome to the movie")
else:
    print("better luck next time")









    



do you want to come to the movie:YeS
you are welcome to the movie



In [17]:

    
# search



In [18]:

    
print help(re.search)









    



Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.

None



In [19]:

    
my_string="python"



In [20]:

    
print re.match("py",my_string)









    



<_sre.SRE_Match object at 0x7ff0ffff89f0>



In [21]:

    
print re.match("pyt",my_string)









    



<_sre.SRE_Match object at 0x7ff0ffff8ac0>



In [22]:

    
print re.match('yt',my_string)









    



None



In [23]:

    
print re.search('yt',my_string)









    



<_sre.SRE_Match object at 0x7ff0ffff8b90>



In [25]:

    
my_sentence1 = "python is my first language"
my_sentence2 = "one of my first lanaguage is python"



In [26]:

    
print re.match("python",my_sentence1)  # True
print re.match("python",my_sentence2)  # False
print re.search("python",my_sentence1) # True
print re.search("python",my_sentence2) # True









    



<_sre.SRE_Match object at 0x7ff0ffff8cc8>
None
<_sre.SRE_Match object at 0x7ff0ffff8cc8>
<_sre.SRE_Match object at 0x7ff0ffff8cc8>



In [ ]:

    
# Special characters



In [27]:

    
# ^ (caret)  -> matches at the beginning of the string.
# $ (dollar) -> matches at the end of the string.
# . (dot)    -> matches any single character (except a newline).



In [28]:

    
my_sentence1 = "python is my first language"
my_sentence2 = "one of my first lanaguage is python"



In [30]:

    
print re.search("python",my_sentence1) # true
print re.search("python",my_sentence2) # true









    



<_sre.SRE_Match object at 0x7ff0ffff8e68>
<_sre.SRE_Match object at 0x7ff0ffff8e68>



In [29]:

    
# ^
print re.search("^python",my_sentence1) # True
print re.search("^python",my_sentence2) # False









    



<_sre.SRE_Match object at 0x7ff0ffff8e00>
None



In [31]:

    
# $
print re.search("python$",my_sentence1) # False
print re.search("python$",my_sentence2) # True









    



None
<_sre.SRE_Match object at 0x7ff0ffff8ed0>



In [34]:

    
# .
my_string = "python"
my_sentence1 = "python is my first language"
print re.match('.....',my_string)
print re.match('...',my_sentence1)









    



<_sre.SRE_Match object at 0x7ff0fff81100>
<_sre.SRE_Match object at 0x7ff0fff81100>



In [37]:

    
# group : returns the text that the pattern matched. Call it only when the
#         match succeeded - a failed match is None, which has no group().
print re.match('.....',my_string).group()
print re.match('...',my_sentence1).group()
print re.search('f....',my_sentence1)
print re.search('f....',my_sentence1).group()









    



pytho
pyt
<_sre.SRE_Match object at 0x7ff0fff81370>
first



In [39]:

    
# find out the names of friends with exactly five characters
# First attempt: '.....' is satisfied by any five consecutive characters, so
# names longer than five also match and only their first five characters are printed.
import re
my_students = ['aditya','kiran','arbaaz','harshita','tarun','kumar']

for value in my_students:
    if re.search('.....',value): # truthy for every name with five or more characters
        print re.search('.....',value).group()









    



adity
kiran
arbaa
harsh
tarun
kumar



In [40]:

    
# slight modification: anchor the five dots so the whole name must be exactly five characters
import re
my_students = ['aditya','kiran','arbaaz','harshita','tarun','kumar']

# ^ and $ pin the pattern to both ends of the name; compile it once and reuse it.
exactly_five = re.compile('^.....$')
for student in my_students:
    found = exactly_five.search(student)   # search once and keep the match object
    if found is not None:
        print(found.group())









    



kiran
tarun
kumar



In [ ]:

    
# Quantifiers (repetition characters)
# * -> the preceding character repeated zero or more times.
# + -> the preceding character repeated one or more times.
# ? -> the preceding character repeated zero or one time.



In [41]:

    
my_film1 = "ashique"
my_film2 = "aashique"
my_film3 = "aaashique"
my_film4 = "shique"



In [45]:

    
# *
# a* -> a repeated zero or more times.
print re.match('a*shique',my_film1)  
print re.match('a*shique',my_film2)  
print re.match('a*shique',my_film3)  
print re.match('a*shique',my_film4)









    



<_sre.SRE_Match object at 0x7ff0fff81718>
<_sre.SRE_Match object at 0x7ff0fff81718>
<_sre.SRE_Match object at 0x7ff0fff81718>
<_sre.SRE_Match object at 0x7ff0fff81718>



In [46]:

    
# +
# a+ -> a repeated one or more times.
print re.match('a+shique',my_film1)  
print re.match('a+shique',my_film2)  
print re.match('a+shique',my_film3)  
print re.match('a+shique',my_film4)









    



<_sre.SRE_Match object at 0x7ff0fff81780>
<_sre.SRE_Match object at 0x7ff0fff81780>
<_sre.SRE_Match object at 0x7ff0fff81780>
None



In [47]:

    
my_film1 = "ashique"
my_film2 = "aashique"
my_film3 = "aaashique"
my_film4 = "shique"

# ?
# a? -> a repeated zero or one time.
# ashique or shique
print re.match('a?shique',my_film1)  # TRUE
print re.match('a?shique',my_film2)  # FALSE
print re.match('a?shique',my_film3)  # FALSE
print re.match('a?shique',my_film4)  # TRUE









    



<_sre.SRE_Match object at 0x7ff0fff81850>
None
None
<_sre.SRE_Match object at 0x7ff0fff81850>



In [48]:

    
# # ashique or shique
print re.search('a?shique',my_film1)  # TRUE  
print re.search('a?shique',my_film2)  # TRUE
print re.search('a?shique',my_film3)  # TRUE
print re.search('a?shique',my_film4)  # TRUE









    



<_sre.SRE_Match object at 0x7ff0fff81920>
<_sre.SRE_Match object at 0x7ff0fff81920>
<_sre.SRE_Match object at 0x7ff0fff81920>
<_sre.SRE_Match object at 0x7ff0fff81920>



In [50]:

    
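# Quantifiers (*, +, ?) are greedy by default: they match as much as possible (maximal matching).
# *?, +?, ?? are the non-greedy forms: they match as little as possible (minimal matching).
# The examples below show the greedy behaviour on tag-like strings.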

my_string1 = "<H1>hello<H1>"
my_string2 = "<HILL>hello1<HILL>"
my_string3 = "<HELL>hello2<HELL>"

print re.search('<.*>',my_string1).group()
print re.search('<.*>',my_string2).group()
print re.search('<.*>',my_string3).group()

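# Example string: "<H1>hello<H1>"
# Two substrings starting at the first '<' have the shape <...>:
#   <H1>            - the shortest one
#   <H1>hello<H1>   - the longest one
# <.*>  (greedy)     returns <H1>hello<H1> - maximal matching
# <.*?> (non-greedy) returns <H1>          - minimal matching
# The non-greedy form is run in the next cell.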









    



<H1>hello<H1>
<HILL>hello1<HILL>
<HELL>hello2<HELL>



In [51]:

    
print re.search('<.*?>',my_string1).group()
print re.search('<.*?>',my_string2).group()
print re.search('<.*?>',my_string3).group()









    



<H1>
<HILL>
<HELL>



In [52]:

    
# Counted repetition (these are quantifiers, not anchors)
# {m} -> the preceding character repeated exactly m times.
# {m,n} -> the preceding character repeated between m and n times (inclusive).
# {m,} -> the preceding character repeated m or more times.

my_film1 = "ashique"
my_film2 = "aashique"
my_film3 = "aaashique"
my_film4 = "shique"

print re.match('a{2}shique',my_film1)
print re.match('a{2}shique',my_film2)
print re.match('a{2}shique',my_film3)
print re.match('a{2}shique',my_film4)









    



None
<_sre.SRE_Match object at 0x7ff0fff81ac0>
None
None



In [53]:

    
print re.match('a{0,2}shique',my_film1)
print re.match('a{0,2}shique',my_film2)
print re.match('a{0,2}shique',my_film3)
print re.match('a{0,2}shique',my_film4)









    



<_sre.SRE_Match object at 0x7ff0fff81bf8>
<_sre.SRE_Match object at 0x7ff0fff81bf8>
None
<_sre.SRE_Match object at 0x7ff0fff81bf8>



In [54]:

    
# compile
print help(re.compile)









    



Help on function compile in module re:

compile(pattern, flags=0)
    Compile a regular expression pattern, returning a pattern object.

None



In [58]:

    
# compile() turns a pattern into a reusable pattern object, so the same
# expression does not have to be passed to every re.match() call.
reg = re.compile('a{1,}shique',re.I)    # one or more 'a', case-insensitive
# Compiled form of the 'a{0,2}shique' pattern used with re.match() in this notebook.
# It was written as a{0,3}, which also accepts "aaashique" and so contradicts the
# recorded result of reg1.match(my_film3), which is None.
reg1 = re.compile('a{0,2}shique',re.I)  # zero, one or two 'a', case-insensitive



In [56]:

    
print help(reg.match)









    



Help on built-in function match:

match(...)
    match(string[, pos[, endpos]]) --> match object or None.
    Matches zero or more characters at the beginning of the string

None



In [57]:

    
print reg.match(my_film1)
print reg.match(my_film2)
print reg.match(my_film3)
print reg.match(my_film4)









    



<_sre.SRE_Match object at 0x7ff0fff81f38>
<_sre.SRE_Match object at 0x7ff0fff81f38>
<_sre.SRE_Match object at 0x7ff0fff81f38>
None



In [59]:

    
print re.match('a{0,2}shique',my_film1)
print re.match('a{0,2}shique',my_film2)
print re.match('a{0,2}shique',my_film3)
print re.match('a{0,2}shique',my_film4)









    



<_sre.SRE_Match object at 0x7ff0fff81c60>
<_sre.SRE_Match object at 0x7ff0fff81c60>
None
<_sre.SRE_Match object at 0x7ff0fff81c60>



In [61]:

    
print reg1.match(my_film1)
print reg1.match(my_film2)
print reg1.match(my_film3)
print reg1.match(my_film4)









    



<_sre.SRE_Match object at 0x7ff0fff81d98>
<_sre.SRE_Match object at 0x7ff0fff81d98>
None
<_sre.SRE_Match object at 0x7ff0fff81d98>



In [ ]:

    
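# Character sets
# [a-z] - any one character from a to z.
# ^[a-z] - one character from a to z at the start of the string (^ outside the brackets is the anchor).
# [^a-z] - any one character that is NOT in a to z (^ as the first character inside the brackets negates the set).
# [0-9] - any one digit.
# [*+?.] - inside brackets the special characters are treated as literal symbols.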



In [62]:

    
my_sentence1 = "python is a good language"



In [64]:

    
print re.match('[a-z]',my_sentence1)
print re.match('[a-z]',my_sentence1).group()









    



<_sre.SRE_Match object at 0x7ff0fff96100>
p



In [65]:

    
print re.match('[a-z]+',my_sentence1)
print re.match('[a-z]+',my_sentence1).group()









    



<_sre.SRE_Match object at 0x7ff0fff961d0>
python



In [66]:

    
# \s  - one whitespace character (space, tab, newline, ...)
# \s+ - a run of one or more whitespace characters
# The pattern is a raw string (r'...') so the backslash reaches the regex
# engine untouched instead of being treated as a string escape.
print(re.match(r'[a-z]+\s+',my_sentence1).group())









    



python



In [67]:

    
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+',my_sentence1).group()









    



python is a good language



In [68]:

    
print re.match('([a-z]+\s+){4}[a-z]+',my_sentence1).group()









    



python is a good language



In [69]:

    
print re.match('(\w+\s+){4}[a-z]+',my_sentence1).group()









    



python is a good language



In [112]:

    
# grouping
# Patterns are raw strings so the \s escapes reach the regex engine intact,
# and each pattern is matched once and its match object reused.
five_words = re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+',my_sentence1)
print(five_words.group())     # no argument: the whole match
print(five_words.group(0))    # group 0 is the whole match as well
# index based grouping: every (...) is numbered from 1, left to right
first_last = re.match(r'([a-z]+)\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+([a-z]+)',my_sentence1)
print(first_last.group(1))
print(first_last.group(2))
print(first_last.groups())    # all numbered groups as one tuple
# name based grouping: (?P<name>...) lets a group be fetched by its name
named = re.match(r'(?P<why>[a-z]+)\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+(?P<what>[a-z]+)',my_sentence1)
print(named.group('why'))









    



python is a good language
python is a good language
python
language
('python', 'language')
python



In [87]:

    
# VERBOSE
print re.match('''
(\w+\s+)     # a word and a space
{4}          # the word and space repeated 4 times.
[a-z]+       # the word repeated again without space
''',my_sentence1,re.X).group()









    



python is a good language



In [92]:

    
# search,match,findall
my_string = "python is a great language.python is good for starters.python is my first language."
print re.match("python",my_string).group()    # match: tries the pattern only at the start of the string.
print re.search("python",my_string).group()   # search: scans through the whole string and stops at the first match.
print re.findall("python",my_string)          # findall: returns a list with every match in the string.









    



python
python
['python', 'python', 'python']



In [97]:

    
# Multiline flag - re.M
my_string1 = "python is a great language.\npython is good for starters.\npython is my first language."
print my_string1
print re.findall("python",my_string1)        # every occurrence, wherever it is
print re.findall("^python",my_string1)       # without re.M, ^ matches only at the start of the whole string
print re.findall('^python',my_string1,re.M)  # with re.M, ^ also matches at the start of every line









    



python is a great language.
python is good for starters.
python is my first language.
['python', 'python', 'python']
['python']
['python', 'python', 'python']



In [84]:

    
# example1: pull the MAC address out of an ifconfig line
my_string = "enx020756033063 Link encap:Ethernet  HWaddr 02:07:56:03:30:63"
# A MAC address is six two-digit hexadecimal groups, so every group must accept
# a-f as well as 0-9 (e.g. 34:17:eb:84:7d:29), and the last group is exactly
# two digits rather than "one or more".
hex_pair = '[0-9a-fA-F]{2}'
mac_pattern = '(' + hex_pair + ':){5}' + hex_pair
print(re.search(hex_pair + ':',my_string).group())   # one group and its colon
print(re.search(mac_pattern,my_string).group())      # the complete address
# the same pattern written with \d and the re.I flag (raw string keeps the backslash intact)
print(re.search(r'([\da-f]{2}:){5}[\da-f]{2}',my_string,re.I).group())









    



02:
02:07:56:03:30:63
02:07:56:03:30:63



In [101]:

    
# example 2
email = """
my valid email address is tuxfux.hlp@gmail.com
my valid email address is tuxfux.hlp@yahoo.co.in
my valid email address is tuxfux.hlp
my valid email address is tuxfux.hlp@google.com
my valid email address is @gmail.com
my valid email address is tuxfux123.hlp@yahoo.co.in
"""

# [0-9a-zA-Z.]+  one or more letters, digits or dots before the @
# @              the literal at-sign
# [a-zA-Z.]+     one or more letters or dots after the @
# Text with nothing before the @, or with no @ at all, is therefore not returned.
reg = re.compile("[0-9a-zA-Z.]+@[a-zA-Z.]+")
# findall returns every non-overlapping match as a list; as the last expression of the cell it is displayed.
reg.findall(email)









    Out[101]:





['tuxfux.hlp@gmail.com',
 'tuxfux.hlp@yahoo.co.in',
 'tuxfux.hlp@google.com',
 'tuxfux123.hlp@yahoo.co.in']



In [106]:

    
# example 3
ip_address = '''
enp2s0    Link encap:Ethernet  HWaddr 34:17:eb:84:7d:29  
          UP BROADCAST MULTICAST  MTU:1500  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

enx020756033063 Link encap:Ethernet  HWaddr 02:07:56:03:30:63  
          inet addr:192.168.42.151  Bcast:192.168.42.255  Mask:255.255.255.0
          inet6 addr: fe80::7:56ff:fe03:3063/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:319842 errors:5 dropped:0 overruns:0 frame:5
          TX packets:381352 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:23969227 (23.9 MB)  TX bytes:374266709 (374.2 MB)

lo        Link encap:Local Loopback  
          inet addr:127.0.0.1  Mask:255.0.0.0
          inet6 addr: ::1/128 Scope:Host
          UP LOOPBACK RUNNING  MTU:65536  Metric:1
          RX packets:4859 errors:0 dropped:0 overruns:0 frame:0
          TX packets:4859 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:4896839 (4.8 MB)  TX bytes:4896839 (4.8 MB)

wlp3s0    Link encap:Ethernet  HWaddr d0:7e:35:5d:58:18  
          UP BROADCAST MULTICAST  MTU:1500  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
'''

# '.' does not match a newline here, so '.*bcast' cannot extend past the one line that contains "Bcast";
# re.I lets the lower-case 'bcast' in the pattern match it.
print re.search('.*bcast',ip_address,re.I).group()
# Same line, additionally requiring a ':' somewhere before the whitespace that precedes "Bcast".
print re.search('.*:.*\s+bcast',ip_address,re.I).group()









    



          inet addr:192.168.42.151  Bcast
          inet addr:192.168.42.151  Bcast



In [ ]:

    
# REFERENCES
# https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html