In [ ]:
# What are regular expressions, and why do we need them?
# Regular expressions (the `re` module in Python) are available in most programming languages.
# Other names for the same idea: regular expression / regex / pattern matching.
In [1]:
# First attempt: a plain string comparison, so only the exact lowercase
# text 'yes' is accepted.
answer = raw_input("do you want to come to the movie:")
print("you are welcome to the movie" if answer == 'yes' else "better luck next time")
In [2]:
# Same comparison as before; the prompt now spells out the expected replies.
answer = raw_input("do you want to come to the movie - yes/no:")
if answer != 'yes':
    print("better luck next time")
else:
    print("you are welcome to the movie")
In [3]:
# An exact comparison misses every other capitalisation of 'yes': three
# letters, each upper or lower case, give 2 ** 3 == 8 spellings.
answer = raw_input("do you want to come to the movie:")
outcome = {True: "you are welcome to the movie", False: "better luck next time"}
print(outcome[answer == 'yes'])
In [4]:
import re
In [9]:
# Sample text used by the match() examples that follow.
my_string = 'python'
In [5]:
# Show the names available on the re module.
print(dir(re))
In [6]:
# match
In [7]:
# help() prints the documentation itself and returns None, so it is called
# directly; wrapping it in print would add a stray "None" line to the output.
help(re.match)
In [ ]:
# A pattern is basically a subset (substring) of your string.
# string - python
# a few subsets usable as patterns - ex: p, py, pyt, pyth
In [10]:
# 'p' is the first character of my_string, so match() returns a match object.
print(re.match('p', my_string))
In [11]:
# match() only tries at the very start of the string, so 'yth' fails even
# though it occurs inside my_string; the result is None.
print(re.match('yth', my_string))
In [13]:
# A successful match() returns a match object; show it and its type.
pyth_match = re.match('pyth', my_string)
print(pyth_match)
print(type(pyth_match))
In [14]:
# Matching is case-sensitive by default, so the upper-case 'P' makes this fail.
print(re.match('Pyth', my_string))
In [15]:
# Flags are optional switches passed as the third argument of match().
# re.I (long name re.IGNORECASE) makes the comparison case-insensitive,
# so 'Pyth' is now accepted against my_string.
print(re.match('Pyth', my_string, re.IGNORECASE))
In [16]:
# Applying the regular expression
import re


def is_yes(answer):
    """Return True when `answer` is the word 'yes' in any letter case.

    The literal 'yes' is the pattern and the user's text is the string being
    tested.  With the two arguments the other way round, the user's text
    would be compiled as a regular expression, so an empty reply or '.*'
    would be accepted as a yes.
    The trailing '$' rejects longer replies such as 'yesterday'.
    """
    return re.match(r'yes$', answer, re.I) is not None


# The guard keeps the prompt from firing if this code is imported instead of
# being run as a notebook cell or script.
if __name__ == "__main__":
    answer = raw_input("do you want to come to the movie:")
    # if answer == 'yes' would cover only 1 of the 2 ** 3 == 8 capitalisations
    if is_yes(answer):
        print("you are welcome to the movie")
    else:
        print("better luck next time")
In [17]:
# search
In [18]:
# help() prints the documentation itself and returns None, so it is called
# directly; wrapping it in print would add a stray "None" line to the output.
help(re.search)
In [19]:
# Sample text for comparing match() with search().
my_string = 'python'
In [20]:
# "py" is a prefix of my_string, so match() succeeds.
print(re.match("py", my_string))
In [21]:
# A longer prefix, "pyt", still matches from the start.
print(re.match("pyt", my_string))
In [22]:
# 'yt' is inside my_string but not at its start, so match() gives None.
print(re.match('yt', my_string))
In [23]:
# search() scans the whole string, so the same 'yt' is found here.
print(re.search('yt', my_string))
In [25]:
# Two sample sentences: "python" opens the first and closes the second.
my_sentence1, my_sentence2 = (
    "python is my first language",
    "one of my first lanaguage is python",
)
In [26]:
# match() succeeds only where "python" starts the sentence (my_sentence1);
# search() finds it in both sentences.
# Output order: match/1, match/2, search/1, search/2.
for finder in (re.match, re.search):
    for sentence in (my_sentence1, my_sentence2):
        print(finder("python", sentence))
In [ ]:
# Special characters
In [27]:
# ^ (caret)  -> matches at the beginning of the string (and of each line when re.M is set).
# $ (dollar) -> matches at the end of the string.
# . (dot)    -> matches any one character except a newline.
In [28]:
# Sample sentences for the anchor examples: "python" is the first word of
# my_sentence1 and the last word of my_sentence2.
my_sentence2 = "one of my first lanaguage is python"
my_sentence1 = "python is my first language"
In [30]:
# Without anchors search() finds "python" in both sentences.
for sentence in (my_sentence1, my_sentence2):
    print(re.search("python", sentence))
In [29]:
# ^
# Anchored at the start: only my_sentence1 begins with "python", so the
# second result is None.
for sentence in (my_sentence1, my_sentence2):
    print(re.search("^python", sentence))
In [31]:
# $
# Anchored at the end: only my_sentence2 finishes with "python", so the
# first result is None.
for sentence in (my_sentence1, my_sentence2):
    print(re.search("python$", sentence))
In [34]:
# .
# Each dot consumes exactly one character, so '.....' needs five characters
# and '...' needs three at the start of the text.
my_string = "python"
my_sentence1 = "python is my first language"
for dots, text in (('.....', my_string), ('...', my_sentence1)):
    print(re.match(dots, text))
In [37]:
# group() returns the text that was matched. It exists only on a match
# object, so calling it on a failed match (None) raises AttributeError.
print(re.match('.....', my_string).group())
print(re.match('...', my_sentence1).group())
first_f_word = re.search('f....', my_sentence1)
print(first_f_word)
print(first_f_word.group())
In [39]:
# find out the names of friends with exactly five characters
# First try: an unanchored '.....' is satisfied by ANY name of five or more
# characters, and group() returns just its first five characters.
import re
my_students = ['aditya','kiran','arbaaz','harshita','tarun','kumar']
for value in my_students:
    five_chars = re.search('.....', value)
    if five_chars:
        print(five_chars.group())
In [40]:
# slight modification
# Anchoring with ^ and $ leaves room for exactly five characters, so only
# the five-letter names are printed.
import re
my_students = ['aditya','kiran','arbaaz','harshita','tarun','kumar']
for value in my_students:
    exactly_five = re.search('^.....$', value)
    if exactly_five is None:
        continue
    print(exactly_five.group())
In [ ]:
# quantifiers (loosely called "globbing" characters here)
# * -> the preceding character repeated zero or more times.
# + -> the preceding character repeated one or more times.
# ? -> the preceding character repeated zero or one time.
In [41]:
# Four spellings that differ only in the number of leading 'a' characters:
# one, two, three and none.
my_film1, my_film2, my_film3, my_film4 = "ashique", "aashique", "aaashique", "shique"
In [45]:
# *
# a* -> 'a' repeated zero or more times, so all four spellings match.
for film in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a*shique', film))
In [46]:
# +
# a+ -> 'a' repeated one or more times, so only my_film4 (no leading 'a')
# fails to match.
for film in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a+shique', film))
In [47]:
my_film1, my_film2, my_film3, my_film4 = "ashique", "aashique", "aaashique", "shique"
# ?
# a? -> 'a' repeated zero or one time, i.e. "ashique" or "shique".
# match() is anchored at the start, so a second leading 'a' breaks it:
#   my_film1 -> TRUE, my_film2 -> FALSE, my_film3 -> FALSE, my_film4 -> TRUE
for film in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a?shique', film))
In [48]:
# ashique or shique
# search() may start anywhere, so it skips the extra leading 'a's and finds
# the pattern in all four spellings (TRUE for each).
for film in (my_film1, my_film2, my_film3, my_film4):
    print(re.search('a?shique', film))
In [50]:
# Quantifiers are greedy by default: they take the longest text that still
# lets the whole pattern match (maximal matching).
# Adding '?' after one (*?, +?, ??) makes it lazy (minimal matching).
my_string1 = "<H1>hello<H1>"
my_string2 = "<HILL>hello1<HILL>"
my_string3 = "<HELL>hello2<HELL>"
for tagged in (my_string1, my_string2, my_string3):
    # Greedy '.*' runs up to the LAST '>', so the whole string is returned.
    print(re.search('<.*>', tagged).group())
# Worked example on "<H1>hello<H1>":
#   <.*>   could stop at "<H1>" or at "<H1>hello<H1>"; greedy picks the
#          longer one - maximal matching.
#   <.*?>  stops at the first '>' and returns "<H1>" - minimal matching.
In [51]:
# Lazy '.*?' stops at the first '>', so only the opening tag is returned.
for tagged in (my_string1, my_string2, my_string3):
    print(re.search('<.*?>', tagged).group())
In [52]:
# Counted repetition (these are quantifiers, not anchors)
# {m}   -> the preceding character repeated exactly m times.
# {m,n} -> the preceding character repeated between m and n times.
# {m,}  -> the preceding character repeated m or more times.
my_film1, my_film2, my_film3, my_film4 = "ashique", "aashique", "aaashique", "shique"
# a{2} at the start of the string: only my_film2 ("aashique") matches.
for film in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a{2}shique', film))
In [53]:
# a{0,2}: zero, one or two leading 'a's are accepted, so only my_film3
# (three 'a's) fails.
for film in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a{0,2}shique', film))
In [54]:
# compile
# help() prints the documentation itself and returns None, so it is called
# directly; wrapping it in print would add a stray "None" line to the output.
help(re.compile)
In [58]:
# Compiling turns a pattern and its flags into a reusable pattern object,
# so the expression does not have to be written out again for every call.
reg = re.compile('a{1,}shique', flags=re.IGNORECASE)
reg1 = re.compile('a{0,3}shique', flags=re.IGNORECASE)
In [56]:
# help() prints the documentation itself and returns None, so it is called
# directly; wrapping it in print would add a stray "None" line to the output.
help(reg.match)
In [57]:
# The compiled pattern object has its own match(); only the text is passed.
for film in (my_film1, my_film2, my_film3, my_film4):
    print(reg.match(film))
In [59]:
# Module-level match() with a{0,2}, for comparison with the compiled reg1.
for film in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a{0,2}shique', film))
In [61]:
# Same four spellings tested against the second compiled pattern, reg1.
for film in (my_film1, my_film2, my_film3, my_film4):
    print(reg1.match(film))
In [ ]:
# Character set
# [a-z]  - exactly one character in the range a to z.
# ^[a-z] - the string must start with a character from a to z.
# [^a-z] - exactly one character that is NOT in the range a to z.
# [0-9]  - exactly one digit.
# [*+?.] - inside a set, all the special characters are treated as literal symbols.
In [62]:
# Five lower-case words separated by single spaces; used by the character
# set and grouping examples.
my_sentence1 = 'python is a good language'
In [64]:
# A bare set matches exactly one character: the first letter of the sentence.
one_letter = re.match('[a-z]', my_sentence1)
print(one_letter)
print(one_letter.group())
In [65]:
# With '+' the set repeats, so the whole first word is matched (the run
# stops at the first character outside a-z, here a space).
first_word = re.match('[a-z]+', my_sentence1)
print(first_word)
print(first_word.group())
In [66]:
# \s  - one whitespace character (space, tab, newline, ...)
# \s+ - a run of one or more whitespace characters
# The pattern is a raw string (r'...') so the backslash reaches the regex
# engine untouched; '\s' in a normal literal is an invalid string escape
# that newer Python versions warn about.
print(re.match(r'[a-z]+\s+', my_sentence1).group())
In [67]:
# Five words spelled out one by one: word, whitespace, word, ... , word.
# Raw string so that \s is handed to the regex engine unchanged.
print(re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+', my_sentence1).group())
In [68]:
# The same five-word pattern written compactly: "word + whitespace"
# repeated four times, followed by a final word.
# Raw string so that \s is handed to the regex engine unchanged.
print(re.match(r'([a-z]+\s+){4}[a-z]+', my_sentence1).group())
In [69]:
# \w matches a "word" character (letters, digits, underscore), so it can
# stand in for [a-z] here.
# Raw string so that \w and \s are handed to the regex engine unchanged.
print(re.match(r'(\w+\s+){4}[a-z]+', my_sentence1).group())
In [113]:
# grouping
# All patterns are raw strings so that \s is handed to the regex engine
# unchanged, and each match is made once and then reused.
five_words = re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+', my_sentence1)
print(five_words.group())    # the whole match
print(five_words.group(0))   # group 0 is the whole match as well
# index based grouping: each (...) is numbered from 1, left to right
first_and_last = re.match(r'([a-z]+)\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+([a-z]+)', my_sentence1)
print(first_and_last.group(1))
print(first_and_last.group(2))
print(first_and_last.groups())   # tuple of all numbered groups
# a dictionary based grouping: (?P<name>...) lets a group be fetched by name
named = re.match(r'(?P<why>[a-z]+)\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+(?P<what>[a-z]+)', my_sentence1)
print(named.group('why'))
print(named.group('what'))
In [87]:
# VERBOSE
# With re.X (re.VERBOSE) whitespace in the pattern is ignored and '#' starts
# a comment, so a pattern can be laid out over several annotated lines.
# The pattern is a raw string so that \w and \s reach the regex engine unchanged.
verbose_pattern = r'''
(\w+\s+) # a word and a space
{4} # the word and space repeated 4 times.
[a-z]+ # the word repeated again without space
'''
print(re.match(verbose_pattern, my_sentence1, re.X).group())
In [92]:
# search,match,findall
my_string = "python is a great language.python is good for starters.python is my first language."
# match()  : tries only at the start of the string and stops at that match.
# search() : scans the string and stops at the first place that matches.
for finder in (re.match, re.search):
    print(finder("python", my_string).group())
# findall(): returns a list with every non-overlapping match.
print(re.findall("python", my_string))
In [97]:
# Multiline flag - re.M (long name re.MULTILINE)
my_string1 = "python is a great language.\npython is good for starters.\npython is my first language."
print(my_string1)
# No anchor: every "python" in the text is found.
print(re.findall("python", my_string1))
# '^' without the flag anchors only at the very start of the whole text.
print(re.findall("^python", my_string1))
# With re.MULTILINE '^' also anchors just after each newline.
print(re.findall('^python', my_string1, re.MULTILINE))
In [84]:
# example1 - pulling the hardware (MAC) address out of one line of text
my_string = "enx020756033063 Link encap:Ethernet HWaddr 02:07:56:03:30:63"
# Patterns are raw strings so that \d reaches the regex engine unchanged.
# Step 1: one "two digits and a colon" chunk.
print(re.search(r'[0-9]{2}:', my_string).group())
# Step 2: five such chunks followed by the closing digits.
print(re.search(r'([0-9]{2}:){5}[0-9]+', my_string).group())
# Step 3: the same pattern using \d as shorthand for a digit.
# NOTE: these patterns accept decimal digits only; an address containing
# hex letters (a-f) would need [0-9a-fA-F] in place of \d.
mac_address = re.search(r'(\d{2}:){5}\d+', my_string).group()
print(mac_address)
In [101]:
# example 2 - collecting e-mail addresses from free text
# The text starts and ends with a newline, exactly like a triple-quoted block.
email = "\n".join([
    "",
    "my valid email address is tuxfux.hlp@gmail.com",
    "my valid email address is tuxfux.hlp@yahoo.co.in",
    "my valid email address is tuxfux.hlp",
    "my valid email address is tuxfux.hlp@google.com",
    "my valid email address is @gmail.com",
    "my valid email address is tuxfux123.hlp@yahoo.co.in",
    "",
])
# One or more letters/digits/dots, an '@', then one or more letters/dots.
# Entries lacking either side of the '@' are therefore skipped.
reg = re.compile(r"[0-9a-zA-Z.]+@[a-zA-Z.]+")
reg.findall(email)
Out[101]:
In [115]:
# example 3 - extracting the IPv4 address from interface-listing text
# (despite its name, ip_address holds the whole listing, not one address)
ip_address = '''
enp2s0 Link encap:Ethernet HWaddr 34:17:eb:84:7d:29
UP BROADCAST MULTICAST MTU:1500 Metric:1
RX packets:0 errors:0 dropped:0 overruns:0 frame:0
TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
enx020756033063 Link encap:Ethernet HWaddr 02:07:56:03:30:63
inet addr:192.168.42.151 Bcast:192.168.42.255 Mask:255.255.255.0
inet6 addr: fe80::7:56ff:fe03:3063/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:319842 errors:5 dropped:0 overruns:0 frame:5
TX packets:381352 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:23969227 (23.9 MB) TX bytes:374266709 (374.2 MB)
lo Link encap:Local Loopback
inet addr:127.0.0.1 Mask:255.0.0.0
inet6 addr: ::1/128 Scope:Host
UP LOOPBACK RUNNING MTU:65536 Metric:1
RX packets:4859 errors:0 dropped:0 overruns:0 frame:0
TX packets:4859 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:0
RX bytes:4896839 (4.8 MB) TX bytes:4896839 (4.8 MB)
wlp3s0 Link encap:Ethernet HWaddr d0:7e:35:5d:58:18
UP BROADCAST MULTICAST MTU:1500 Metric:1
RX packets:0 errors:0 dropped:0 overruns:0 frame:0
TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
'''
# Step 1: '.' never crosses a newline, so this returns the single line that
# mentions "Bcast", from its start up to and including that word.
print(re.search('.*bcast', ip_address, re.I).group())
# Step 2: capture the address that sits between "inet addr:" and "Bcast".
# (\S+) takes non-blank characters only, so the captured value carries no
# trailing blanks however many spaces precede "Bcast"; a greedy (.*) would
# keep all but one of them.  Raw string so that \S and \s reach the regex
# engine unchanged.
my_ip = re.search(r'inet addr:(\S+)\s+bcast', ip_address, re.I).group(1)
print(my_ip)
In [ ]:
# REFERENCES
# https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html