In [ ]:
# What are regular expressions, and why do we need them?
# Regular expressions are available in most programming languages.
# They are also called regex or pattern matching.

In [1]:
# Ask once and compare the reply with the exact, case-sensitive literal 'yes'.
answer = raw_input("do you want to come to the movie:")
message = "you are welcome to the movie" if answer == 'yes' else "better luck next time"
print(message)


do you want to come to the movie:yes
you are welcome to the movie

In [2]:
# Same exact comparison, but the prompt now tells the user which replies are expected.
answer = raw_input("do you want to come to the movie - yes/no:")
replies = {True: "you are welcome to the movie", False: "better luck next time"}
print(replies[answer == 'yes'])


do you want to come to the movie - yes/no:no
better luck next time

In [3]:
# Limitation of ==: 'yes' has 3 letters and each may be upper or lower case, so
# there are 2 ** 3 == 8 spellings (yes, Yes, yEs, ...) and only 'yes' itself passes.
answer = raw_input("do you want to come to the movie:")
if answer != 'yes':
    print("better luck next time")
else:
    print("you are welcome to the movie")


do you want to come to the movie:Yes
better luck next time

In [4]:
import re

In [9]:
my_string = 'python'  # sample text used by the match/search cells that follow

In [5]:
# List every name the re module exposes: functions (match, search, ...) and flags (I, M, X, ...).
print(dir(re))


['DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '__version__', '_alphanum', '_cache', '_cache_repl', '_compile', '_compile_repl', '_expand', '_locale', '_pattern_type', '_pickle', '_subx', 'compile', 'copy_reg', 'error', 'escape', 'findall', 'finditer', 'match', 'purge', 'search', 'split', 'sre_compile', 'sre_parse', 'sub', 'subn', 'sys', 'template']

In [6]:
# match

In [7]:
# help() writes the documentation itself and returns None, so wrapping it in
# print is what produces the extra "None" line after the help text.
print(help(re.match))


Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found.

None

In [ ]:
# A pattern can be as simple as a piece of the string you want to find.
# string - python
# A few patterns taken from the start of that string - ex: p, py, pyt, pyth

In [10]:
print(re.match('p', my_string))  # 'p' is the first character, so a match object comes back


<_sre.SRE_Match object at 0x7ff0ffff8510>

In [11]:
print(re.match('yth', my_string))  # None: match() only tries at the start of the string


None

In [13]:
# Keep the result in a name so it can be shown and type-checked.
prefix_match = re.match('pyth', my_string)
print(prefix_match)
print(type(prefix_match))


<_sre.SRE_Match object at 0x7ff0ffff85e0>
<type '_sre.SRE_Match'>

In [14]:
print(re.match('Pyth', my_string))  # None: matching is case-sensitive by default ('P' vs 'p')


None

In [15]:
# flags - the optional third argument of re.match; it switches on added options.
# re.I (long name: re.IGNORECASE) makes the comparison case-insensitive.

print(re.match('Pyth', my_string, flags=re.IGNORECASE))


<_sre.SRE_Match object at 0x7ff0ffff8718>

In [16]:
# Applying the regular expression
import re

answer = raw_input("do you want to come to the movie:")
# re.match takes (pattern, string, flags): the fixed pattern goes first and the
# user's reply second.  With the arguments the other way round the reply was
# compiled as the regex, so an empty reply, "y", "ye" or ".*" all counted as
# "yes", and a reply such as "(" raised re.error.
# - the trailing $ rejects longer replies such as "yesterday"
# - strip() ignores accidental spaces around the reply
# - re.I accepts every one of the 2 ** 3 == 8 upper/lower case spellings
if re.match('yes$', answer.strip(), re.I):
    print("you are welcome to the movie")
else:
    print("better luck next time")


do you want to come to the movie:YeS
you are welcome to the movie

In [17]:
# search

In [18]:
# help() prints the documentation and returns None; the print around it is the
# reason a trailing "None" line appears in the output.
print(help(re.search))


Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.

None

In [19]:
my_string = 'python'  # text used to compare match() with search()

In [20]:
print(re.match("py", my_string))  # "py" is a prefix of the string -> match object


<_sre.SRE_Match object at 0x7ff0ffff89f0>

In [21]:
print(re.match("pyt", my_string))  # a longer prefix still matches


<_sre.SRE_Match object at 0x7ff0ffff8ac0>

In [22]:
print(re.match('yt', my_string))  # None: 'yt' is inside the string but not at its start


None

In [23]:
print(re.search('yt', my_string))  # search() tries every position, so 'yt' is found


<_sre.SRE_Match object at 0x7ff0ffff8b90>

In [25]:
# Two sample sentences: the word "python" opens the first one and closes the second.
my_sentence1, my_sentence2 = (
    "python is my first language",
    "one of my first lanaguage is python",
)

In [26]:
# match() only tries at position 0; search() scans the whole string.
# Printed in order: match/sentence1 (object), match/sentence2 (None),
# search/sentence1 (object), search/sentence2 (object).
for finder in (re.match, re.search):
    for sentence in (my_sentence1, my_sentence2):
        print(finder("python", sentence))


<_sre.SRE_Match object at 0x7ff0ffff8cc8>
None
<_sre.SRE_Match object at 0x7ff0ffff8cc8>
<_sre.SRE_Match object at 0x7ff0ffff8cc8>

In [ ]:
# Special characters

In [27]:
# ^ (caret) -> anchors the pattern at the beginning of the string.
# $ (dollar) -> anchors the pattern at the end of the string.
# . (dot)  -> matches any one character (except a newline, by default).

In [28]:
# Sample sentences for the anchor examples: "python" is first in one, last in the other.
my_sentence1, my_sentence2 = (
    "python is my first language",
    "one of my first lanaguage is python",
)

In [30]:
# Without anchors "python" is found wherever it occurs, so both searches succeed.
for sentence in (my_sentence1, my_sentence2):
    print(re.search("python", sentence))


<_sre.SRE_Match object at 0x7ff0ffff8e68>
<_sre.SRE_Match object at 0x7ff0ffff8e68>

In [29]:
# ^ : the pattern must sit at the very start of the string
for sentence in (my_sentence1, my_sentence2):
    print(re.search("^python", sentence))  # 1st sentence: match object, 2nd: None


<_sre.SRE_Match object at 0x7ff0ffff8e00>
None

In [31]:
# $ : the pattern must sit at the very end of the string
for sentence in (my_sentence1, my_sentence2):
    print(re.search("python$", sentence))  # 1st sentence: None, 2nd: match object


None
<_sre.SRE_Match object at 0x7ff0ffff8ed0>

In [34]:
# . (dot): every dot in the pattern consumes exactly one character.
my_string, my_sentence1 = "python", "python is my first language"
for dots, text in (('.....', my_string), ('...', my_sentence1)):
    print(re.match(dots, text))


<_sre.SRE_Match object at 0x7ff0fff81100>
<_sre.SRE_Match object at 0x7ff0fff81100>

In [37]:
# group() returns the text that was matched.  It exists only on a match object,
# so it can be called only when the pattern actually matched (a failed match is None).
print(re.match('.....', my_string).group())
print(re.match('...', my_sentence1).group())
f_match = re.search('f....', my_sentence1)
print(f_match)
print(f_match.group())


pytho
pyt
<_sre.SRE_Match object at 0x7ff0fff81370>
first

In [39]:
# First attempt at finding the names of friends with exactly five characters.
# '.....' is not anchored, so it succeeds on every name with AT LEAST five
# characters and group() returns only the first five of them.
import re
my_students = ['aditya','kiran','arbaaz','harshita','tarun','kumar']

for value in my_students:
    found = re.search('.....', value)
    if found:
        print(found.group())


adity
kiran
arbaa
harsh
tarun
kumar

In [40]:
# slight modification: ^ and $ pin the five dots to both ends of the name,
# so only names that are exactly five characters long get through.
import re
my_students = ['aditya','kiran','arbaaz','harshita','tarun','kumar']

for value in my_students:
    found = re.search('^.....$', value)
    if found:
        print(found.group())


kiran
tarun
kumar

In [ ]:
# Quantifiers (repetition characters)
# * -> the preceding character repeated zero or more times.
# + -> the preceding character repeated one or more times.
# ? -> the preceding character repeated zero or one time.

In [41]:
# Four spellings of one title that differ only in the number of leading a's (1, 2, 3, 0).
my_film1, my_film2, my_film3, my_film4 = "ashique", "aashique", "aaashique", "shique"

In [45]:
# *
# a* -> 'a' repeated zero or more times, so all four titles match (even "shique").
for title in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a*shique', title))


<_sre.SRE_Match object at 0x7ff0fff81718>
<_sre.SRE_Match object at 0x7ff0fff81718>
<_sre.SRE_Match object at 0x7ff0fff81718>
<_sre.SRE_Match object at 0x7ff0fff81718>

In [46]:
# +
# a+ -> 'a' repeated one or more times, so only "shique" (no leading a) fails.
for title in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a+shique', title))


<_sre.SRE_Match object at 0x7ff0fff81780>
<_sre.SRE_Match object at 0x7ff0fff81780>
<_sre.SRE_Match object at 0x7ff0fff81780>
None

In [47]:
my_film1, my_film2, my_film3, my_film4 = "ashique", "aashique", "aaashique", "shique"

# ?
# a? -> 'a' repeated zero or one time.
# match() starts at position 0, so only "ashique" or "shique" can succeed:
# with two or three leading a's, the character after 'a?' is another 'a', not 's'.
for title in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a?shique', title))  # TRUE, FALSE, FALSE, TRUE


<_sre.SRE_Match object at 0x7ff0fff81850>
None
None
<_sre.SRE_Match object at 0x7ff0fff81850>

In [48]:
# search() may begin anywhere, so it skips surplus leading a's and finds
# "ashique" (or "shique") inside every one of the four titles.
for title in (my_film1, my_film2, my_film3, my_film4):
    print(re.search('a?shique', title))  # TRUE for all four


<_sre.SRE_Match object at 0x7ff0fff81920>
<_sre.SRE_Match object at 0x7ff0fff81920>
<_sre.SRE_Match object at 0x7ff0fff81920>
<_sre.SRE_Match object at 0x7ff0fff81920>

In [50]:
# Quantifiers are greedy by default: they take as much text as they can (maximal matching).
# Adding ? gives the forms *?, +?, ?? which take as little as they can (minimal matching).

my_string1, my_string2, my_string3 = "<H1>hello<H1>", "<HILL>hello1<HILL>", "<HELL>hello2<HELL>"

for tagged in (my_string1, my_string2, my_string3):
    print(re.search('<.*>', tagged).group())

# Worked example on "<H1>hello<H1>":
#   <.*>   could stop at "<H1>" or run on to "<H1>hello<H1>"; being greedy it
#          returns the longest candidate, "<H1>hello<H1>"   - maximal matching
#   <.*?>  stops at the first ">" it reaches, giving "<H1>"  - minimal matching


<H1>hello<H1>
<HILL>hello1<HILL>
<HELL>hello2<HELL>

In [51]:
# Lazy version: .*? stops at the first '>', so only the opening tag is returned.
for tagged in (my_string1, my_string2, my_string3):
    print(re.search('<.*?>', tagged).group())


<H1>
<HILL>
<HELL>

In [52]:
# Counted repetition (these are quantifiers; the anchors are ^ and $)
# {m}   -> the preceding character repeated exactly m times.
# {m,n} -> the preceding character repeated between m and n times.
# {m,}  -> the preceding character repeated m or more times.

my_film1, my_film2, my_film3, my_film4 = "ashique", "aashique", "aaashique", "shique"

# a{2} needs exactly two a's before "shique", so only "aashique" matches.
for title in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a{2}shique', title))


None
<_sre.SRE_Match object at 0x7ff0fff81ac0>
None
None

In [53]:
# a{0,2} allows zero, one or two leading a's; "aaashique" has three and fails.
for title in (my_film1, my_film2, my_film3, my_film4):
    print(re.match('a{0,2}shique', title))


<_sre.SRE_Match object at 0x7ff0fff81bf8>
<_sre.SRE_Match object at 0x7ff0fff81bf8>
None
<_sre.SRE_Match object at 0x7ff0fff81bf8>

In [54]:
# compile
# (help() returns None, which is why print shows "None" after the help text)
print(help(re.compile))


Help on function compile in module re:

compile(pattern, flags=0)
    Compile a regular expression pattern, returning a pattern object.

None

In [58]:
# re.compile turns a pattern (plus flags) into a reusable pattern object once,
# so later cells can call reg.match(...) without repeating the pattern text.
reg, reg1 = (re.compile(expr, re.I) for expr in ('a{1,}shique', 'a{0,3}shique'))

In [56]:
# A compiled pattern has its own match(); it takes the string (and optional
# positions) but no pattern argument.  help() returns None, hence the final "None".
print(help(reg.match))


Help on built-in function match:

match(...)
    match(string[, pos[, endpos]]) --> match object or None.
    Matches zero or more characters at the beginning of the string

None

In [57]:
# reg is the compiled 'a{1,}shique' (case-insensitive): one or more leading a's required.
for title in (my_film1, my_film2, my_film3, my_film4):
    print(reg.match(title))


<_sre.SRE_Match object at 0x7ff0fff81f38>
<_sre.SRE_Match object at 0x7ff0fff81f38>
<_sre.SRE_Match object at 0x7ff0fff81f38>
None

In [59]:
# Uncompiled form for comparison: the pattern text is passed on every call.
upto_two = 'a{0,2}shique'
print(re.match(upto_two, my_film1))
print(re.match(upto_two, my_film2))
print(re.match(upto_two, my_film3))
print(re.match(upto_two, my_film4))


<_sre.SRE_Match object at 0x7ff0fff81c60>
<_sre.SRE_Match object at 0x7ff0fff81c60>
None
<_sre.SRE_Match object at 0x7ff0fff81c60>

In [61]:
# reg1 is the compiled 'a{0,3}shique' (case-insensitive): up to three leading a's.
# NOTE(review): with {0,3} the third title ("aaashique") should match as well; the
# recorded None for it looks like stale output from an earlier definition of
# reg1 - re-run the notebook from a fresh kernel to confirm.
for title in (my_film1, my_film2, my_film3, my_film4):
    print(reg1.match(title))


<_sre.SRE_Match object at 0x7ff0fff81d98>
<_sre.SRE_Match object at 0x7ff0fff81d98>
None
<_sre.SRE_Match object at 0x7ff0fff81d98>

In [ ]:
# Character sets
# [a-z] - exactly one character in the range a to z.
# ^[a-z] - a string that starts with a character from a to z.
# [^a-z] - exactly one character that is NOT in the range a to z.
# [0-9] - exactly one digit.
# [*+?.] - inside a set, the special characters are treated as literal symbols.

In [62]:
my_sentence1 = 'python is a good language'  # five lowercase words separated by single spaces

In [64]:
# A set without a quantifier consumes a single character.
one_letter = re.match('[a-z]', my_sentence1)
print(one_letter)
print(one_letter.group())


<_sre.SRE_Match object at 0x7ff0fff96100>
p

In [65]:
# [a-z]+ keeps consuming lowercase letters and stops at the first space: one whole word.
one_word = re.match('[a-z]+', my_sentence1)
print(one_word)
print(one_word.group())


<_sre.SRE_Match object at 0x7ff0fff961d0>
python

In [66]:
# \s  - one whitespace character (a space, tab, newline, ...)
# \s+ - a run of one or more whitespace characters
# The raw string (r'...') hands the backslashes to the regex engine untouched.
print(re.match(r'[a-z]+\s+', my_sentence1).group())


python 

In [67]:
# Five words separated by whitespace, spelled out one token at a time.
five_words = re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+', my_sentence1)
print(five_words.group())


python is a good language

In [68]:
# (...){4} repeats the "word + whitespace" group four times instead of writing it out.
five_words = re.match(r'([a-z]+\s+){4}[a-z]+', my_sentence1)
print(five_words.group())


python is a good language

In [69]:
# \w (a letter, digit or underscore) replaces [a-z] inside the repeated group.
five_words = re.match(r'(\w+\s+){4}[a-z]+', my_sentence1)
print(five_words.group())


python is a good language

In [112]:
# grouping
whole = re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+', my_sentence1)
print(whole.group())     # no argument: the whole match
print(whole.group(0))    # group 0 is that same whole match
# index based grouping: each (...) captures a numbered group, counted from 1
ends = re.match(r'([a-z]+)\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+([a-z]+)', my_sentence1)
print(ends.group(1))
print(ends.group(2))
print(ends.groups())     # all captured groups as one tuple
# a dictionary based grouping: (?P<name>...) lets a group be fetched by its name
named = re.match(r'(?P<why>[a-z]+)\s+[a-z]+\s+[a-z]+\s+[a-z]+\s+(?P<what>[a-z]+)', my_sentence1)
print(named.group('why'))


python is a good language
python is a good language
python
language
('python', 'language')
python

In [87]:
# VERBOSE (re.X): whitespace and #-comments inside the pattern are ignored,
# so the expression can be laid out over several annotated lines.
verbose_pattern = r'''
(\w+\s+)     # a word and a space
{4}          # the word and space repeated 4 times.
[a-z]+       # the word repeated again without space
'''
print(re.match(verbose_pattern, my_sentence1, re.X).group())


python is a good language

In [92]:
# search,match,findall
my_string = "python is a great language.python is good for starters.python is my first language."
word = "python"
print(re.match(word, my_string).group())    # match: tries only at the start of the string
print(re.search(word, my_string).group())   # search: scans the string and stops at the first hit
print(re.findall(word, my_string))          # findall: a list with every hit in the string


python
python
['python', 'python', 'python']

In [97]:
# Multiline flag - re.M
my_string1 = "python is a great language.\npython is good for starters.\npython is my first language."
print(my_string1)
print(re.findall("python", my_string1))          # every occurrence, anchored or not
print(re.findall("^python", my_string1))         # ^ alone: start of the whole string only
print(re.findall('^python', my_string1, flags=re.M))  # re.M: ^ also matches right after each newline


python is a great language.
python is good for starters.
python is my first language.
['python', 'python', 'python']
['python']
['python', 'python', 'python']

In [84]:
# example1: pull the MAC (hardware) address out of an ifconfig-style line.
my_string = "enx020756033063 Link encap:Ethernet  HWaddr 02:07:56:03:30:63"
# A MAC address is six colon-separated pairs of HEXADECIMAL digits.  A
# digits-only class ([0-9] or \d) misses addresses such as 34:17:eb:84:7d:29,
# and an open-ended final "+" would swallow any digits that follow the address,
# so every pair - including the last one - is exactly two hex digits.
hex_pair = r'[0-9a-fA-F]{2}'
mac_pattern = r'(?:%s:){5}%s' % (hex_pair, hex_pair)
print(re.search(hex_pair + ':', my_string).group())   # first pair plus its colon
print(re.search(mac_pattern, my_string).group())      # the complete address
# Same idea written with a lowercase-only class plus the re.I flag.
print(re.search(r'(?:[0-9a-f]{2}:){5}[0-9a-f]{2}', my_string, re.I).group())


02:
02:07:56:03:30:63
02:07:56:03:30:63

In [101]:
# example 2: pick the well-formed e-mail addresses out of free text.
email = """
my valid email address is tuxfux.hlp@gmail.com
my valid email address is tuxfux.hlp@yahoo.co.in
my valid email address is tuxfux.hlp
my valid email address is tuxfux.hlp@google.com
my valid email address is @gmail.com
my valid email address is tuxfux123.hlp@yahoo.co.in
"""

# local part : letters, digits and the common punctuation . _ % + -
# domain     : labels of letters, digits and hyphens joined by dots, at least one dot
# The simpler "[0-9a-zA-Z.]+@[a-zA-Z.]+" cut addresses at "_", "+" or "-",
# rejected digits and hyphens in the domain, accepted a bare "a@." and kept
# the full stop that ends a sentence as part of the address.
# NOTE: this rebinds the name `reg` used by the earlier compile cells.
reg = re.compile(r"[0-9a-zA-Z._%+-]+@[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)+")
reg.findall(email)


Out[101]:
['tuxfux.hlp@gmail.com',
 'tuxfux.hlp@yahoo.co.in',
 'tuxfux.hlp@google.com',
 'tuxfux123.hlp@yahoo.co.in']

In [106]:
# example 3: ifconfig-style text for four interfaces (enp2s0, enx020756033063,
# lo, wlp3s0), kept as one multi-line string for the regex searches that follow.
ip_address = '''
enp2s0    Link encap:Ethernet  HWaddr 34:17:eb:84:7d:29  
          UP BROADCAST MULTICAST  MTU:1500  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

enx020756033063 Link encap:Ethernet  HWaddr 02:07:56:03:30:63  
          inet addr:192.168.42.151  Bcast:192.168.42.255  Mask:255.255.255.0
          inet6 addr: fe80::7:56ff:fe03:3063/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:319842 errors:5 dropped:0 overruns:0 frame:5
          TX packets:381352 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:23969227 (23.9 MB)  TX bytes:374266709 (374.2 MB)

lo        Link encap:Local Loopback  
          inet addr:127.0.0.1  Mask:255.0.0.0
          inet6 addr: ::1/128 Scope:Host
          UP LOOPBACK RUNNING  MTU:65536  Metric:1
          RX packets:4859 errors:0 dropped:0 overruns:0 frame:0
          TX packets:4859 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:4896839 (4.8 MB)  TX bytes:4896839 (4.8 MB)

wlp3s0    Link encap:Ethernet  HWaddr d0:7e:35:5d:58:18  
          UP BROADCAST MULTICAST  MTU:1500  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
'''

# re.I lets the lowercase pattern text 'bcast' find "Bcast".  '.' does not cross
# a newline, so each result starts at the beginning of the line that holds
# "Bcast" (leading spaces included) and ends with the word itself.
for expr in (r'.*bcast', r'.*:.*\s+bcast'):
    print(re.search(expr, ip_address, re.I).group())


          inet addr:192.168.42.151  Bcast
          inet addr:192.168.42.151  Bcast

In [ ]:
# REFERENCES
# https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html