In [ ]:
# re,regex,pattern matching.
# references: https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html

In [1]:
import re

In [2]:
print dir(re)


['DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '__version__', '_alphanum', '_cache', '_cache_repl', '_compile', '_compile_repl', '_expand', '_pattern_type', '_pickle', '_subx', 'compile', 'copy_reg', 'error', 'escape', 'findall', 'finditer', 'match', 'purge', 'search', 'split', 'sre_compile', 'sre_parse', 'sub', 'subn', 'sys', 'template']

In [3]:
print help(re.match)


Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found.

None

In [4]:
my_string="python"

In [5]:
print re.match("p",my_string)


<_sre.SRE_Match object at 0x7fcac87dd6b0>

In [6]:
# re.match() only succeeds when the pattern matches at the START of the string.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(re.match("py",my_string)) # match: "python" starts with "py"
print(re.match("pythons",my_string)) # no match: the pattern is longer than the string
print(re.match("yt",my_string)) # no match: "yt" occurs, but not at position 0


<_sre.SRE_Match object at 0x7fcac87dd5e0>
None
None

In [8]:
print re.match("Py",my_string)
print re.match("Py",my_string,re.I)


None
<_sre.SRE_Match object at 0x7fcac87dd578>

In [ ]:
# search

In [9]:
print help(re.search)


Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.

None

In [10]:
print re.search("yt",my_string)


<_sre.SRE_Match object at 0x7fcac87dd440>

In [11]:
# sentences
my_sentence1 = "python is a great language."
my_sentence2 = "one of the great languages is python."

# match() is anchored at the start of the string; search() scans the whole string.
print(re.match("python",my_sentence1)) # match: the sentence starts with "python"
print(re.match("python",my_sentence2)) # None: "python" is not at the start
print(re.search('python',my_sentence1)) # match
print(re.search('python',my_sentence2)) # match: found near the end


<_sre.SRE_Match object at 0x7fcac87dd510>
None
<_sre.SRE_Match object at 0x7fcac87dd510>
<_sre.SRE_Match object at 0x7fcac87dd510>

In [12]:
# compile
print help(re.compile)


Help on function compile in module re:

compile(pattern, flags=0)
    Compile a regular expression pattern, returning a pattern object.

None

In [14]:
reg = re.compile("python",re.I)

print help(reg.match)


Help on built-in function match:

match(...)
    match(string[, pos[, endpos]]) --> match object or None.
    Matches zero or more characters at the beginning of the string

None

In [15]:
print reg.match(my_sentence1)  # true
print reg.match(my_sentence2)  # False
print reg.search(my_sentence1) # true
print reg.search(my_sentence2) # true


<_sre.SRE_Match object at 0x7fcac87dd168>
None
<_sre.SRE_Match object at 0x7fcac87dd168>
<_sre.SRE_Match object at 0x7fcac87dd168>

In [ ]:
# special characters
# ^ -> caret -> beginning of the string (of every line with re.MULTILINE).
# $ -> dollar -> end of the string (of every line with re.MULTILINE).
# . -> dot -> any one character except a newline.

In [16]:
# sentences
my_sentence1 = "python is a great language."
my_sentence2 = "one of the great languages is python."

In [17]:
reg = re.compile('python',re.I)     # unanchored: may be found anywhere in the string
reg1 = re.compile('^python',re.I)   # ^ anchors the pattern to the start of the string
print(reg1.search(my_sentence1)) # match: the sentence starts with "python"
print(reg.search(my_sentence1))  # match
print(reg.search(my_sentence2))  # match: "python" is found near the end
print(reg1.search(my_sentence2)) # None: "python" is not at the start


<_sre.SRE_Match object at 0x7fcac87dd370>
<_sre.SRE_Match object at 0x7fcac87dd370>
<_sre.SRE_Match object at 0x7fcac87dd370>
None

In [24]:
# dot (.)
# . matches any single character except a newline (\n).
# With the re.DOTALL flag it matches a newline as well.
my_string="python"
my_string1 = "python\n"
print(re.match('......',my_string,re.I))   # match: six dots cover the six letters
print(re.match('.......',my_string1,re.I)) # None: the seventh dot cannot match "\n"
print(re.match('.......',my_string1,(re.I|re.DOTALL))) # match: DOTALL lets . match "\n"
print(re.match('...',my_string,re.I).group()) # group() returns the matched text: pyt


<_sre.SRE_Match object at 0x7fcac87ddcc8>
None
<_sre.SRE_Match object at 0x7fcac87ddcc8>
pyt

In [7]:
# example: print every name that is exactly five characters long.
import re

# ^.....$ : start of string, five characters of any kind, end of string.
reg = re.compile('^.....$',re.I)
students = ['khiri','Sujani','rahulji','Ramyaji','kirankumar','kumar']

for value in students:
    found = reg.match(value)  # run the match once and reuse the result
    if found:
        print(found.group())


khiri
kumar

In [ ]:
# quantifiers (repetition characters)

In [ ]:
# * -> zero or more repetitions of the preceding character.
# + -> one or more repetitions of the preceding character.
# ? -> zero or one occurrence of the preceding character.

In [8]:
# The same title with one, two, three and zero leading "a" characters.
film1, film2, film3, film4 = "ashique", "aashique", "aaashique", "shique"

In [9]:
reg = re.compile("a*shique")

In [10]:
# a* accepts any number of leading "a" (including none), so all four titles match.
print(reg.match(film1)) # one "a"
print(reg.match(film2)) # two "a"
print(reg.match(film3)) # three "a"
print(reg.match(film4)) # zero "a"


<_sre.SRE_Match object at 0x7f5003b97cc8>
<_sre.SRE_Match object at 0x7f5003b97cc8>
<_sre.SRE_Match object at 0x7f5003b97cc8>
<_sre.SRE_Match object at 0x7f5003b97cc8>

In [11]:
reg1 = re.compile("a+shique")

In [12]:
# a+ needs at least one leading "a".
print(reg1.match(film1)) # match: one "a"
print(reg1.match(film2)) # match: two "a"
print(reg1.match(film3)) # match: three "a"
print(reg1.match(film4)) # None: there is no leading "a"


<_sre.SRE_Match object at 0x7f5003b97e68>
<_sre.SRE_Match object at 0x7f5003b97e68>
<_sre.SRE_Match object at 0x7f5003b97e68>
None

In [13]:
reg2 = re.compile("a?shique")

In [14]:
# a? accepts at most one "a", and match() is anchored at the start of the string.
print(reg2.match(film1)) # match: exactly one "a"
print(reg2.match(film2)) # None: a second "a" sits where "s" is required
print(reg2.match(film3)) # None: same reason
print(reg2.match(film4)) # match: zero "a"


<_sre.SRE_Match object at 0x7f5003ba8168>
None
None
<_sre.SRE_Match object at 0x7f5003ba8168>

In [15]:
# film1 = "ashique"
# film2 = "aashique"
# film3 = "aaashique"
# film4 = "shique"
#
# search() may start at any position, so a?shique is found inside every title.
print(reg2.search(film1)) # match: "ashique"
print(reg2.search(film2)) # match: "ashique", starting at index 1
print(reg2.search(film3)) # match: "ashique", starting at index 2
print(reg2.search(film4)) # match: "shique" (zero "a")


<_sre.SRE_Match object at 0x7f5003ba82a0>
<_sre.SRE_Match object at 0x7f5003ba82a0>
<_sre.SRE_Match object at 0x7f5003ba82a0>
<_sre.SRE_Match object at 0x7f5003ba82a0>

In [17]:
# exercise
# quantifiers are greedy: they match as much text as possible.
my_name1 = "<n1>kumar</n1>"
my_name2 = "<n2>hello</n2>"
my_name3 = "<hello>raj</hello>"
my_name4 = "<hai>whatsup</hai>"

In [18]:
reg3 = re.compile('<.*>')

In [20]:
# .* is greedy: it runs up to the LAST ">" in the string,
# so the whole element (opening tag, text and closing tag) is returned.
print(reg3.match(my_name1).group())
print(reg3.match(my_name2).group())
print(reg3.match(my_name3).group())
print(reg3.match(my_name4).group())


<n1>kumar</n1>
<n2>hello</n2>
<hello>raj</hello>
<hai>whatsup</hai>

In [ ]:
# ??, *?, +? -> non-greedy (minimal) forms of ?, * and +: they match as little text as possible.

In [24]:
reg4 = re.compile('<.*?>')

In [25]:
# .*? is non-greedy: it stops at the FIRST ">", so only the opening tag is returned.
print(reg4.match(my_name1).group())
print(reg4.match(my_name2).group())
print(reg4.match(my_name3).group())
print(reg4.match(my_name4).group())


<n1>
<n2>
<hello>
<hai>

In [26]:
# counted repetition: {m}, {m,n}, {m,}

In [27]:
# {m} -> exactly m repetitions of the preceding character.
# {m,n} -> between m and n repetitions (both inclusive).
# {m,} -> m or more repetitions.

In [29]:
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"

In [30]:
# a{2} needs exactly two "a" directly before "shique"; match() anchors at the start.
print(re.match('a{2}shique',film1)) # no: only one "a"
print(re.match('a{2}shique',film2)) # yes: exactly two "a"
print(re.match('a{2}shique',film3)) # no: after "aa" the next character is "a", not "s"
print(re.match('a{2}shique',film4)) # no: no leading "a" at all


None
<_sre.SRE_Match object at 0x7f5003ba89f0>
None
None

In [31]:
# character sets

In [32]:
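# [a-z] => matches one character from a to z
# [0-9] => matches one digit from 0 to 9
# ^[a-z] => a character from a to z at the start of the string
# [^a-z] => matches one character that is NOT in a to z
# [.+?*] => inside [] the special characters . + ? * lose their meaning and match themselves literally.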

In [33]:
# 
my_sentence = "Today day is tuesday."

In [35]:
print re.match('[a-z]',my_sentence,re.I).group()


T

In [36]:
print re.match('[a-z]+',my_sentence,re.I).group()


Today

In [38]:
# handling space
# A literal space matches exactly one space; \s+ matches one or more whitespace characters.
# Patterns that contain a backslash are written as raw strings (r'...') so Python
# hands \s to the regex engine untouched instead of treating it as a string escape.
print(re.match('[a-z]+ ',my_sentence,re.I).group())
print(re.match(r'[a-z]+\s+',my_sentence,re.I).group())


Today 
Today 

In [39]:
# other words
# Four words separated by whitespace; the raw string keeps \s intact for the regex engine.
print(re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+',my_sentence,re.I).group())


Today day is tuesday

In [40]:
# [.] is a character set holding only a dot, so it matches a literal "." and nothing else.
print(re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence,re.I).group())


Today day is tuesday.

In [41]:
#https://docs.python.org/2/howto/regex.html
# (\w+\s+){3} repeats the group "a word followed by whitespace" exactly three times.
print(re.match(r'(\w+\s+){3}[a-z]+[.]',my_sentence,re.I).group())


Today day is tuesday.

In [42]:
# verbose
# With re.VERBOSE, whitespace and #-comments inside the pattern are ignored, so the
# pattern can be spread over several lines. The r prefix keeps \w and \s as written.
print(re.match(r'''
(\w+\s+)   # a word and a space
{3}        # similar words repeated 3 times just like above word.
[a-z]+[.]  # a word and a dot.
''',my_sentence,(re.I|re.VERBOSE)).group())


Today day is tuesday.

In [52]:
# grouping
# Each pattern is matched once and the match object is reused for every lookup.
# group() and group(0) both return the whole matched text.
whole = re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence,re.I)
print(whole.group())
print(whole.group(0))
# indexing
# Parentheses capture sub-matches, numbered from 1, left to right.
indexed = re.match(r'([a-z]+)\s+[a-z]+\s+[a-z]+\s+([a-z]+)[.]',my_sentence,re.I)
print(indexed.group(1))
print(indexed.group(2))
print(indexed.groups())
# keybased
# (?P<key>pattern) gives a group a name, so it can be fetched by that key.
named = re.match(r'(?P<T>[a-z]+)\s+[a-z]+\s+[a-z]+\s+(?P<t>[a-z]+)[.]',my_sentence,re.I)
print(named.group('t'))
print(named.group('T'))


Today day is tuesday.
Today day is tuesday.
Today
tuesday
('Today', 'tuesday')
tuesday
Today

In [60]:
# findall
my_sentence = "Python is a great language.\nPython is my first choice.\nPython is good.\n"

In [55]:
# match
print re.match("python",my_sentence,re.I).group()


Python

In [56]:
# search
print re.search("python",my_sentence,re.I).group()


Python

In [58]:
# findall
print help(re.findall)


Help on function findall in module re:

findall(pattern, string, flags=0)
    Return a list of all non-overlapping matches in the string.
    
    If one or more groups are present in the pattern, return a
    list of groups; this will be a list of tuples if the pattern
    has more than one group.
    
    Empty matches are included in the result.

None

In [57]:
# findall
print re.findall("python",my_sentence,re.I)


['Python', 'Python', 'Python']

In [64]:
# MULTILINE
# With re.M, ^ matches at the start of every line, not only at the start of the string.

print(my_sentence)
print(re.findall("^python",my_sentence,(re.I|re.M)))


Python is a great language.
Python is my first choice.
Python is good.

['Python', 'Python', 'Python']

In [65]:
# examples

In [75]:
# example1
my_email = '''
my valid email address is khiri@gmail.com
my valid email address is sujani@yahoo.co.in
my valid email address is rahul@
my valid email address is @yahoo.co.in
my valid email address is 123ramya@gmail.com
my valid email adress is tuxfux.hlp@gmail.co.in

'''

In [76]:
# Local part: letters, digits and . _ % + -
# Domain: two or more labels joined by dots, which also stops a full stop that ends
# a sentence from being swallowed into the address.
# The group is non-capturing (?:...) so findall() keeps returning whole addresses.
# re.I accepts upper-case letters as well.
reg = re.compile(r'[a-z0-9._%+-]+@[a-z0-9-]+(?:\.[a-z0-9-]+)+', re.I)

In [77]:
print reg.findall(my_email)


['khiri@gmail.com', 'sujani@yahoo.co.in', '123ramya@gmail.com', 'tuxfux.hlp@gmail.co.in']

In [ ]:
# example2
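# remove the "mtp-probe:" tag (the hyphenated word followed by a colon) from the log line and print it.
# note: the hostname "tcloudost-VirtualBox" stays, because it is not followed by a colon.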

In [79]:
my_log="Apr 25 09:47:56 tcloudost-VirtualBox mtp-probe: bus: 1, device: 3 was not an MTP device"

In [108]:
# Find the first "letters-letters:" token (letters, a hyphen, letters, then a colon).
# In my_log this is "mtp-probe:"; "tcloudost-VirtualBox" is not followed by ":" and is skipped.
m = re.search('[a-z]+-[a-z]+[:]',my_log,re.I)

In [109]:
print dir(m)


['__class__', '__copy__', '__deepcopy__', '__delattr__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'end', 'endpos', 'expand', 'group', 'groupdict', 'groups', 'lastgroup', 'lastindex', 'pos', 're', 'regs', 'span', 'start', 'string']

In [110]:
print m.start()
print m.end()
print m.group()


37
47
mtp-probe:

In [111]:
# Join the text before the match with the text after it, cutting the matched span out.
print(my_log[:m.start()] + my_log[m.end():])


Apr 25 09:47:56 tcloudost-VirtualBox  bus: 1, device: 3 was not an MTP device

In [ ]:


In [ ]: