notebook.community

Edit and run



In [ ]:

    
# re,regex,pattern matching.
# references: https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html



In [1]:

    
import re



In [2]:

    
print dir(re)









    



['DEBUG', 'DOTALL', 'I', 'IGNORECASE', 'L', 'LOCALE', 'M', 'MULTILINE', 'S', 'Scanner', 'T', 'TEMPLATE', 'U', 'UNICODE', 'VERBOSE', 'X', '_MAXCACHE', '__all__', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '__version__', '_alphanum', '_cache', '_cache_repl', '_compile', '_compile_repl', '_expand', '_pattern_type', '_pickle', '_subx', 'compile', 'copy_reg', 'error', 'escape', 'findall', 'finditer', 'match', 'purge', 'search', 'split', 'sre_compile', 'sre_parse', 'sub', 'subn', 'sys', 'template']



In [3]:

    
print help(re.match)









    



Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a match object, or None if no match was found.

None



In [4]:

    
my_string="python"



In [5]:

    
print re.match("p",my_string)









    



<_sre.SRE_Match object at 0x7fcac87dd6b0>



In [6]:

    
print re.match("py",my_string) # match
print re.match("pythons",my_string) # no match
print re.match("yt",my_string) # no match









    



<_sre.SRE_Match object at 0x7fcac87dd5e0>
None
None



In [8]:

    
print re.match("Py",my_string)
print re.match("Py",my_string,re.I)









    



None
<_sre.SRE_Match object at 0x7fcac87dd578>



In [ ]:

    
# search



In [9]:

    
print help(re.search)









    



Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.

None



In [10]:

    
print re.search("yt",my_string)









    



<_sre.SRE_Match object at 0x7fcac87dd440>



In [11]:

    
# sentences
my_sentence1 = "python is a great language."
my_sentence2 = "one of the great languages is python."

print re.match("python",my_sentence1) # true
print re.match("python",my_sentence2) # False
print re.search('python',my_sentence1) # true
print re.search('python',my_sentence2) # true









    



<_sre.SRE_Match object at 0x7fcac87dd510>
None
<_sre.SRE_Match object at 0x7fcac87dd510>
<_sre.SRE_Match object at 0x7fcac87dd510>



In [12]:

    
# compile
print help(re.compile)









    



Help on function compile in module re:

compile(pattern, flags=0)
    Compile a regular expression pattern, returning a pattern object.

None



In [14]:

    
reg = re.compile("python",re.I)

print help(reg.match)









    



Help on built-in function match:

match(...)
    match(string[, pos[, endpos]]) --> match object or None.
    Matches zero or more characters at the beginning of the string

None



In [15]:

    
print reg.match(my_sentence1)  # true
print reg.match(my_sentence2)  # False
print reg.search(my_sentence1) # true
print reg.search(my_sentence2) # true









    



<_sre.SRE_Match object at 0x7fcac87dd168>
None
<_sre.SRE_Match object at 0x7fcac87dd168>
<_sre.SRE_Match object at 0x7fcac87dd168>



In [ ]:

    
# special characters
# ^ -> caret -> beginning of the string (of each line with re.MULTILINE).
# $ -> dollar -> end of the string (of each line with re.MULTILINE).
# . -> dot -> any one character except a newline.



In [16]:

    
# sentences
my_sentence1 = "python is a great language."
my_sentence2 = "one of the great languages is python."



In [17]:

    
reg = re.compile('python',re.I)
reg1 = re.compile('^python',re.I)
print reg1.search(my_sentence1) # true
print reg.search(my_sentence1)  # true
print reg.search(my_sentence2)  # true
print reg1.search(my_sentence2) # false









    



<_sre.SRE_Match object at 0x7fcac87dd370>
<_sre.SRE_Match object at 0x7fcac87dd370>
<_sre.SRE_Match object at 0x7fcac87dd370>
None



In [24]:

    
# dot (.)
# . does not match a newline (\n) unless re.DOTALL is given; it does match \r.
my_string="python"
my_string1 = "python\n"
print re.match('......',my_string,re.I)
print re.match('.......',my_string1,re.I)
print re.match('.......',my_string1,(re.I|re.DOTALL))
print re.match('...',my_string,re.I).group()









    



<_sre.SRE_Match object at 0x7fcac87ddcc8>
None
<_sre.SRE_Match object at 0x7fcac87ddcc8>
pyt



In [7]:

    
# example: print the names from the list that are exactly five characters long.
import re

# '^' and '$' anchor the five dots to both ends, so the whole name must be 5 characters.
reg = re.compile('^.....$',re.I)
students = ['khiri','Sujani','rahulji','Ramyaji','kirankumar','kumar']

for value in students:
    # Run the regex once per name and reuse the match object
    # instead of matching a second time just to print it.
    found = reg.match(value)
    if found:
        print(found.group())









    



khiri
kumar



In [ ]:

    
# repetition characters (quantifiers)



In [ ]:

    
# * -> zero or more repetitions of the preceding character.
# + -> one or more repetitions of the preceding character.
# ? -> zero or one repetition of the preceding character.



In [8]:

    
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"



In [9]:

    
reg = re.compile("a*shique")



In [10]:

    
print reg.match(film1)
print reg.match(film2)
print reg.match(film3)
print reg.match(film4)









    



<_sre.SRE_Match object at 0x7f5003b97cc8>
<_sre.SRE_Match object at 0x7f5003b97cc8>
<_sre.SRE_Match object at 0x7f5003b97cc8>
<_sre.SRE_Match object at 0x7f5003b97cc8>



In [11]:

    
reg1 = re.compile("a+shique")



In [12]:

    
print reg1.match(film1)
print reg1.match(film2)
print reg1.match(film3)
print reg1.match(film4)









    



<_sre.SRE_Match object at 0x7f5003b97e68>
<_sre.SRE_Match object at 0x7f5003b97e68>
<_sre.SRE_Match object at 0x7f5003b97e68>
None



In [13]:

    
reg2 = re.compile("a?shique")



In [14]:

    
print reg2.match(film1)
print reg2.match(film2)
print reg2.match(film3)
print reg2.match(film4)









    



<_sre.SRE_Match object at 0x7f5003ba8168>
None
None
<_sre.SRE_Match object at 0x7f5003ba8168>



In [15]:

    
'''
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"
'''

# search() scans the whole string for "a?shique", so it also succeeds for
# film2 and film3, where match() (anchored at the start) returned None.
print reg2.search(film1) # true
print reg2.search(film2) # true - found starting at index 1; match() gave None
print reg2.search(film3) # true - found starting at index 2; match() gave None
print reg2.search(film4) # true - the optional "a" matches zero times









    



<_sre.SRE_Match object at 0x7f5003ba82a0>
<_sre.SRE_Match object at 0x7f5003ba82a0>
<_sre.SRE_Match object at 0x7f5003ba82a0>
<_sre.SRE_Match object at 0x7f5003ba82a0>



In [17]:

    
# exercise
# quantifiers are greedy: they match as much text as possible.
my_name1 = "<n1>kumar</n1>"
my_name2 = "<n2>hello</n2>"
my_name3 = "<hello>raj</hello>"
my_name4 = "<hai>whatsup</hai>"



In [18]:

    
reg3 = re.compile('<.*>')



In [20]:

    
print reg3.match(my_name1).group()
print reg3.match(my_name2).group()
print reg3.match(my_name3).group()
print reg3.match(my_name4).group()









    



<n1>kumar</n1>
<n2>hello</n2>
<hello>raj</hello>
<hai>whatsup</hai>



In [ ]:

    
# ??,*?,+? -> non-greedy (minimal) forms of ?, * and +: they match as little text as possible.



In [24]:

    
reg4 = re.compile('<.*?>')



In [25]:

    
print reg4.match(my_name1).group()
print reg4.match(my_name2).group()
print reg4.match(my_name3).group()
print reg4.match(my_name4).group()









    



<n1>
<n2>
<hello>
<hai>



In [26]:

    
# counted repetition ({m}, {m,n}, {m,})



In [27]:

    
# {m} -> exactly m repetitions of the preceding character.
# {m,n} -> between m and n repetitions (both inclusive).
# {m,} -> m or more repetitions.



In [29]:

    
film1 = "ashique"
film2 = "aashique"
film3 = "aaashique"
film4 = "shique"



In [30]:

    
print re.match('a{2}shique',film1) # no
print re.match('a{2}shique',film2) # yes
print re.match('a{2}shique',film3) # no
print re.match('a{2}shique',film4) # no









    



None
<_sre.SRE_Match object at 0x7f5003ba89f0>
None
None



In [31]:

    
# character sets



In [32]:

    
# [a-z] => matches any one character from a to z
# [0-9] => matches any one digit from 0 to 9
# ^[a-z] => the string starts with a character from a to z
# [^a-z] => matches any one character that is NOT a to z
# [.+?*] => inside [] these special characters lose their meaning and match literally.



In [33]:

    
# 
my_sentence = "Today day is tuesday."



In [35]:

    
print re.match('[a-z]',my_sentence,re.I).group()



In [36]:

    
print re.match('[a-z]+',my_sentence,re.I).group()









    



Today



In [38]:

    
# handling space
print re.match('[a-z]+ ',my_sentence,re.I).group()
print re.match('[a-z]+\s+',my_sentence,re.I).group()









    



Today 
Today



In [39]:

    
# other words
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+',my_sentence,re.I).group()









    



Today day is tuesday



In [40]:

    
print re.match('[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence,re.I).group()









    



Today day is tuesday.



In [41]:

    
#https://docs.python.org/2/howto/regex.html
print re.match('(\w+\s+){3}[a-z]+[.]',my_sentence,re.I).group()









    



Today day is tuesday.



In [42]:

    
# verbose
print re.match('''
(\w+\s+)   # a word and a space
{3}        # similar words repeated 3 times just like above word.
[a-z]+[.]  # a word and a dot.
''',my_sentence,(re.I|re.VERBOSE)).group()









    



Today day is tuesday.



In [52]:

    
# grouping: match each pattern once and read the groups off the match object.
# Raw strings (r'...') keep regex backslashes such as \s away from Python's
# own string-escape handling.
plain = re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]',my_sentence,re.I)
print(plain.group())   # no argument -> the whole match
print(plain.group(0))  # group 0 is the whole match as well
# indexing: parenthesised groups are numbered from 1, left to right
indexed = re.match(r'([a-z]+)\s+[a-z]+\s+[a-z]+\s+([a-z]+)[.]',my_sentence,re.I)
print(indexed.group(1))
print(indexed.group(2))
print(indexed.groups())
# keybased
# (?P<key>pattern) names a group so it can be fetched by that key
named = re.match(r'(?P<T>[a-z]+)\s+[a-z]+\s+[a-z]+\s+(?P<t>[a-z]+)[.]',my_sentence,re.I)
print(named.group('t'))
print(named.group('T'))









    



Today day is tuesday.
Today day is tuesday.
Today
tuesday
('Today', 'tuesday')
tuesday
Today



In [60]:

    
# findall
my_sentence = "Python is a great language.\nPython is my first choice.\nPython is good.\n"



In [55]:

    
# match
print re.match("python",my_sentence,re.I).group()









    



Python



In [56]:

    
# search
print re.search("python",my_sentence,re.I).group()









    



Python



In [58]:

    
# findall
print help(re.findall)









    



Help on function findall in module re:

findall(pattern, string, flags=0)
    Return a list of all non-overlapping matches in the string.
    
    If one or more groups are present in the pattern, return a
    list of groups; this will be a list of tuples if the pattern
    has more than one group.
    
    Empty matches are included in the result.

None



In [57]:

    
# findall
print re.findall("python",my_sentence,re.I)









    



['Python', 'Python', 'Python']



In [64]:

    
# MULTILINE

print my_sentence
print re.findall("^python",my_sentence,(re.I|re.M))









    



Python is a great language.
Python is my first choice.
Python is good.

['Python', 'Python', 'Python']



In [65]:

    
# examples



In [75]:

    
# example1
my_email = '''
my valid email address is khiri@gmail.com
my valid email address is sujani@yahoo.co.in
my valid email address is rahul@
my valid email address is @yahoo.co.in
my valid email address is 123ramya@gmail.com
my valid email adress is tuxfux.hlp@gmail.co.in

'''



In [76]:

    
reg = re.compile('[a-z0-9.]+@[a-z0-9.]+')



In [77]:

    
print reg.findall(my_email)









    



['khiri@gmail.com', 'sujani@yahoo.co.in', '123ramya@gmail.com', 'tuxfux.hlp@gmail.co.in']



In [ ]:

    
# example2
# remove the process tag ('mtp-probe:') that follows the hostname tcloudost-VirtualBox and print the rest of the line.



In [79]:

    
my_log="Apr 25 09:47:56 tcloudost-VirtualBox mtp-probe: bus: 1, device: 3 was not an MTP device"



In [108]:

    
m = re.search('[a-z]+-[a-z]+[:]',my_log,re.I)



In [109]:

    
print dir(m)









    



['__class__', '__copy__', '__deepcopy__', '__delattr__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'end', 'endpos', 'expand', 'group', 'groupdict', 'groups', 'lastgroup', 'lastindex', 'pos', 're', 'regs', 'span', 'start', 'string']



In [110]:

    
print m.start()
print m.end()
print m.group()









    



37
47
mtp-probe:



In [111]:

    
print my_log[:m.start()] + my_log[m.end():]









    



Apr 25 09:47:56 tcloudost-VirtualBox  bus: 1, device: 3 was not an MTP device



In [ ]:



In [ ]: