In [ ]:
# re,regex,pattern matching.
# references: https://docs.python.org/2/library/re.html
# https://docs.python.org/2/howto/regex.html
In [1]:
import re
In [2]:
# List every name the re module exposes.
print(dir(re))
In [3]:
# help() prints the documentation itself and returns None, so wrapping it in
# print would only append a stray "None" line to the output.
help(re.match)
In [4]:
# Sample word used by the match/search examples that follow.
my_string = 'python'
In [5]:
# re.match tries the pattern at the very start of the string.
print(re.match("p", my_string))
In [6]:
# re.match succeeds only when the pattern matches from the first character.
print(re.match("py", my_string))       # match
print(re.match("pythons", my_string))  # no match: the string has no trailing "s"
print(re.match("yt", my_string))       # no match: "yt" is not at the start
In [8]:
# Matching is case-sensitive unless re.I (re.IGNORECASE) is passed.
print(re.match("Py", my_string))        # no match: "P" != "p"
print(re.match("Py", my_string, re.I))  # match
In [ ]:
# search
In [9]:
# help() already prints its text and returns None; no print wrapper needed.
help(re.search)
In [10]:
# re.search scans the whole string, so a pattern in the middle is found.
print(re.search("yt", my_string))
In [11]:
# sentences
my_sentence1 = "python is a great language."
my_sentence2 = "one of the great languages is python."
# match only looks at the start of the string; search scans all of it.
print(re.match("python", my_sentence1))   # match object
print(re.match("python", my_sentence2))   # None: "python" is not at the start
print(re.search('python', my_sentence1))  # match object
print(re.search('python', my_sentence2))  # match object
In [12]:
# compile
# help() prints the documentation and returns None; printing its result would
# add a stray "None" line.
help(re.compile)
In [14]:
# Compile once so the pattern and its flags can be reused by later calls.
reg = re.compile("python", re.I)
# help() prints on its own and returns None, so it is called without print.
help(reg.match)
In [15]:
# The compiled pattern offers the same match/search methods as the module.
print(reg.match(my_sentence1))   # match object
print(reg.match(my_sentence2))   # None: "python" is not at the start
print(reg.search(my_sentence1))  # match object
print(reg.search(my_sentence2))  # match object
In [ ]:
# special characters
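# ^ -> caret -> beginning of the string (or of each line with re.MULTILINE).
# $ -> dollar -> end of the string (or of each line with re.MULTILINE).
# . -> dot -> any one character except a newline (unless re.DOTALL is set).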
In [16]:
# Sample sentences: the first starts with "python", the second ends with it.
my_sentence1, my_sentence2 = (
    "python is a great language.",
    "one of the great languages is python.",
)
In [17]:
reg = re.compile('python', re.I)
reg1 = re.compile('^python', re.I)  # ^ pins the pattern to the start
print(reg1.search(my_sentence1))  # match object
print(reg.search(my_sentence1))   # match object
print(reg.search(my_sentence2))   # match object
print(reg1.search(my_sentence2))  # None: the sentence does not start with "python"
In [24]:
# dot (.)
# . matches any single character except a newline (\n) unless re.DOTALL is set.
my_string = "python"
my_string1 = "python\n"
print(re.match('......', my_string, re.I))                 # match: six characters
print(re.match('.......', my_string1, re.I))               # None: . will not match "\n"
print(re.match('.......', my_string1, re.I | re.DOTALL))   # match: DOTALL lets . match "\n"
print(re.match('...', my_string, re.I).group())            # first three characters
In [7]:
# example: print the student names that are exactly five characters long.
import re
reg = re.compile('^.....$', re.I)
students = ['khiri', 'Sujani', 'rahulji', 'Ramyaji', 'kirankumar', 'kumar']
for value in students:
    # Run the regex once per name and reuse the result.
    found = reg.match(value)
    if found:
        print(found.group())
In [ ]:
# quantifiers (repetition characters)
In [ ]:
# * -> zero or more repetitions of the preceding element.
# + -> one or more repetitions of the preceding element.
# ? -> zero or one repetition of the preceding element.
In [8]:
# Four spellings that differ only in how many leading "a"s they carry.
film1, film2, film3, film4 = "ashique", "aashique", "aaashique", "shique"
In [9]:
# * lets the leading "a" appear any number of times, including zero.
star_pattern = "a*shique"
reg = re.compile(star_pattern)
In [10]:
# a* accepts zero or more leading "a"s, so all four spellings match.
print(reg.match(film1))
print(reg.match(film2))
print(reg.match(film3))
print(reg.match(film4))
In [11]:
# + demands at least one leading "a" before "shique".
plus_pattern = "a+shique"
reg1 = re.compile(plus_pattern)
In [12]:
# a+ needs one or more leading "a"s.
print(reg1.match(film1))  # match
print(reg1.match(film2))  # match
print(reg1.match(film3))  # match
print(reg1.match(film4))  # None: no leading "a"
In [13]:
# ? makes the leading "a" optional: at most one is allowed.
optional_pattern = "a?shique"
reg2 = re.compile(optional_pattern)
In [14]:
# a? accepts zero or one leading "a" when anchored at the start by match().
print(reg2.match(film1))  # match
print(reg2.match(film2))  # None: two leading "a"s
print(reg2.match(film3))  # None: three leading "a"s
print(reg2.match(film4))  # match
In [15]:
# Reminder of the inputs:
#   film1 = "ashique"
#   film2 = "aashique"
#   film3 = "aaashique"
#   film4 = "shique"
# search() may start anywhere, so it can skip extra leading "a"s and still
# find "ashique" (or "shique") further into the string.
print(reg2.search(film1))  # match: "ashique"
print(reg2.search(film2))  # match: "ashique" starting at index 1
print(reg2.search(film3))  # match: "ashique" starting at index 2
print(reg2.search(film4))  # match: "shique"
In [17]:
# exercise
# Tagged strings used to show that quantifiers are greedy by default.
my_name1, my_name2 = "<n1>kumar</n1>", "<n2>hello</n2>"
my_name3, my_name4 = "<hello>raj</hello>", "<hai>whatsup</hai>"
In [18]:
# Greedy form: .* grabs as much as it can before the final ">".
greedy_tag = '<.*>'
reg3 = re.compile(greedy_tag)
In [20]:
# Greedy .* runs to the last ">", so each whole string is returned.
print(reg3.match(my_name1).group())
print(reg3.match(my_name2).group())
print(reg3.match(my_name3).group())
print(reg3.match(my_name4).group())
In [ ]:
# ??, *?, +? -> non-greedy (minimal) versions of ?, * and +.
In [24]:
# Non-greedy form: .*? stops at the first ">" it can.
lazy_tag = '<.*?>'
reg4 = re.compile(lazy_tag)
In [25]:
# Non-greedy .*? stops at the first ">", so only the opening tag is returned.
print(reg4.match(my_name1).group())
print(reg4.match(my_name2).group())
print(reg4.match(my_name3).group())
print(reg4.match(my_name4).group())
In [26]:
# repetition counts
In [27]:
# {m} -> exactly m repetitions of the preceding element.
# {m,n} -> between m and n repetitions (inclusive) of the preceding element.
# {m,} -> m or more repetitions of the preceding element.
In [29]:
# Same four spellings as before, built from the number of leading "a"s (0-3).
film4, film1, film2, film3 = ("a" * count + "shique" for count in range(4))
In [30]:
# a{2} requires exactly two "a"s immediately before "shique".
print(re.match('a{2}shique', film1))  # no: only one "a"
print(re.match('a{2}shique', film2))  # yes
print(re.match('a{2}shique', film3))  # no: a third "a" sits where "s" is expected
print(re.match('a{2}shique', film4))  # no: no "a" at all
In [31]:
# character sets
In [32]:
# [a-z] => match one character from a to z
# [0-9] => match one digit from 0 to 9
# ^[a-z] => a character from a to z at the start of the string
# [^a-z] => one character that is NOT in a to z
# [.+?*] => inside [], the characters . + ? * lose their special meaning and match literally.
In [33]:
# Four-word sentence ending in a dot, used by the character-set examples.
my_sentence = " ".join(["Today", "day", "is", "tuesday."])
In [35]:
# A character set without a quantifier matches exactly one character.
print(re.match('[a-z]', my_sentence, re.I).group())
In [36]:
# Adding + extends the match across the whole first word.
print(re.match('[a-z]+', my_sentence, re.I).group())
In [38]:
# handling space
print(re.match('[a-z]+ ', my_sentence, re.I).group())
# Raw string: \s is a regex escape, not a valid Python string escape.
print(re.match(r'[a-z]+\s+', my_sentence, re.I).group())
In [39]:
# other words
# Raw string keeps the \s escapes intact for the regex engine.
print(re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+', my_sentence, re.I).group())
In [40]:
# [.] matches a literal dot, so the whole sentence is captured.
print(re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]', my_sentence, re.I).group())
In [41]:
#https://docs.python.org/2/howto/regex.html
# (\w+\s+){3} repeats "word plus whitespace" three times instead of spelling it out.
print(re.match(r'(\w+\s+){3}[a-z]+[.]', my_sentence, re.I).group())
In [42]:
# verbose
# re.VERBOSE ignores whitespace in the pattern and allows # comments inside it.
# The pattern is a raw string so \w and \s reach the regex engine unchanged.
print(re.match(r'''
(\w+\s+) # a word and a space
{3} # similar words repeated 3 times just like above word.
[a-z]+[.] # a word and a dot.
''', my_sentence, (re.I | re.VERBOSE)).group())
In [52]:
# grouping
# Each pattern is matched once and the match object reused, instead of
# re-running the same regex for every print.
whole = re.match(r'[a-z]+\s+[a-z]+\s+[a-z]+\s+[a-z]+[.]', my_sentence, re.I)
print(whole.group())   # the entire match
print(whole.group(0))  # group 0 is the same entire match
# indexing: parentheses create numbered groups, counted from 1
indexed = re.match(r'([a-z]+)\s+[a-z]+\s+[a-z]+\s+([a-z]+)[.]', my_sentence, re.I)
print(indexed.group(1))
print(indexed.group(2))
print(indexed.groups())
# keybased
# (?P<key>pattern) names a group so it can be fetched by that key
named = re.match(r'(?P<T>[a-z]+)\s+[a-z]+\s+[a-z]+\s+(?P<t>[a-z]+)[.]', my_sentence, re.I)
print(named.group('t'))
print(named.group('T'))
In [60]:
# findall
# Three newline-terminated lines, each beginning with "Python".
my_sentence = (
    "Python is a great language.\n"
    "Python is my first choice.\n"
    "Python is good.\n"
)
In [55]:
# match
# match() reports only the occurrence at the very start of the string.
print(re.match("python", my_sentence, re.I).group())
In [56]:
# search
# search() stops at the first occurrence it finds.
print(re.search("python", my_sentence, re.I).group())
In [58]:
# findall
# help() prints its text and returns None; no print wrapper needed.
help(re.findall)
In [57]:
# findall
# findall() returns every non-overlapping occurrence as a list of strings.
print(re.findall("python", my_sentence, re.I))
In [64]:
# MULTILINE
print(my_sentence)
# With re.M, ^ also matches right after each newline, not just at the string start.
print(re.findall("^python", my_sentence, (re.I | re.M)))
In [65]:
# examples
In [75]:
# example1
# Sample text for address extraction: one candidate per line, some complete
# (name@domain) and some missing either the name or the domain part.
my_email = '''
my valid email address is khiri@gmail.com
my valid email address is sujani@yahoo.co.in
my valid email address is rahul@
my valid email address is @yahoo.co.in
my valid email address is 123ramya@gmail.com
my valid email adress is tuxfux.hlp@gmail.co.in
'''
In [76]:
# One or more of [a-z0-9.] is required on both sides of the "@".
email_pattern = '[a-z0-9.]+@[a-z0-9.]+'
reg = re.compile(email_pattern)
In [77]:
# Lists every substring of my_email that the compiled pattern matches.
print(reg.findall(my_email))
In [ ]:
# example2
# remove tcloudost-VirtualBox from string and print it.
In [79]:
# Sample syslog-style line: timestamp, host name, program tag, then the message.
my_log = (
    "Apr 25 09:47:56 tcloudost-VirtualBox mtp-probe: "
    "bus: 1, device: 3 was not an MTP device"
)
In [108]:
# Target the host name: a hyphenated word followed by whitespace.
# Requiring a trailing ":" instead skips the host and matches "mtp-probe:".
# The trailing whitespace is part of the match so that cutting the match out
# of the line leaves single spacing behind.
m = re.search(r'[a-z]+-[a-z]+\s+', my_log, re.I)
In [109]:
# List the attributes and methods available on the match object.
print(dir(m))
In [110]:
# start()/end() give the slice boundaries of the match; group() gives its text.
print(m.start())
print(m.end())
print(m.group())
In [111]:
# Rebuild the line from the text before and after the match, dropping the match.
print(my_log[:m.start()] + my_log[m.end():])
In [ ]:
In [ ]: