. | match any character except the newline |
^ | match the start of the string or, in MULTILINE mode, the start of each line |
$ | match the end of the string or just before the newline |
* | match the preceding re any number of times (including zero). 'ab*' matches 'a', 'ab', and 'abbbbb' (any number of b) |
+ | same as * but doesn't match 'a' only (any numbers of b but zero) |
? | same as * but only match 1 or 0 repetition ('a' and 'ab' only) |
*?,+?,?? | using ? after *,+,? perform the match in a minimal fashion |
{m} | exactly m copies of the previous re should be matched |
{m,n} | match from m to n repetitions of the previous re, default m value is zero, default n value is inf |
{m,n}? | same as {m,n} but in a minimal fashion (if 'aaaab' and re='a{2,4}?' it will only match the first two aa) |
\ | allows you to match *,?, etc.. |
[] | match a set of characters (e.g. [abc]; [a-z] will match lowercase from a to z; [0-5][0-6] will match two-character strings such as 00, 06 or 56; to match a literal - escape it as \- or put it first or last in the set; special characters lose their meaning inside the set and are simple literals; character classes such as \w are accepted; when the set starts with ^ a complementing match is performed, e.g. [^5] matches all but 5; use \[ and \] to match [ and ] inside the set) |
(..) | match the re wrote in .., this form is called a group |
(?P<name>..) | same as (..) but give a name to the matched substring to recall it in another part of the re |
(?P=name) | match the same text matched earlier by the group (?P<name>..) |
(?#..) | comment that will not be considered |
(?=..) | match if .. matches next but don't consume the string |
(?!..) | same as above but require the not-match |
(?<=..) | similar to above but check if .. matches just before the current position. The .. must have a fixed length (e.g. (?<=abc)def will match the 'def' in 'abcdef') |
(?<!..) | same as above but require not match |
\number | match again the text matched by the number-th group (from \1 to \99, e.g. '(.+) \1' matches 'the the') |
\A | match only the start of the string |
\b | match the empty string only at the beginning or end of a word (e.g. r'\bfoo\b' matches 'foo' but not 'foo3') |
\B | match the empty string only when it is not at the beginning or end of a word (e.g. r'py\B' matches 'python' but not 'py') |
\d | match any decimal digit 0-9 |
\D | match any non-digit, same as [^0-9] |
\s | match any whitespace character |
\S | match all but whitespace character |
\w | match any alphanumeric character or the underscore |
\W | match all but an alphanumeric character or the underscore |
\Z | match only at the end of the string |
In [1]:
import re
In [2]:
# Sample sentence that the regex demonstrations in this notebook search.
phrase = (
    "Oggi piove, ma domani al 50% non dovrebbe piovere, "
    "maaaaaaa"
)
In [19]:
# re.match only tries to match at the beginning of the string (it does not
# require the whole string to match). Use re.search to get the first match
# anywhere in the string, and re.findall to get all of them.
re.match('ma', phrase), re.search('ma', phrase)
Out[19]:
In [3]:
# create the regular expression object for the pattern 'ma*'
# (an 'm' followed by zero or more 'a')
pippo = re.compile(pattern='ma*')
In [4]:
# match only at the beginning of the string; returns a match object, or None if there is no match
print(pippo.match(string=phrase))
In [5]:
# find the first substring that matches the re; returns a match object, or None if no match is found.
# pos and endpos give the positions where to start and to stop the pattern search in the string
mysearch = pippo.search(string=phrase, pos=6, endpos=40)
# slice the searched string with the match boundaries to get the matched text
# (this line raises AttributeError if the search above returned None)
mysearch.string[mysearch.start(): mysearch.end()]
Out[5]:
In [6]:
# To perform case-insensitive matching, pass re.IGNORECASE through the
# `flags` argument of the re functions (or of re.compile).
# re.IGNORECASE is a flag constant of the re module: never assign to it,
# because rebinding it replaces the flag for every later use in the session.
print(re.IGNORECASE)
# With the flag, 'ma*' also matches upper-case and mixed-case occurrences.
print(re.findall(pattern="ma*", string="MA ma Maaa", flags=re.IGNORECASE))
In [7]:
# flag for multiline mode: '^' and '$' also match at the start and end of
# each line (around every \n), not only at the start and end of the string
re.MULTILINE
Out[7]:
In [8]:
# split the string at every occurrence of pattern; if the pattern never
# matches, the result is a one-element list holding the entire string
re.split(pattern=' ', string=phrase)
Out[8]:
In [9]:
# return all non-overlapping matches of pattern in string, as a list of strings
re.findall(pattern="ma*", string=phrase)
Out[9]:
In [10]:
# re.finditer returns an iterator that yields one match object for every
# non-overlapping match of pattern in string.
for found in re.finditer(pattern="ma*", string=phrase):
    print(found)
    # One value per line: end index, endpos/pos search limits, start index,
    # (start, end) span, the searched string, and the matched text itself.
    print(
        found.end(),
        found.endpos,
        found.pos,
        found.start(),
        found.span(),
        found.string,
        found.string[found.start():found.end()],
        sep='\n',
    )
In [11]:
# replace every non-overlapping occurrence of pattern with repl
# (pass count=1 to replace only the first occurrence)
re.sub(pattern="ma*", repl='_MA_', string = phrase)
Out[11]:
In [12]:
# same as re.sub but returns a tuple (new_string, number_of_replacements)
re.subn(pattern='ma*', repl='_MA_',string=phrase)
Out[12]:
In [13]:
# A match object always has a boolean value of True, while a failed search
# returns None, so the result can be tested directly in a condition.
if mysearch:
    print('ok')
In [14]:
# return one or more subgroups of the match; group number 0 returns the entire match,
# if the group has a name you can pass the name of the group instead
mysearch.group(0)
Out[14]:
In [15]:
# return a tuple with all the subgroups of the match
mysearch.groups()
# return a dict of all the named subgroups, keyed by group name
mysearch.groupdict()
Out[15]:
In [16]:
# return the (start, end) span, the start and the end of the given group (0 = entire match)
mysearch.span(0), mysearch.start(0), mysearch.end(0)
Out[16]:
In [17]:
# Pull the first run of decimal digits out of a piece of text.
s = "Your number is <b>123</b>"
m = re.compile(r"\d+").search(s)
# Group 0 is the whole match, i.e. the digits themselves.
print(m.group(0))
In [22]:
# Lowercase words that may contain hyphens or apostrophes inside; a word
# must start and end with a letter, so it is at least three characters long.
regex = re.compile(r'([a-z][a-z-\']+[a-z])')
for sample in (
    "HELLO W-O-R-L-D",          # uppercase only: nothing matches
    "HELLO W-O-R-L-D".lower(),  # same text lowercased
    "123hello456world789",      # digits separate the words
):
    print(regex.findall(sample))
In [25]:
# pattern with two case-sensitive alternatives captured in one group
r = re.compile("Sent from my (iPhone|iPod)")
In [27]:
# prints None: 'Ipad' is neither 'iPhone' nor 'iPod', so the pattern does not match
print(r.match('Sent from my Ipad'))
In [1]:
import pandas as pd
In [ ]:
# reference the DataFrame class itself, without calling it
pd.DataFrame
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: