In [1]:
# Import dependencies
import os
import re
In [2]:
# Sample sentence used by the matching examples
sent = "Hello World 2018."
In [3]:
# ".*" matches a run of zero or more characters (any character except a newline),
# starting at the beginning of the sentence
re.compile(r'.*').match(sent)
Out[3]:
In [4]:
# ".+" matches a run of one or more characters (any character except a newline),
# starting at the beginning of the sentence
re.compile(r'.+').match(sent)
Out[4]:
In [5]:
# "[a-zA-Z]+" matches a run of ASCII letters only (no digits, no whitespace).
# The match therefore ends at the first non-letter character, e.g. the space
# that follows the first word of the sentence.
re.compile(r'[a-zA-Z]+').match(sent)
Out[5]:
In [6]:
# "ab?" matches an "a" followed by zero or exactly one "b"
s = 'abb'
re.compile(r'ab?').match(s)
Out[6]:
In [7]:
# re.match only tries the pattern at the very start of the string, whereas
# re.search scans the whole string for the first position where it matches.
# This sentence starts with digits, so matching letters at the start gives None.
sent = "1992 was the year I was born."
re.compile(r'[a-zA-Z]+').match(sent)
In [8]:
# re.search scans through the string and returns the first run of letters it finds
re.compile(r'[a-zA-Z]+').search(sent)
Out[8]:
In [9]:
# Report whether the sentence begins with "1992" ('^' anchors at the start)
print('matched' if re.match(r'^1992', sent) else 'not matched')
In [10]:
# Report whether the sentence begins with "I" ('^' anchors at the start)
print('matched' if re.match(r'^I', sent) else 'not matched')
In [11]:
# Check if sentence ends with a specific pattern
# format: 'string$'
# The dot is escaped so that it matches a literal full stop; an unescaped '.'
# would accept any final character (e.g. "borne").
if re.search(r'born\.$', sent):
    print('matched')
else:
    print('not matched')
In [12]:
# Report whether the sentence ends with "year" ('$' anchors at the end)
print('matched' if re.search(r'year$', sent) else 'not matched')
In [13]:
# Sample sentence for the substitution examples
sent = "I like the movie Armageddon"
In [14]:
# Replace a word in the string.
# re.sub looks through the whole string and replaces every occurrence it finds.
print(re.sub(pattern='Armageddon', repl='Top Gun', string=sent))
In [15]:
# Replace every ASCII letter (both cases listed explicitly) with the digit 9
print(re.compile(r'[a-zA-Z]').sub('9', sent))
In [16]:
# Replace every letter with the digit 9.
# re.IGNORECASE (short form: re.I) makes "[a-z]" match upper-case letters as well.
print(re.compile(r'[a-z]', re.IGNORECASE).sub('9', sent))
In [17]:
# Replace only the first letter with the digit 9.
# re.IGNORECASE (short form: re.I): match letters regardless of case.
# count: maximum number of occurrences to replace (here just one).
print(re.compile(r'[a-z]', re.IGNORECASE).sub('9', sent, count=1))
In [18]:
# Replace only the first five letters with the digit 9.
# re.IGNORECASE (short form: re.I): match letters regardless of case.
# count: maximum number of occurrences to replace (here five).
print(re.compile(r'[a-z]', re.IGNORECASE).sub('9', sent, count=5))
In [19]:
# Three sample sentences for the clean-up examples
sent1, sent2, sent3 = (
    "Hello World 2018",
    "I love you :D",
    "Just ~% +++---------- arrived at Jim's place. #having_fun",
)
In [20]:
# Remove all digits from the sentence
sent1_mod = re.sub(r'\d', '', sent1)
print(sent1_mod)
# \s+ : replace each run of one or more whitespace characters with a single space
sent2_mod = re.sub(r'\s+', ' ', sent2)
print(sent2_mod)
# Swap the word "love" (together with its surrounding whitespace) for " hate "
sent2_mod = re.sub(r'\s+love\s+', ' hate ', sent2)
print(sent2_mod)
# Remove any of the characters ~ % + - . # from the string.
# The hyphen is escaped so it is a literal '-': written as '+-\.' it would
# form the range '+'..'.', which also matches ','.
sent3_mod = re.sub(r'[~%+\-.#]', '', sent3)
print(sent3_mod)
# \w : word characters (letters, digits, underscore) -> blank out the words
sent3_mod = re.sub(r'\w', ' ', sent3)
print(sent3_mod)
# \W : non-word characters -> blank out punctuation and symbols instead
sent3_mod = re.sub(r'\W', ' ', sent3)
print(sent3_mod)
# Collapse the resulting runs of whitespace into single spaces
sent3_mod = re.sub(r'\s+', ' ', sent3_mod)
print(sent3_mod)
# Drop single letters that stand between whitespace (e.g. the "s" left over from "Jim's")
sent3_mod = re.sub(r'\s+[a-zA-Z]\s+', ' ', sent3_mod)
print(sent3_mod)
In [21]:
# Small corpus of raw sentences to be cleaned
X = [
    "This is a wolf #Scary",
    "Welcome to the jungle #missing-people",
    "11334532 the number to remember",
    "Remember the name Bond, James Bond",
    "I love you",
]
In [22]:
def _clean_sentence(text):
    """Return `text` reduced to multi-letter words separated by single spaces.

    Non-word characters and digits are blanked out, stand-alone single letters
    are removed, whitespace runs are collapsed and the ends are trimmed.
    """
    # Replace every non-word character (punctuation, symbols) with a space
    text = re.sub(r'\W', ' ', text)
    # Replace every digit with a space
    text = re.sub(r'\d', ' ', text)
    # Remove single characters. Word boundaries (\b) are used instead of
    # surrounding \s+ so that a single letter at the very start or end of the
    # sentence, or several single letters in a row, are removed as well.
    text = re.sub(r'\b[a-z]\b', ' ', text, flags=re.I)
    # Collapse runs of whitespace, then drop the space left at either end
    return re.sub(r'\s+', ' ', text).strip()


# Clean every sentence, updating the existing list object in place
X[:] = [_clean_sentence(sentence) for sentence in X]
In [23]:
# Print the list of sentences held in X after the clean-up loop
print(X)