Finding Patterns in Text



In [1]:

    
# Import dependencies
import os
import re



In [2]:

    
# Sample sentence (letters, spaces, digits and a trailing period) used by the
# re.match() examples that follow
sent = 'Hello World 2018.'

re.match()



In [3]:

    
# '.' is any character except a newline and '*' repeats it zero or more
# times, so the greedy match spans the entire one-line sentence.
re.match(pattern=r'.*', string=sent)









    Out[3]:





<_sre.SRE_Match object; span=(0, 17), match='Hello World 2018.'>



In [4]:

    
# '+' demands at least one repetition of '.', so an empty string would not
# match; on a non-empty one-line sentence the greedy match is the whole line.
re.match(pattern=r'.+', string=sent)









    Out[4]:





<_sre.SRE_Match object; span=(0, 17), match='Hello World 2018.'>



In [5]:

    
# [a-zA-Z]+ accepts ASCII letters only (no digits, spaces or punctuation).
# Matching stops at the first non-letter, so only the leading word is returned.
re.match(pattern=r'[a-zA-Z]+', string=sent)









    Out[5]:





<_sre.SRE_Match object; span=(0, 5), match='Hello'>



In [6]:

    
# 'b?' makes the 'b' optional (zero or one occurrence), so on "abb" the
# pattern consumes "ab" and leaves the second 'b' unmatched.
s = "abb"
re.match(pattern=r'ab?', string=s)









    Out[6]:





<_sre.SRE_Match object; span=(0, 2), match='ab'>

re.search()



In [7]:

    
# re.match() only tries the pattern at the very start of the string, whereas
# re.search() scans forward for the first position where it matches.
# This sentence starts with digits, so the letters-only match returns None.
sent = '1992 was the year I was born.'
re.match(pattern=r'[a-zA-Z]+', string=sent)



In [8]:

    
# search() is not anchored to the start of the string: it returns the first
# run of ASCII letters found anywhere in the sentence.
re.search(pattern=r'[a-zA-Z]+', string=sent)









    Out[8]:





<_sre.SRE_Match object; span=(5, 8), match='was'>



In [9]:

    
# Does the sentence begin with '1992'? ('^' anchors the pattern at the start.)
print('matched' if re.match(r'^1992', sent) else 'not matched')









    



matched



In [10]:

    
# Does the sentence begin with 'I'? ('^' anchors the pattern at the start.)
print('matched' if re.match(r'^I', sent) else 'not matched')









    



not matched



In [11]:

    
# Check whether the sentence ends with the literal text 'born.'
# format: 'string$' -- '$' anchors the pattern at the end of the string.
# The dot is escaped because a bare '.' matches any character, which would
# also accept endings such as 'born!' or 'borne'.
if re.search(r'born\.$', sent):
    print('matched')
else:
    print('not matched')









    



matched



In [12]:

    
# Does the sentence end with 'year'? ('$' anchors the pattern at the end.)
print('matched' if re.search(r'year$', sent) else 'not matched')









    



not matched

Substituting Patterns in Text



In [13]:

    
# Sentence used as input by the re.sub() substitution examples
sent = 'I like the movie Armageddon'



In [14]:

    
# re.sub() scans the whole string and replaces every occurrence of the
# pattern (here a plain word) with the replacement text.
print(re.sub(pattern='Armageddon', repl='Top Gun', string=sent))









    



I like the movie Top Gun



In [15]:

    
# Replace every ASCII letter (either case) with the digit character '9'
print(re.sub(pattern=r'[a-zA-Z]', repl='9', string=sent))









    



9 9999 999 99999 9999999999



In [16]:

    
# Replace every letter with '9' using a lowercase-only class: the
# re.IGNORECASE flag (short name re.I) makes [a-z] match uppercase too.
print(re.sub(pattern=r'[a-z]', repl='9', string=sent, flags=re.IGNORECASE))









    



9 9999 999 99999 9999999999



In [17]:

    
# Replace only the first letter with '9':
#   flags=re.IGNORECASE (re.I) -> [a-z] also matches uppercase letters
#   count=1                    -> stop after one replacement
print(re.sub(pattern=r'[a-z]', repl='9', string=sent, count=1, flags=re.IGNORECASE))









    



9 like the movie Armageddon



In [18]:

    
# Replace only the first five letters with '9':
#   flags=re.IGNORECASE (re.I) -> [a-z] also matches uppercase letters
#   count=5                    -> stop after five replacements
print(re.sub(pattern=r'[a-z]', repl='9', string=sent, count=5, flags=re.IGNORECASE))









    



9 9999 the movie Armageddon

Shorthand Character Class



In [19]:

    
# Sample inputs: one containing digits, one with long runs of spaces,
# and one with assorted punctuation, an apostrophe and a hashtag.
sent1 = 'Hello World 2018'
sent2 = 'I               love         you         :D'
sent3 = "Just ~% +++---------- arrived at Jim's place. #having_fun"



In [20]:

    
# \d matches a single digit: strip every digit from the sentence
sent1_mod = re.sub(r'\d','',sent1)
print(sent1_mod)

# \s+ matches a run of one or more whitespace characters: collapse each run to a single space
sent2_mod = re.sub(r'\s+',' ',sent2)
print(sent2_mod)

# Replace the word 'love' together with the whitespace on both sides of it
sent2_mod = re.sub(r'\s+love\s+',' hate ',sent2)
print(sent2_mod)

# Remove the listed punctuation characters. The hyphen is escaped so it is
# taken literally: an unescaped '+-\.' inside [...] is the range '+' to '.',
# which also contains ',' and would silently delete commas.
sent3_mod = re.sub(r'[~%+\-.#]','',sent3)
print(sent3_mod)

# \w matches a word character (letter, digit or underscore): blank them out
sent3_mod = re.sub(r'\w',' ',sent3)
print(sent3_mod)

# \W is the complement of \w: blank out everything that is NOT a word character
sent3_mod = re.sub(r'\W',' ',sent3)
print(sent3_mod)

# Collapse the runs of spaces left behind by the previous substitution
sent3_mod = re.sub(r'\s+',' ',sent3_mod)
print(sent3_mod)

# Drop single letters that have whitespace on both sides
# (e.g. the stray 's' left over from "Jim's")
sent3_mod = re.sub(r'\s+[a-zA-Z]\s+',' ',sent3_mod)
print(sent3_mod)









    



Hello World 
I love you :D
I hate you         :D
Just   arrived at Jim's place having_fun
     ~% +++----------               '       . #          
Just                  arrived at Jim s place   having_fun
Just arrived at Jim s place having_fun
Just arrived at Jim place having_fun

Preprocessing using Regex



In [21]:

    
# Raw sample sentences to clean: they contain hashtags, a hyphen, a comma,
# a leading number and long runs of spaces.
X = ['This is a wolf #Scary',
    'Welcome to the jungle #missing-people',
    '11334532 the number to remember',
    'Remember the name Bond, James Bond',
    'I            love             you']



In [22]:

    
# Clean every sentence of X in place; the passes run in the order shown.
for i, text in enumerate(X):
    # Punctuation and other non-word characters become spaces
    text = re.sub(r'\W', ' ', text)
    # Digits become spaces as well
    text = re.sub(r'\d', ' ', text)
    # Drop single letters (either case) that have whitespace on both sides
    text = re.sub(r'\s+[a-z]\s+', ' ', text, flags=re.I)
    # Squeeze each whitespace run down to a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip whitespace from the start of the sentence
    text = re.sub(r'^\s+', '', text)
    # Strip the one whitespace character that may remain at the end
    text = re.sub(r'\s$', '', text)
    X[i] = text



In [23]:

    
# Display the sentences after the cleaning loop has rewritten them in place
print(X)









    



['This is wolf Scary', 'Welcome to the jungle missing people', 'the number to remember', 'Remember the name Bond James Bond', 'I love you']