Finding Patterns in Text


In [1]:
# Import dependencies
import os
import re

In [2]:
# Sample sentence used by the re.match() examples
sent = "Hello World 2018."

re.match()


In [3]:
# '.*' stands for a run of zero or more characters,
# so the match covers the sentence from its first character onwards
re.match(pattern=r'.*', string=sent)


Out[3]:
<_sre.SRE_Match object; span=(0, 17), match='Hello World 2018.'>

In [4]:
# '.+' stands for a run of one or more characters (at least one is required)
re.compile(r'.+').match(sent)


Out[4]:
<_sre.SRE_Match object; span=(0, 17), match='Hello World 2018.'>

In [5]:
# Match letters only (no digits, no spaces).
# The match stops at the first space because a space is not in [a-zA-Z],
# so only the first word is returned.
re.match(pattern=r'[a-zA-Z]+', string=sent)


Out[5]:
<_sre.SRE_Match object; span=(0, 5), match='Hello'>

In [6]:
# 'ab?' means: an "a" followed by zero or exactly one "b".
# Only the first "b" of "abb" can be consumed by the pattern.
s = 'abb'
re.compile(r'ab?').match(s)


Out[6]:
<_sre.SRE_Match object; span=(0, 2), match='ab'>

re.search()


In [7]:
# re.match() only tries the pattern at the very start of the string, whereas
# re.search() scans the whole string for the first position where it matches.
# This sentence starts with digits, so matching letters at the start finds nothing.
sent = "1992 was the year I was born."
re.match(pattern=r'[a-zA-Z]+', string=sent)

In [8]:
# re.search() scans the string and returns the first run of letters it finds
re.compile(r'[a-zA-Z]+').search(sent)


Out[8]:
<_sre.SRE_Match object; span=(5, 8), match='was'>

In [9]:
# Does the sentence begin with "1992"?  ('^' anchors the pattern at the start)
print('matched' if re.match(r'^1992', sent) else 'not matched')


matched

In [10]:
# Does the sentence begin with "I"?  ('^' anchors the pattern at the start)
# re.match() returns None when the pattern does not match.
if re.match(r'^I', sent) is None:
    print('not matched')
else:
    print('matched')


not matched

In [11]:
# Check if sentence ends with a specific pattern
# format: 'string$'
# The period is escaped ('\.') so that it only matches a literal '.'.
# An unescaped '.' matches any character, so 'born.$' would also accept
# endings such as 'born!' or 'borns'.
if re.search(r'born\.$', sent):
    print('matched')
else:
    print('not matched')


matched

In [12]:
# Does the sentence end with "year"?  ('$' anchors the pattern at the end)
# format: 'string$'
print('matched' if re.search(r'year$', sent) else 'not matched')


not matched

Substituting Patterns in Text


In [13]:
# Sample sentence used by the re.sub() examples
sent = "I like the movie Armageddon"

In [14]:
# Replace one word with another.
# re.sub() scans the whole string and replaces every occurrence of the pattern.
print(re.sub(pattern='Armageddon', repl='Top Gun', string=sent))


I like the movie Top Gun

In [15]:
# Replace every letter (upper or lower case) with the digit 9
print(re.compile(r'[a-zA-Z]').sub('9', sent))


9 9999 999 99999 9999999999

In [16]:
# Replace every letter with the digit 9.
# re.IGNORECASE (short form: re.I) makes '[a-z]' match upper-case letters too.
print(re.sub(r'[a-z]', '9', sent, flags=re.IGNORECASE))


9 9999 999 99999 9999999999

In [17]:
# Replace letters with the digit 9, but only the first one.
# 're.I': case-insensitive matching
# 'count': maximum number of replacements to perform
print(re.compile(r'[a-z]', flags=re.I).sub('9', sent, count=1))


9 like the movie Armageddon

In [18]:
# Replace letters with the digit 9, but only the first five.
# 're.IGNORECASE': case-insensitive matching (same flag as re.I)
# 'count': maximum number of replacements to perform
print(re.sub(pattern=r'[a-z]', repl='9', string=sent, count=5, flags=re.IGNORECASE))


9 9999 the movie Armageddon

Shorthand Character Class


In [19]:
# Sample sentences for the shorthand character classes (\d, \s, \w, \W):
# one with digits, one with long runs of spaces, one with punctuation noise
sent1 = "Hello World 2018"
sent2 = "I               love         you         :D"
sent3 = "Just ~% +++---------- arrived at Jim's place. #having_fun"

In [20]:
# \d: digit class - remove all digits from the sentence
sent1_mod = re.sub(r'\d','',sent1)
print(sent1_mod)

# \s+ : replace each sequence of one or more whitespace characters with a single space
sent2_mod = re.sub(r'\s+',' ',sent2)
print(sent2_mod)

# Replace the word "love", together with the whitespace around it, by " hate "
sent2_mod = re.sub(r'\s+love\s+',' hate ',sent2)
print(sent2_mod)

# If the sentence has any of these characters, remove them from the string.
# The hyphen is escaped so it is a literal '-': written as '+-\.' it would be
# read as the range from '+' to '.', which also contains ',' and would
# silently strip commas as well.
sent3_mod = re.sub(r'[~%+\-.#]','',sent3)
print(sent3_mod)

# \w: word class (letters, digits, underscore) - blank out every word character
sent3_mod = re.sub(r'\w',' ',sent3)
print(sent3_mod)

# \W: non-word class - blank out every character that is NOT a word character
sent3_mod = re.sub(r'\W',' ',sent3)
print(sent3_mod)

# Collapse the runs of spaces left behind by the previous substitution
sent3_mod = re.sub(r'\s+',' ',sent3_mod)
print(sent3_mod)

# Remove stand-alone single letters (e.g. the "s" left over from "Jim's").
# The trailing whitespace is only checked with a lookahead, not consumed, so
# it is still available as the leading whitespace of the next match; this way
# two adjacent single letters ("a b") are both removed.
sent3_mod = re.sub(r'\s+[a-zA-Z](?=\s)','',sent3_mod)
print(sent3_mod)


Hello World 
I love you :D
I hate you         :D
Just   arrived at Jim's place having_fun
     ~% +++----------               '       . #          
Just                  arrived at Jim s place   having_fun
Just arrived at Jim s place having_fun
Just arrived at Jim place having_fun

Preprocessing using Regex


In [21]:
# Raw sample sentences to be cleaned: hashtags, punctuation, digits,
# single-letter words and irregular spacing
X = [
    "This is a wolf #Scary",
    "Welcome to the jungle #missing-people",
    "11334532 the number to remember",
    "Remember the name Bond, James Bond",
    "I            love             you",
]

In [22]:
# Clean every sentence of X in place, one regex substitution per step.
for i, text in enumerate(X):
    # Replace non-word characters (anything that is not a letter, digit or underscore) with a space
    text = re.sub(r'\W', ' ', text)
    # Replace all digits with a space
    text = re.sub(r'\d', ' ', text)
    # Remove stand-alone single letters. The trailing whitespace is only checked
    # with a lookahead, not consumed, so adjacent single letters ("a b") are all
    # removed. A single letter at the very start of the string has no whitespace
    # before it and is therefore kept.
    text = re.sub(r'\s+[a-z](?=\s)', '', text, flags=re.I)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove whitespace at start of sentence
    text = re.sub(r'^\s+', '', text)
    # Remove whitespace at end of sentence
    text = re.sub(r'\s+$', '', text)
    X[i] = text

In [23]:
# Show the list of sentences after preprocessing
print(X)


['This is wolf Scary', 'Welcome to the jungle missing people', 'the number to remember', 'Remember the name Bond James Bond', 'I love you']