In [1]:
import re
In [63]:
class StringScanner:
"""
A scanner very similar to Ruby's StringScanner.
It is mainly designed to make lexing easy, but I'm sure there
are loads of other useful applications too.
"""
def __init__(self, text, position=0):
self.text = text
self.pos = position
self.match = None
def search(self, pattern, advance_pointer=True,
return_string=True, from_pointer=True):
"""
"""
search_func = re.match if from_pointer else re.search
pattern = re.compile(pattern)
match = search_func(pattern, self.text[self.pos:])
# Set the match register using whatever we found
if match:
index = match.span()[0] + len(match.group())
self.match = self.text[self.pos:self.pos + index]
else:
self.match = None
# Advance the pointer if necessary
if advance_pointer:
self.pos += len(self.match) if self.match else 0
# And finally return something. Either the match itself
# or the number of characters found
if return_string:
return self.match
else:
return len(self.match) if self.match else 0
def check(self, pattern):
"""
This will check the string for a pattern, returning
the matched string or None if it wasn't found.
`check()` doesn't advance the scan pointer, but the
match register is still affected.
"""
return self.search(pattern, advance_pointer=false)
def scan(self, pattern):
"""
Checks for a match, using the same arguments as `check()`.
If a match is found, increment the scanner pointer accordingly
and return the matched string.
Failed matches return None instead of raising an error.
"""
return self.search(pattern)
def skip(self, pattern):
"""
Skip the specified pattern, returning the number of
characters skipped and setting the match register.
"""
return self.search(pattern, return_string=False)
def unscan(self):
"""
Using whatever is in the match register, revert to the
previous scanner state.
Note
----
You can only go back one step.
"""
self.pos -= len(self.match) if self.match else 0
self.match = None
def getch(self):
"""
Get the next character from the string.
"""
character = self.current_char
self.pos += 1
return character
def append(self, value):
"""
Append the string to the scanner's text.
"""
self.text += value
def peek(self, n=1):
"""
Get the next n characters.
"""
return self.text[self.pos: self.pos+n]
def __getitem__(self, value):
"""
Get a particular character or substring from the underlying
text.
"""
return self.text[value]
@property
def current_char(self):
"""
Get the current string. If we are at the end of the
text, then return None.
"""
if self.end_of_string:
return None
else:
return self.text[self.pos]
@property
def end_of_string(self):
"""
Check whether the scanner is at the end of the string.
"""
return self.pos == len(self.text)
def rest(self):
"""
Returns the "rest" of the string. (i.e. everything between the
scanner pointer and the end of the string)
"""
return self.text[self.pos:]
def __repr__(self):
max_chars = 30
return '<{}: position={} text="{}">'.format(
self.__class__.__name__,
self.pos,
self.text[:max_chars] + '...' if len(self.text) > max_chars else self.text)
In [65]:
some_string = 'Hello, my name is Michael.'
scanner = StringScanner(some_string)
print(scanner)
print(scanner.search('\w+'))
print(scanner.skip('[,\s]+'))
print(scanner.search('\w+'))
In [ ]: