import re


class ParseError(Exception):
    '''
    Error raised when parsing fails at a known cursor position.

    A clone of the cursor is stored, so the reported location stays fixed
    even if the caller keeps advancing the original cursor afterwards.
    '''

    def __init__(self, input, message):
        # `input` shadows the builtin, but it is part of the public signature.
        # Passing both values up keeps `args` exactly as it was before.
        super().__init__(input, message)
        self.input = input.clone()  # snapshot of the cursor at the point of the error
        self.message = message

    def __str__(self):
        return f'{self.message} {self.input.loc()}'


class Cursor:
    '''
    A position within a text, with regex-based matching helpers.

    Attributes:
        text: the full text being scanned
        pos: current text position
        start: last match start position before whitespace skipping
        skip: last match start position after whitespace skipping
        end: last match end position
        m: match object from the most recent `match` call (None if it failed
           or if `match` has not been called yet)
        skip_space: default for whether `match` skips leading whitespace
        consume: default for whether `match` advances `pos` on success
        space: default regex describing skippable whitespace
    '''

    def __init__(self, text, skip_space=False, consume=True, space=r'\s+'):
        self.text = text
        self.pos = 0
        self.start = 0
        self.skip = 0
        self.end = 0
        self.m = None
        self.skip_space = skip_space
        self.consume = consume
        self.space = space

    def loc(self):
        '''
        Describe the cursor position in a human-readable form, suitable for
        error messages.

        Returns a string holding the 1-based line number, the offset of the
        cursor within that line, the repr of the line, the line itself, and a
        caret line pointing at the cursor position.
        '''
        pos = self.pos
        text = self.text

        # locate the line in which the current position is located
        line_start = 0
        line_id = 0
        while True:
            line_end = text.find('\n', line_start)
            if line_end == -1:
                # Final line (no newline follows). If pos is somehow past the
                # end of the string, it is treated as being in this line.
                line_end = len(text)
                break
            # the newline itself counts as part of the line it terminates
            if line_start <= pos <= line_end:
                break
            line_start = line_end + 1  # skip newline
            line_id += 1

        line_number = line_id + 1
        # offset of the cursor *within* the line, not the offset of the line
        line_offset = pos - line_start
        line_text = text[line_start:line_end]  # excludes newline
        # keep tabs so the caret lines up under the same column as the text
        caret_spacing = re.sub(r'[^\t]', ' ', line_text[:line_offset])
        return (
            f'at line {line_number}, offset {line_offset}, '
            f'line string {repr(line_text)}\n{line_text}\n{caret_spacing}^\n'
        )

    def clone(self):
        '''
        Return an independent copy of this cursor.

        Strings are immutable, so the text is shared rather than copied.
        The matching defaults are carried over so the clone behaves like
        the original cursor.
        '''
        clone = Cursor(
            self.text,
            skip_space=self.skip_space,
            consume=self.consume,
            space=self.space,
        )
        # pos is the main purpose of the clone
        clone.pos = self.pos
        # the rest is cloned for completeness
        clone.start = self.start
        clone.skip = self.skip
        clone.end = self.end
        clone.m = self.m
        return clone

    def string_match(self, string):
        '''
        Check for an exact match between the provided string and the input.

        Note that it's a string, not a regex. Every character is literal.
        And it returns a bool, not a match object. On success the cursor is
        advanced past the matched text; whitespace is never skipped.
        '''
        pos = self.pos
        self.start = pos
        self.skip = pos
        self.end = pos
        if not self.text.startswith(string, pos):
            return False
        pos += len(string)
        self.pos = pos
        self.end = pos
        return True

    def match(self, regex, skip_space=None, consume=None, space=None):
        r'''
        Check if a regex matches at the cursor position.
        Given a match, update the cursor to consume the matched text
        (by default).

        Arguments left as None fall back to the cursor's own settings.
        The match object (or None) is returned and also stored in `self.m`.

        Typical usage:

            if input.match(r'(\d+)'):
                # handle numbers
                value = int(input.m.group(1))
                # ...
            elif input.match(r'"'):
                # handle double-quoted strings
                # ...
            elif input.match(r'for'):
                # "for" loop
                # ...
            elif input.match(r'\s*$'):
                # end of input
                break
            else:
                raise
        '''
        if skip_space is None:
            skip_space = self.skip_space
        if consume is None:
            consume = self.consume
        if space is None:
            space = self.space

        pos = self.pos
        self.start = pos
        self.skip = pos
        self.end = pos

        if skip_space:
            # the re module caches compiled patterns, so compiling here is cheap
            space_match = re.compile(space, re.DOTALL).match(self.text, pos=pos)
            if space_match:
                pos = space_match.end()
                self.skip = pos

        match = re.compile(regex, re.DOTALL).match(self.text, pos=pos)
        self.m = match
        if match:
            pos = match.end()
            self.end = pos
            if consume:
                self.pos = pos
        return match

    def match_exact(self, regex, skip_space=False, consume=True):
        '''
        Check if a regex matches exactly at the cursor position.

        Consumes the matched text (by default).
        Does not skip initial whitespace (by default).
        '''
        return self.match(regex, skip_space=skip_space, consume=consume)

    def check(self, regex, skip_space=None, consume=False):
        '''
        Check if a regex matches at the cursor position.

        Does not consume the matched text (by default).
        Whitespace skipping follows the cursor's setting (by default).
        Another suitable name for this would have been "lookahead".
        '''
        return self.match(regex, skip_space=skip_space, consume=consume)