# tokenizer.py
#
# Copyright 2021 James Westman
#
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation; either version 3 of the
# License, or (at your option) any later version.
#
# This file is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# SPDX-License-Identifier: LGPL-3.0-or-later

import re
from enum import Enum

from .errors import TokenizeError


class TokenType(Enum):
    EOF = 0
    DIRECTIVE = 1
    IDENT = 2
    QUOTED = 3
    NUMBER = 4
    OPEN_PAREN = 5
    CLOSE_PAREN = 6
    OPEN_BLOCK = 7
    CLOSE_BLOCK = 8
    STMT_END = 9
    OP = 10
    WHITESPACE = 11
    COMMENT = 12
    OPEN_BRACKET = 13
    CLOSE_BRACKET = 14
    COMMA = 15


# Token patterns, tried in order at each position of the input. The hex
# pattern is listed before the decimal pattern so that "0x..." literals are
# matched as a single NUMBER token rather than a "0" followed by an IDENT.
_TOKENS = [
    (TokenType.DIRECTIVE, r"@[\d\w\-_]+"),
    (TokenType.IDENT, r"[A-Za-z_][\d\w\-_]*"),
    # "*" rather than "+" so that empty string literals ("" and '') tokenize
    (TokenType.QUOTED, r'"(\\"|[^"\n])*"'),
    (TokenType.QUOTED, r"'(\\'|[^'\n])*'"),
    (TokenType.NUMBER, r"0x[A-Fa-f0-9]+"),
    (TokenType.NUMBER, r"[-+]?[\d_]+(\.[\d_]+)?"),
    (TokenType.OPEN_PAREN, r"\("),
    (TokenType.CLOSE_PAREN, r"\)"),
    (TokenType.OPEN_BLOCK, r"\{"),
    (TokenType.CLOSE_BLOCK, r"\}"),
    (TokenType.STMT_END, r";"),
    (TokenType.OP, r"[:=\.\|<>]+"),
    (TokenType.WHITESPACE, r"\s+"),
    # [\s\S] lets block comments span multiple lines, since "." does not
    # match newlines without re.DOTALL
    (TokenType.COMMENT, r"\/\*[\s\S]*?\*\/"),
    (TokenType.COMMENT, r"\/\/[^\n]*"),
    (TokenType.OPEN_BRACKET, r"\["),
    (TokenType.CLOSE_BRACKET, r"\]"),
    (TokenType.COMMA, r"\,"),
]
_TOKENS = [(type, re.compile(regex)) for (type, regex) in _TOKENS]


class Token:
    def __init__(self, type, start, end, string):
        self.type = type
        self.start = start
        self.end = end
        self.string = string

    def __str__(self):
        return self.string[self.start:self.end]

    def is_directive(self, directive) -> bool:
        if self.type != TokenType.DIRECTIVE:
            return False
        return str(self) == "@" + directive

    def get_number(self):
        if self.type != TokenType.NUMBER:
            return None

        string = str(self)
        if string.startswith("0x"):
            return int(string, 16)
        else:
            return float(string)


def _tokenize(ui_ml: str):
    # Try each pattern at the current offset and emit a token for the first
    # one that matches. Unrecognized input raises a TokenizeError carrying
    # the offending offset.
    i = 0
    while i < len(ui_ml):
        matched = False
        for (type, regex) in _TOKENS:
            match = regex.match(ui_ml, i)
            if match is not None:
                yield Token(type, match.start(), match.end(), ui_ml)
                i = match.end()
                matched = True
                break

        if not matched:
            raise TokenizeError(i)

    yield Token(TokenType.EOF, i, i, ui_ml)


def tokenize(data: str) -> list[Token]:
    return list(_tokenize(data))
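
# A minimal usage sketch (an addition, not part of the upstream module): run
# a small blueprint-style snippet through tokenize() and print the tokens.
# Because of the relative import above, this must be invoked as a module
# (python -m <package>.tokenizer), not as a standalone script; the sample
# input below is illustrative only.
if __name__ == "__main__":
    sample = 'Label { label: "Hello"; margin: 0x10; }'
    for token in tokenize(sample):
        if token.type not in (TokenType.WHITESPACE, TokenType.EOF):
            print(token.type.name, repr(str(token)))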