from smnp.error.syntax import SyntaxException
from smnp.token.model import TokenList
from smnp.token.tokenizers.assign import tokenizeAssign
from smnp.token.tokenizers.asterisk import tokenizeAsterisk
from smnp.token.tokenizers.bracket import tokenizeOpenBracket, tokenizeCloseBracket
from smnp.token.tokenizers.comma import tokenizeComma
from smnp.token.tokenizers.comment import tokenizeComment
from smnp.token.tokenizers.dot import tokenizeDot
from smnp.token.tokenizers.identifier import tokenizeIdentifier
from smnp.token.tokenizers.integer import tokenizeInteger
from smnp.token.tokenizers.keyword import tokenizeType, tokenizeFunction, tokenizeReturn, tokenizeExtend, \
    tokenizeImport, tokenizeFrom, tokenizeAs
from smnp.token.tokenizers.minus import tokenizeMinus
from smnp.token.tokenizers.note import tokenizeNote
from smnp.token.tokenizers.paren import tokenizeOpenParen, tokenizeCloseParen
from smnp.token.tokenizers.percent import tokenizePercent
from smnp.token.tokenizers.square import tokenizeOpenSquare, tokenizeCloseSquare
from smnp.token.tokenizers.string import tokenizeString
from smnp.token.tokenizers.whitespace import tokenizeWhitespaces
from smnp.token.type import TokenType

# TODO !!!
# Enable the tokenizer to detect token separators:
# for example, "notes" is currently tokenized to [TYPE(note), IDENTIFIER(s)]
# instead of IDENTIFIER(notes)
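
# Tokenizers are tried in the order listed below; the first one that
# consumes characters at the current position wins (see combinedTokenizer)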
tokenizers = (
    tokenizeOpenParen,
    tokenizeCloseParen,
    tokenizeOpenSquare,
    tokenizeCloseSquare,
    tokenizeAsterisk,
    tokenizeType,
    tokenizeString,
    tokenizeFunction,
    tokenizeReturn,
    tokenizeExtend,
    tokenizeImport,
    tokenizeFrom,
    tokenizeAs,
    tokenizeInteger,
    tokenizeNote,
    tokenizeIdentifier,
    tokenizeComma,
    tokenizeOpenBracket,
    tokenizeCloseBracket,
    tokenizeAssign,
    tokenizePercent,
    tokenizeMinus,
    tokenizeDot,
    tokenizeComment,
    tokenizeWhitespaces,
)
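
# Filters applied to the token stream: drop tokens with no type and comment tokens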
filters = [
    lambda token: token.type is not None,
    lambda token: token.type != TokenType.COMMENT
]


def tokenize(lines):
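    """
    Convert an iterable of source lines into a TokenList.

    Each line is scanned left to right; at every position the combined
    tokenizer is asked to consume characters. If no tokenizer consumes
    anything, a SyntaxException is raised for the offending symbol.
    """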
    tokens = []
    for lineNumber, line in enumerate(lines):
        current = 0
        while current < len(line):
            consumedChars, token = combinedTokenizer(line, current, lineNumber)

            if consumedChars == 0:
                raise SyntaxException(f"Unknown symbol '{line[current]}'", (lineNumber, current))

            current += consumedChars
            tokens.append(token)

    return TokenList(filterTokens(filters, tokens))


def combinedTokenizer(line, current, lineNumber):
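    """
    Try each tokenizer in order and return the first (consumedChars, token)
    result that consumed at least one character, or (0, None) if none matched.
    """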
    for tokenizer in tokenizers:
        consumedChars, token = tokenizer(line, current, lineNumber)
        if consumedChars > 0:
            return (consumedChars, token)
    return (0, None)


def filterTokens(filters, tokens):
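    """
    Recursively apply each filter predicate to the token stream,
    keeping only the tokens that satisfy every filter.
    """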
    if not filters:
        return tokens

    return list(filterTokens(filters[1:], (token for token in tokens if filters[0](token))))


__all__ = ["tokenize"]