Refactor tokenizer
@@ -1,62 +1,57 @@
 from smnp.error.syntax import SyntaxException
 from smnp.token.model import TokenList
-from smnp.token.tokenizers.assign import tokenizeAssign
-from smnp.token.tokenizers.asterisk import tokenizeAsterisk
-from smnp.token.tokenizers.bracket import tokenizeOpenBracket, tokenizeCloseBracket
-from smnp.token.tokenizers.comma import tokenizeComma
-from smnp.token.tokenizers.comment import tokenizeComment
-from smnp.token.tokenizers.dot import tokenizeDot
-from smnp.token.tokenizers.identifier import tokenizeIdentifier
-from smnp.token.tokenizers.integer import tokenizeInteger
-from smnp.token.tokenizers.keyword import tokenizeType, tokenizeFunction, tokenizeReturn, tokenizeExtend, \
-    tokenizeImport, tokenizeFrom, tokenizeAs
-from smnp.token.tokenizers.minus import tokenizeMinus
-from smnp.token.tokenizers.note import tokenizeNote
-from smnp.token.tokenizers.paren import tokenizeOpenParen, tokenizeCloseParen
-from smnp.token.tokenizers.percent import tokenizePercent
-from smnp.token.tokenizers.square import tokenizeOpenSquare, tokenizeCloseSquare
-from smnp.token.tokenizers.string import tokenizeString
-from smnp.token.tokenizers.whitespace import tokenizeWhitespaces
+from smnp.token.tokenizers.comment import commentTokenizer
+from smnp.token.tokenizers.identifier import identifierTokenizer
+from smnp.token.tokenizers.keyword import typeTokenizer
+from smnp.token.tokenizers.note import noteTokenizer
+from smnp.token.tokenizers.string import stringTokenizer
+from smnp.token.tokenizers.whitespace import whitespacesTokenizer
+from smnp.token.tools import defaultTokenizer, separated, regexPatternTokenizer
 from smnp.token.type import TokenType
 
 # TODO !!!
 # Enable the tokenizer to detect token separators:
 # for example, "notes" is currently tokenized to
 # [TYPE(note), IDENTIFIER(s)] instead of IDENTIFIER(notes)
 
 tokenizers = (
-    tokenizeOpenParen,
-    tokenizeCloseParen,
-    tokenizeOpenSquare,
-    tokenizeCloseSquare,
-    tokenizeAsterisk,
-    tokenizeType,
-    tokenizeString,
-    tokenizeFunction,
-    tokenizeReturn,
-    tokenizeExtend,
-    tokenizeImport,
-    tokenizeFrom,
-    tokenizeAs,
-    tokenizeInteger,
-    tokenizeNote,
-    tokenizeIdentifier,
-    tokenizeComma,
-    tokenizeOpenBracket,
-    tokenizeCloseBracket,
-    tokenizeAssign,
-    tokenizePercent,
-    tokenizeMinus,
-    tokenizeDot,
-    tokenizeComment,
-    tokenizeWhitespaces,
+    # Characters
+    defaultTokenizer(TokenType.OPEN_BRACKET),
+    defaultTokenizer(TokenType.CLOSE_BRACKET),
+    defaultTokenizer(TokenType.OPEN_PAREN),
+    defaultTokenizer(TokenType.OPEN_SQUARE),
+    defaultTokenizer(TokenType.CLOSE_SQUARE),
+    defaultTokenizer(TokenType.CLOSE_PAREN),
+    defaultTokenizer(TokenType.ASTERISK),
+    defaultTokenizer(TokenType.ASSIGN),
+    defaultTokenizer(TokenType.COMMA),
+    defaultTokenizer(TokenType.MINUS),
+    defaultTokenizer(TokenType.DOT),
+
+    # Types
+    separated(regexPatternTokenizer(TokenType.INTEGER, r'\d')),
+    stringTokenizer,
+    typeTokenizer,
+    noteTokenizer,
+
+    # Keywords
+    separated(defaultTokenizer(TokenType.FUNCTION)),
+    separated(defaultTokenizer(TokenType.RETURN)),
+    separated(defaultTokenizer(TokenType.EXTEND)),
+    separated(defaultTokenizer(TokenType.IMPORT)),
+    separated(defaultTokenizer(TokenType.FROM)),
+    separated(defaultTokenizer(TokenType.AS)),
+
+    # Identifier (must not come before the keywords!)
+    identifierTokenizer,
+
+    # Other
+    whitespacesTokenizer,
+    commentTokenizer,
 )
 
 
 filters = [
     lambda token: token.type is not None,
     lambda token: token.type != TokenType.COMMENT
 ]
 
 
 def tokenize(lines):
     tokens = []
     for lineNumber, line in enumerate(lines):
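Note: every entry in the tokenizers tuple shares one contract: called as tokenizer(input, current, line), it returns (consumedChars, token), with consumedChars == 0 on no match (see the charTokenizer and regexPatternTokenizer hunks below). The rest of tokenize() is cut off in this hunk; a minimal sketch of how such a pipeline is typically driven, assuming only that contract (the loop body and the SyntaxException call are assumptions, not the project's exact code):

    def tokenize(lines):
        tokens = []
        for lineNumber, line in enumerate(lines):
            current = 0
            while current < len(line):
                for tokenizer in tokenizers:
                    consumedChars, token = tokenizer(line, current, lineNumber)
                    if consumedChars > 0:
                        tokens.append(token)
                        current += consumedChars
                        break
                else:
                    # no tokenizer matched the current character
                    raise SyntaxException(f"Unexpected character {line[current]!r}")
        # drop tokens rejected by the filters (whitespace produces type None,
        # comments produce TokenType.COMMENT); the real code presumably wraps
        # the result in TokenList, which is imported above
        return [t for t in tokens if all(f(t) for f in filters)]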
smnp/token/tokenizers/assign.py (deleted):
@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeAssign(input, current, line):
-    return charTokenizer(TokenType.ASSIGN, '=')(input, current, line)
smnp/token/tokenizers/asterisk.py (deleted):
@@ -1,6 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeAsterisk(input, current, line):
-    return charTokenizer(TokenType.ASTERISK, '*')(input, current, line)
smnp/token/tokenizers/bracket.py (deleted):
@@ -1,10 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeOpenBracket(input, current, line):
-    return charTokenizer(TokenType.OPEN_BRACKET, '{')(input, current, line)
-
-
-def tokenizeCloseBracket(input, current, line):
-    return charTokenizer(TokenType.CLOSE_BRACKET, '}')(input, current, line)
smnp/token/tokenizers/comma.py (deleted):
@@ -1,6 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeComma(input, current, line):
-    return charTokenizer(TokenType.COMMA, ',')(input, current, line)
smnp/token/tokenizers/comment.py:
@@ -1,7 +1,8 @@
-from smnp.token.type import TokenType
+from smnp.token.model import Token
+from smnp.token.type import TokenType
 
 
-def tokenizeComment(input, current, line):
+def commentTokenizer(input, current, line):
     if input[current] == '#':
         consumedChars = 0
         value = ''
smnp/token/tokenizers/dot.py (deleted):
@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeDot(input, current, line):
-    return charTokenizer(TokenType.DOT, '.')(input, current, line)
smnp/token/tokenizers/identifier.py:
@@ -1,6 +1,6 @@
 from smnp.token.tools import regexPatternTokenizer
 from smnp.token.type import TokenType
 
-def tokenizeIdentifier(input, current, line):
+def identifierTokenizer(input, current, line):
     # TODO: Disallow identifiers beginning with a number
     return regexPatternTokenizer(TokenType.IDENTIFIER, r'\w')(input, current, line)
smnp/token/tokenizers/integer.py (deleted):
@@ -1,5 +0,0 @@
-from smnp.token.tools import regexPatternTokenizer, separate
-from smnp.token.type import TokenType
-
-def tokenizeInteger(input, current, line):
-    return separate(regexPatternTokenizer(TokenType.INTEGER, r'\d'))(input, current, line)
smnp/token/tokenizers/keyword.py:
@@ -1,34 +1,10 @@
-from smnp.token.tools import keywordsTokenizer, keywordTokenizer, separate
+from smnp.token.tools import keywordsTokenizer, separated
 from smnp.token.type import TokenType
 from smnp.type.model import Type
 
 
-def tokenizeType(input, current, line):
-    types = [ type.name.lower() for type in Type ]
-    return separate(keywordsTokenizer(TokenType.TYPE, *types))(input, current, line)
+typeTokenizer = separated(keywordsTokenizer(TokenType.TYPE, *[type.name.lower() for type in Type]))
-
-
-def tokenizeReturn(input, current, line):
-    return separate(keywordTokenizer(TokenType.RETURN, 'return'))(input, current, line)
-
-
-def tokenizeFunction(input, current, line):
-    return separate(keywordTokenizer(TokenType.FUNCTION, 'function'))(input, current, line)
-
-
-def tokenizeExtend(input, current, line):
-    return separate(keywordTokenizer(TokenType.EXTEND, "extend"))(input, current, line)
-
-
-def tokenizeImport(input, current, line):
-    return separate(keywordTokenizer(TokenType.IMPORT, "import"))(input, current, line)
-
-
-def tokenizeFrom(input, current, line):
-    return separate(keywordTokenizer(TokenType.FROM, "from"))(input, current, line)
-
-
-def tokenizeAs(input, current, line):
-    return separate(keywordTokenizer(TokenType.AS, "as"))(input, current, line)
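Note: this hunk shows the pattern behind the whole commit: a def tokenizeX(...) wrapper that rebuilt its combinator chain on every call is replaced by a callable composed once at import time, and the generic keyword wrappers disappear entirely because the main module can now write separated(defaultTokenizer(TokenType.RETURN)) inline. An illustrative before/after in the same style (behaviour assumed equivalent, not verified):

    # before: the chain is recomposed on every call
    def tokenizeFrom(input, current, line):
        return separate(keywordTokenizer(TokenType.FROM, "from"))(input, current, line)

    # after: the chain is built once and stored as a plain callable
    fromTokenizer = separated(keywordTokenizer(TokenType.FROM, "from"))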
smnp/token/tokenizers/minus.py (deleted):
@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeMinus(input, current, line):
-    return charTokenizer(TokenType.MINUS, '-')(input, current, line)
smnp/token/tokenizers/note.py:
@@ -5,7 +5,7 @@ from smnp.token.model import Token
 from smnp.token.type import TokenType
 
 
-def tokenizeNote(input, current, line):
+def noteTokenizer(input, current, line):
     consumedChars = 0
     notePitch = None
     octave = None
smnp/token/tokenizers/paren.py (deleted):
@@ -1,9 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeOpenParen(input, current, line):
-    return charTokenizer(TokenType.OPEN_PAREN, '(')(input, current, line)
-
-def tokenizeCloseParen(input, current, line):
-    return charTokenizer(TokenType.CLOSE_PAREN, ')')(input, current, line)
smnp/token/tokenizers/percent.py (deleted):
@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizePercent(input, current, line):
-    return charTokenizer(TokenType.PERCENT, '%')(input, current, line)
smnp/token/tokenizers/square.py (deleted):
@@ -1,8 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeOpenSquare(input, current, line):
-    return charTokenizer(TokenType.OPEN_SQUARE, '[')(input, current, line)
-
-def tokenizeCloseSquare(input, current, line):
-    return charTokenizer(TokenType.CLOSE_SQUARE, ']')(input, current, line)
smnp/token/tokenizers/string.py:
@@ -1,7 +1,8 @@
-from smnp.token.type import TokenType
+from smnp.token.model import Token
+from smnp.token.type import TokenType
 
 
-def tokenizeString(input, current, line):
+def stringTokenizer(input, current, line):
     if input[current] == '"':
         value = input[current]
         char = ''
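Note: stringTokenizer is truncated after char = ''. A sketch of a quote-delimited scanner consistent with the visible setup; the loop is an assumption, and TokenType.STRING is inferred from the old tokenizeString name rather than shown in the diff:

    def stringTokenizer(input, current, line):
        if input[current] == '"':
            value = input[current]
            char = ''
            while current + len(value) < len(input):
                char = input[current + len(value)]
                value += char
                if char == '"':
                    # closing quote reached: emit the whole literal
                    return (len(value), Token(TokenType.STRING, value, (line, current)))
            # unterminated literal; the real code may raise SyntaxException here
        return (0, None)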
smnp/token/tokenizers/whitespace.py:
@@ -1,4 +1,3 @@
 from smnp.token.tools import regexPatternTokenizer
 
-def tokenizeWhitespaces(input, current, line):
-    return regexPatternTokenizer(None, r'\s')(input, current, line)
+whitespacesTokenizer = regexPatternTokenizer(None, r'\s')
smnp/token/tools.py:
@@ -3,15 +3,6 @@ import re
 from smnp.token.model import Token
 
 
-def charTokenizer(type, char):
-    def tokenizer(input, current, line):
-        if input[current] == char:
-            return (1, Token(type, input[current], (line, current)))
-        return (0, None)
-
-    return tokenizer
-
-
 def regexPatternTokenizer(type, pattern):
     def tokenizer(input, current, line):
         consumedChars = 0
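Note: regexPatternTokenizer is truncated after consumedChars = 0. Given that it is only used with single-character classes (r'\d', r'\w', r'\s') and that re is imported at the top of tools.py, a plausible reconstruction is a greedy character-by-character scanner; the loop below is an assumption, not the committed code:

    def regexPatternTokenizer(type, pattern):
        def tokenizer(input, current, line):
            consumedChars = 0
            value = ''
            # greedily consume characters matching the single-char pattern
            while current + consumedChars < len(input) and re.match(pattern, input[current + consumedChars]):
                value += input[current + consumedChars]
                consumedChars += 1
            if consumedChars > 0:
                return (consumedChars, Token(type, value, (line, current)))
            return (0, None)
        return tokenizer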
smnp/token/tools.py (continued):
@@ -44,7 +35,11 @@ def keywordTokenizer(type, keyword):
     return tokenizer
 
 
-def separate(tokenizer, end=r"\W"):
+def defaultTokenizer(type):
+    return keywordTokenizer(type, type.key)
+
+
+def separated(tokenizer, end=r"\W"):
     def separated(input, current, line):
         consumedChars, token = tokenizer(input, current, line)
         if consumedChars > 0:
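Note: defaultTokenizer(type) delegates to keywordTokenizer(type, type.key), so each TokenType member apparently carries its literal lexeme in a key attribute (e.g. TokenType.RETURN.key == 'return'); that attribute is implied by the call sites but never shown in this diff. separated(...) wraps a tokenizer so that a match immediately followed by a word character is rejected (end=r"\W"), which is what keeps 'returns' from matching the 'return' keyword. Expected behaviour, not verified output:

    returnTokenizer = separated(defaultTokenizer(TokenType.RETURN))
    returnTokenizer('return x', 0, 0)   # match: 'return' is followed by a space
    returnTokenizer('returnsX', 0, 0)   # (0, None): followed by a word character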