Refactor tokenizer
Main tokenizer module:

@@ -1,62 +1,57 @@
 from smnp.error.syntax import SyntaxException
 from smnp.token.model import TokenList
-from smnp.token.tokenizers.assign import tokenizeAssign
-from smnp.token.tokenizers.asterisk import tokenizeAsterisk
-from smnp.token.tokenizers.bracket import tokenizeOpenBracket, tokenizeCloseBracket
-from smnp.token.tokenizers.comma import tokenizeComma
-from smnp.token.tokenizers.comment import tokenizeComment
-from smnp.token.tokenizers.dot import tokenizeDot
-from smnp.token.tokenizers.identifier import tokenizeIdentifier
-from smnp.token.tokenizers.integer import tokenizeInteger
-from smnp.token.tokenizers.keyword import tokenizeType, tokenizeFunction, tokenizeReturn, tokenizeExtend, \
-    tokenizeImport, tokenizeFrom, tokenizeAs
-from smnp.token.tokenizers.minus import tokenizeMinus
-from smnp.token.tokenizers.note import tokenizeNote
-from smnp.token.tokenizers.paren import tokenizeOpenParen, tokenizeCloseParen
-from smnp.token.tokenizers.percent import tokenizePercent
-from smnp.token.tokenizers.square import tokenizeOpenSquare, tokenizeCloseSquare
-from smnp.token.tokenizers.string import tokenizeString
-from smnp.token.tokenizers.whitespace import tokenizeWhitespaces
+from smnp.token.tokenizers.comment import commentTokenizer
+from smnp.token.tokenizers.identifier import identifierTokenizer
+from smnp.token.tokenizers.keyword import typeTokenizer
+from smnp.token.tokenizers.note import noteTokenizer
+from smnp.token.tokenizers.string import stringTokenizer
+from smnp.token.tokenizers.whitespace import whitespacesTokenizer
+from smnp.token.tools import defaultTokenizer, separated, regexPatternTokenizer
 from smnp.token.type import TokenType
 
-# TODO !!!
-# Enable tokenizer to detect separators of tokens
-# for example, "notes" instead of being tokenized
-# to IDENTIFIER(notes) is tokenized to [TYPE(note), IDENTIFIER(s)]
 
 tokenizers = (
-    tokenizeOpenParen,
-    tokenizeCloseParen,
-    tokenizeOpenSquare,
-    tokenizeCloseSquare,
-    tokenizeAsterisk,
-    tokenizeType,
-    tokenizeString,
-    tokenizeFunction,
-    tokenizeReturn,
-    tokenizeExtend,
-    tokenizeImport,
-    tokenizeFrom,
-    tokenizeAs,
-    tokenizeInteger,
-    tokenizeNote,
-    tokenizeIdentifier,
-    tokenizeComma,
-    tokenizeOpenBracket,
-    tokenizeCloseBracket,
-    tokenizeAssign,
-    tokenizePercent,
-    tokenizeMinus,
-    tokenizeDot,
-    tokenizeComment,
-    tokenizeWhitespaces,
+    # Characters
+    defaultTokenizer(TokenType.OPEN_BRACKET),
+    defaultTokenizer(TokenType.CLOSE_BRACKET),
+    defaultTokenizer(TokenType.OPEN_PAREN),
+    defaultTokenizer(TokenType.OPEN_SQUARE),
+    defaultTokenizer(TokenType.CLOSE_SQUARE),
+    defaultTokenizer(TokenType.CLOSE_PAREN),
+    defaultTokenizer(TokenType.ASTERISK),
+    defaultTokenizer(TokenType.ASSIGN),
+    defaultTokenizer(TokenType.COMMA),
+    defaultTokenizer(TokenType.MINUS),
+    defaultTokenizer(TokenType.DOT),
+
+    # Types
+    separated(regexPatternTokenizer(TokenType.INTEGER, r'\d')),
+    stringTokenizer,
+    typeTokenizer,
+    noteTokenizer,
+
+    # Keywords
+    separated(defaultTokenizer(TokenType.FUNCTION)),
+    separated(defaultTokenizer(TokenType.RETURN)),
+    separated(defaultTokenizer(TokenType.EXTEND)),
+    separated(defaultTokenizer(TokenType.IMPORT)),
+    separated(defaultTokenizer(TokenType.FROM)),
+    separated(defaultTokenizer(TokenType.AS)),
+
+    # Identifier (couldn't be before keywords!)
+    identifierTokenizer,
+
+    # Other
+    whitespacesTokenizer,
+    commentTokenizer,
 )
 
 
 filters = [
     lambda token: token.type is not None,
     lambda token: token.type != TokenType.COMMENT
 ]
 
 
 def tokenize(lines):
     tokens = []
     for lineNumber, line in enumerate(lines):
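Every entry in the new tokenizers table obeys one contract, visible in the charTokenizer helper removed from smnp/token/tools.py further down: a tokenizer takes (input, current, line) and returns a (consumedChars, token) pair, where (0, None) means "no match here". The body of tokenize() is cut off in this hunk, so the dispatch loop below is only a sketch of how such a table can be consumed; the error handling is an assumption suggested by the SyntaxException import.

    from smnp.error.syntax import SyntaxException

    # Sketch only: the real body of tokenize() is truncated in the hunk above.
    def tokenizeLine(line, lineNumber, tokenizers):
        tokens = []
        current = 0
        while current < len(line):
            for tokenizer in tokenizers:
                consumedChars, token = tokenizer(line, current, lineNumber)
                if consumedChars > 0:
                    # The first tokenizer that matches wins, which is why
                    # identifierTokenizer must come after the keywords
                    tokens.append(token)
                    current += consumedChars
                    break
            else:
                # No tokenizer matched; presumably the real code raises here
                raise SyntaxException(f"Unknown symbol at line {lineNumber + 1}, column {current + 1}")
        return tokens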
smnp/token/tokenizers/assign.py (deleted):

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeAssign(input, current, line):
-    return charTokenizer(TokenType.ASSIGN, '=')(input, current, line)
smnp/token/tokenizers/asterisk.py (deleted):

@@ -1,6 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeAsterisk(input, current, line):
-    return charTokenizer(TokenType.ASTERISK, '*')(input, current, line)
smnp/token/tokenizers/bracket.py (deleted):

@@ -1,10 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeOpenBracket(input, current, line):
-    return charTokenizer(TokenType.OPEN_BRACKET, '{')(input, current, line)
-
-
-def tokenizeCloseBracket(input, current, line):
-    return charTokenizer(TokenType.CLOSE_BRACKET, '}')(input, current, line)
smnp/token/tokenizers/comma.py (deleted):

@@ -1,6 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeComma(input, current, line):
-    return charTokenizer(TokenType.COMMA, ',')(input, current, line)
smnp/token/tokenizers/comment.py:

@@ -1,7 +1,8 @@
-from smnp.token.type import TokenType
 from smnp.token.model import Token
+from smnp.token.type import TokenType
 
-def tokenizeComment(input, current, line):
+
+def commentTokenizer(input, current, line):
     if input[current] == '#':
         consumedChars = 0
         value = ''
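The hunk is cut off after the opening guard. A plausible completion, assuming the tokenizer consumes everything to the end of the line and emits a TokenType.COMMENT token (which the filters in the main module then drop); the loop below is a guess, not the repository's code:

    from smnp.token.model import Token
    from smnp.token.type import TokenType


    def commentTokenizer(input, current, line):
        if input[current] == '#':
            consumedChars = 0
            value = ''
            # Assumed continuation: a comment runs to the end of the line,
            # and tokenize() feeds the input line by line
            while current + consumedChars < len(input):
                value += input[current + consumedChars]
                consumedChars += 1
            return (consumedChars, Token(TokenType.COMMENT, value, (line, current)))
        return (0, None)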
smnp/token/tokenizers/dot.py (deleted):

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeDot(input, current, line):
-    return charTokenizer(TokenType.DOT, '.')(input, current, line)
smnp/token/tokenizers/identifier.py:

@@ -1,6 +1,6 @@
 from smnp.token.tools import regexPatternTokenizer
 from smnp.token.type import TokenType
 
-def tokenizeIdentifier(input, current, line):
+def identifierTokenizer(input, current, line):
     # TODO: Disallow to create identifiers beggining from a number
     return regexPatternTokenizer(TokenType.IDENTIFIER, r'\w')(input, current, line)
smnp/token/tokenizers/integer.py (deleted):

@@ -1,5 +0,0 @@
-from smnp.token.tools import regexPatternTokenizer, separate
-from smnp.token.type import TokenType
-
-def tokenizeInteger(input, current, line):
-    return separate(regexPatternTokenizer(TokenType.INTEGER, r'\d'))(input, current, line)
smnp/token/tokenizers/keyword.py:

@@ -1,34 +1,10 @@
-from smnp.token.tools import keywordsTokenizer, keywordTokenizer, separate
+from smnp.token.tools import keywordsTokenizer, separated
 from smnp.token.type import TokenType
 from smnp.type.model import Type
 
 
-def tokenizeType(input, current, line):
-    types = [ type.name.lower() for type in Type ]
-    return separate(keywordsTokenizer(TokenType.TYPE, *types))(input, current, line)
-
-
-def tokenizeReturn(input, current, line):
-    return separate(keywordTokenizer(TokenType.RETURN, 'return'))(input, current, line)
-
-
-def tokenizeFunction(input, current, line):
-    return separate(keywordTokenizer(TokenType.FUNCTION, 'function'))(input, current, line)
-
-
-def tokenizeExtend(input, current, line):
-    return separate(keywordTokenizer(TokenType.EXTEND, "extend"))(input, current, line)
-
-
-def tokenizeImport(input, current, line):
-    return separate(keywordTokenizer(TokenType.IMPORT, "import"))(input, current, line)
-
-
-def tokenizeFrom(input, current, line):
-    return separate(keywordTokenizer(TokenType.FROM, "from"))(input, current, line)
-
-
-def tokenizeAs(input, current, line):
-    return separate(keywordTokenizer(TokenType.AS, "as"))(input, current, line)
+typeTokenizer = separated(keywordsTokenizer(TokenType.TYPE, *[type.name.lower() for type in Type]))
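This file shrinks from seven wrapper functions to a single assignment because separated and keywordsTokenizer are both factories that return an (input, current, line) callable; the old code rebuilt that closure on every call, while the new code composes it once at import time. Side by side (the sample call at the end is illustrative only):

    from smnp.token.tools import keywordsTokenizer, separated
    from smnp.token.type import TokenType
    from smnp.type.model import Type

    # Old style: recomposed the tokenizer on every single call
    # def tokenizeType(input, current, line):
    #     types = [type.name.lower() for type in Type]
    #     return separate(keywordsTokenizer(TokenType.TYPE, *types))(input, current, line)

    # New style: composed once; the resulting closure is itself the tokenizer
    typeTokenizer = separated(keywordsTokenizer(TokenType.TYPE, *[type.name.lower() for type in Type]))

    # Either form is invoked the same way by the dispatch loop, e.g.:
    # consumedChars, token = typeTokenizer(line, current, lineNumber)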
smnp/token/tokenizers/minus.py (deleted):

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeMinus(input, current, line):
-    return charTokenizer(TokenType.MINUS, '-')(input, current, line)
smnp/token/tokenizers/note.py:

@@ -5,7 +5,7 @@ from smnp.token.model import Token
 from smnp.token.type import TokenType
 
 
-def tokenizeNote(input, current, line):
+def noteTokenizer(input, current, line):
     consumedChars = 0
     notePitch = None
     octave = None
smnp/token/tokenizers/paren.py (deleted):

@@ -1,9 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeOpenParen(input, current, line):
-    return charTokenizer(TokenType.OPEN_PAREN, '(')(input, current, line)
-
-def tokenizeCloseParen(input, current, line):
-    return charTokenizer(TokenType.CLOSE_PAREN, ')')(input, current, line)
smnp/token/tokenizers/percent.py (deleted):

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizePercent(input, current, line):
-    return charTokenizer(TokenType.PERCENT, '%')(input, current, line)
smnp/token/tokenizers/square.py (deleted):

@@ -1,8 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeOpenSquare(input, current, line):
-    return charTokenizer(TokenType.OPEN_SQUARE, '[')(input, current, line)
-
-def tokenizeCloseSquare(input, current, line):
-    return charTokenizer(TokenType.CLOSE_SQUARE, ']')(input, current, line)
smnp/token/tokenizers/string.py:

@@ -1,7 +1,8 @@
-from smnp.token.type import TokenType
 from smnp.token.model import Token
+from smnp.token.type import TokenType
 
-def tokenizeString(input, current, line):
+
+def stringTokenizer(input, current, line):
     if input[current] == '"':
         value = input[current]
         char = ''
smnp/token/tokenizers/whitespace.py:

@@ -1,4 +1,3 @@
 from smnp.token.tools import regexPatternTokenizer
 
-def tokenizeWhitespaces(input, current, line):
-    return regexPatternTokenizer(None, r'\s')(input, current, line)
+whitespacesTokenizer = regexPatternTokenizer(None, r'\s')
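Note the None passed as the token type: whitespace produces tokens that the first predicate in the main module's filters list (token.type is not None) discards before parsing. A sketch of how those filters might be applied; the filterTokens helper is hypothetical, since the diff never shows where filters is consumed:

    from smnp.token.type import TokenType

    filters = [
        lambda token: token.type is not None,           # drops whitespace (type=None)
        lambda token: token.type != TokenType.COMMENT,  # drops comments
    ]

    # Hypothetical helper: keep a token only if every predicate accepts it
    def filterTokens(tokens, filters):
        return [token for token in tokens if all(f(token) for f in filters)]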
smnp/token/tools.py:

@@ -3,15 +3,6 @@ import re
 from smnp.token.model import Token
 
 
-def charTokenizer(type, char):
-    def tokenizer(input, current, line):
-        if input[current] == char:
-            return (1, Token(type, input[current], (line, current)))
-        return (0, None)
-
-    return tokenizer
-
-
 def regexPatternTokenizer(type, pattern):
     def tokenizer(input, current, line):
         consumedChars = 0
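regexPatternTokenizer is truncated after consumedChars = 0. Given that its call sites pass single-character classes (r'\d', r'\w', r'\s'), a plausible reading is that it greedily consumes consecutive matching characters; the following is a sketch under that assumption, not the repository's exact code:

    import re

    from smnp.token.model import Token


    def regexPatternTokenizer(type, pattern):
        def tokenizer(input, current, line):
            consumedChars = 0
            value = ''
            # Greedily consume consecutive characters matching the one-char pattern
            while current + consumedChars < len(input) and re.match(pattern, input[current + consumedChars]):
                value += input[current + consumedChars]
                consumedChars += 1
            if consumedChars > 0:
                return (consumedChars, Token(type, value, (line, current)))
            return (0, None)
        return tokenizer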
@@ -44,7 +35,11 @@ def keywordTokenizer(type, keyword):
     return tokenizer
 
 
-def separate(tokenizer, end=r"\W"):
+def defaultTokenizer(type):
+    return keywordTokenizer(type, type.key)
+
+
+def separated(tokenizer, end=r"\W"):
     def separated(input, current, line):
         consumedChars, token = tokenizer(input, current, line)
         if consumedChars > 0:
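Two readings hold this hunk together. defaultTokenizer presumes each TokenType member exposes the literal text it matches as a .key attribute ('=' for ASSIGN, 'function' for FUNCTION, and so on), letting keywordTokenizer(type, type.key) be derived from the type alone. And separated, whose body is cut off at if consumedChars > 0:, exists to fix the TODO removed from the main module: a keyword match must be followed by a separator, so that "notes" stays IDENTIFIER(notes) instead of splitting into TYPE(note) + IDENTIFIER(s). A plausible completion under those assumptions:

    import re


    def separated(tokenizer, end=r"\W"):
        def separated(input, current, line):
            consumedChars, token = tokenizer(input, current, line)
            if consumedChars > 0:
                nextPos = current + consumedChars
                # Accept the match only at end of input or before a separator
                # character (by default any non-word character, r"\W")
                if nextPos >= len(input) or re.match(end, input[nextPos]):
                    return (consumedChars, token)
            return (0, None)
        return separated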