Refactor tokenizer

Bartłomiej Pluta
2019-07-06 22:09:01 +02:00
parent fbb3f79731
commit 756f4544e4
18 changed files with 59 additions and 156 deletions

View File

@@ -1,62 +1,57 @@
 from smnp.error.syntax import SyntaxException
 from smnp.token.model import TokenList
-from smnp.token.tokenizers.assign import tokenizeAssign
-from smnp.token.tokenizers.asterisk import tokenizeAsterisk
-from smnp.token.tokenizers.bracket import tokenizeOpenBracket, tokenizeCloseBracket
-from smnp.token.tokenizers.comma import tokenizeComma
-from smnp.token.tokenizers.comment import tokenizeComment
-from smnp.token.tokenizers.dot import tokenizeDot
-from smnp.token.tokenizers.identifier import tokenizeIdentifier
-from smnp.token.tokenizers.integer import tokenizeInteger
-from smnp.token.tokenizers.keyword import tokenizeType, tokenizeFunction, tokenizeReturn, tokenizeExtend, \
-    tokenizeImport, tokenizeFrom, tokenizeAs
-from smnp.token.tokenizers.minus import tokenizeMinus
-from smnp.token.tokenizers.note import tokenizeNote
-from smnp.token.tokenizers.paren import tokenizeOpenParen, tokenizeCloseParen
-from smnp.token.tokenizers.percent import tokenizePercent
-from smnp.token.tokenizers.square import tokenizeOpenSquare, tokenizeCloseSquare
-from smnp.token.tokenizers.string import tokenizeString
-from smnp.token.tokenizers.whitespace import tokenizeWhitespaces
+from smnp.token.tokenizers.comment import commentTokenizer
+from smnp.token.tokenizers.identifier import identifierTokenizer
+from smnp.token.tokenizers.keyword import typeTokenizer
+from smnp.token.tokenizers.note import noteTokenizer
+from smnp.token.tokenizers.string import stringTokenizer
+from smnp.token.tokenizers.whitespace import whitespacesTokenizer
+from smnp.token.tools import defaultTokenizer, separated, regexPatternTokenizer
 from smnp.token.type import TokenType
 
-# TODO !!!
-# Enable tokenizer to detect separators of tokens
-# for example, "notes" instead of being tokenized
-# to IDENTIFIER(notes) is tokenized to [TYPE(note), IDENTIFIER(s)]
-
 tokenizers = (
-    tokenizeOpenParen,
-    tokenizeCloseParen,
-    tokenizeOpenSquare,
-    tokenizeCloseSquare,
-    tokenizeAsterisk,
-    tokenizeType,
-    tokenizeString,
-    tokenizeFunction,
-    tokenizeReturn,
-    tokenizeExtend,
-    tokenizeImport,
-    tokenizeFrom,
-    tokenizeAs,
-    tokenizeInteger,
-    tokenizeNote,
-    tokenizeIdentifier,
-    tokenizeComma,
-    tokenizeOpenBracket,
-    tokenizeCloseBracket,
-    tokenizeAssign,
-    tokenizePercent,
-    tokenizeMinus,
-    tokenizeDot,
-    tokenizeComment,
-    tokenizeWhitespaces,
+    # Characters
+    defaultTokenizer(TokenType.OPEN_BRACKET),
+    defaultTokenizer(TokenType.CLOSE_BRACKET),
+    defaultTokenizer(TokenType.OPEN_PAREN),
+    defaultTokenizer(TokenType.OPEN_SQUARE),
+    defaultTokenizer(TokenType.CLOSE_SQUARE),
+    defaultTokenizer(TokenType.CLOSE_PAREN),
+    defaultTokenizer(TokenType.ASTERISK),
+    defaultTokenizer(TokenType.ASSIGN),
+    defaultTokenizer(TokenType.COMMA),
+    defaultTokenizer(TokenType.MINUS),
+    defaultTokenizer(TokenType.DOT),
+
+    # Types
+    separated(regexPatternTokenizer(TokenType.INTEGER, r'\d')),
+    stringTokenizer,
+    typeTokenizer,
+    noteTokenizer,
+
+    # Keywords
+    separated(defaultTokenizer(TokenType.FUNCTION)),
+    separated(defaultTokenizer(TokenType.RETURN)),
+    separated(defaultTokenizer(TokenType.EXTEND)),
+    separated(defaultTokenizer(TokenType.IMPORT)),
+    separated(defaultTokenizer(TokenType.FROM)),
+    separated(defaultTokenizer(TokenType.AS)),
+
+    # Identifier (couldn't be before keywords!)
+    identifierTokenizer,
+
+    # Other
+    whitespacesTokenizer,
+    commentTokenizer,
 )
 
 filters = [
     lambda token: token.type is not None,
     lambda token: token.type != TokenType.COMMENT
 ]
 
 def tokenize(lines):
     tokens = []
     for lineNumber, line in enumerate(lines):
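Every entry in this table obeys one contract: a tokenizer is a callable taking (input, current, line) and returning (consumedChars, token), where consumedChars == 0 means "no match" (see smnp/token/tools.py below). The body of tokenize() is truncated in this view, so here is a minimal standalone sketch of how such a table is typically driven — charTokenizer and scanLine are illustrative stand-ins, not the smnp implementation:

def charTokenizer(kind, char):
    # Illustrative stand-in for defaultTokenizer(TokenType.X): match one character.
    def tokenizer(input, current, line):
        if input[current] == char:
            return 1, (kind, char, (line, current))
        return 0, None
    return tokenizer

def scanLine(text, lineNumber, tokenizers):
    # Try each tokenizer in order; the first one that consumes characters wins.
    tokens = []
    current = 0
    while current < len(text):
        for tokenizer in tokenizers:
            consumed, token = tokenizer(text, current, lineNumber)
            if consumed > 0:
                if token is not None:
                    tokens.append(token)
                current += consumed
                break
        else:
            raise SyntaxError(f"unexpected character {text[current]!r} at {lineNumber}:{current}")
    return tokens

print(scanLine("(*)", 1, [
    charTokenizer("OPEN_PAREN", "("),
    charTokenizer("ASTERISK", "*"),
    charTokenizer("CLOSE_PAREN", ")"),
]))
# [('OPEN_PAREN', '(', (1, 0)), ('ASTERISK', '*', (1, 1)), ('CLOSE_PAREN', ')', (1, 2))]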

smnp/token/tokenizers/assign.py

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeAssign(input, current, line):
-    return charTokenizer(TokenType.ASSIGN, '=')(input, current, line)

smnp/token/tokenizers/asterisk.py

@@ -1,6 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeAsterisk(input, current, line):
-    return charTokenizer(TokenType.ASTERISK, '*')(input, current, line)

smnp/token/tokenizers/bracket.py

@@ -1,10 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeOpenBracket(input, current, line):
-    return charTokenizer(TokenType.OPEN_BRACKET, '{')(input, current, line)
-
-
-def tokenizeCloseBracket(input, current, line):
-    return charTokenizer(TokenType.CLOSE_BRACKET, '}')(input, current, line)

smnp/token/tokenizers/comma.py

@@ -1,6 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-
-def tokenizeComma(input, current, line):
-    return charTokenizer(TokenType.COMMA, ',')(input, current, line)

smnp/token/tokenizers/comment.py

@@ -1,7 +1,8 @@
-from smnp.token.type import TokenType
 from smnp.token.model import Token
+from smnp.token.type import TokenType
 
-def tokenizeComment(input, current, line):
+
+def commentTokenizer(input, current, line):
     if input[current] == '#':
         consumedChars = 0
         value = ''

smnp/token/tokenizers/dot.py

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeDot(input, current, line):
-    return charTokenizer(TokenType.DOT, '.')(input, current, line)

smnp/token/tokenizers/identifier.py

@@ -1,6 +1,6 @@
 from smnp.token.tools import regexPatternTokenizer
 from smnp.token.type import TokenType
 
-def tokenizeIdentifier(input, current, line):
+def identifierTokenizer(input, current, line):
     # TODO: Disallow identifiers beginning with a number
     return regexPatternTokenizer(TokenType.IDENTIFIER, r'\w')(input, current, line)
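The TODO survives the rename. Since the tokenizers compose as closures, one possible shape of the fix — hypothetical, not part of this commit — is a wrapper that rejects matches starting with a digit:

def notStartingWithDigit(tokenizer):
    # Hypothetical helper: refuse to match when the first character is a digit,
    # so input like "2abc" cannot become an IDENTIFIER.
    def wrapped(input, current, line):
        if input[current].isdigit():
            return 0, None
        return tokenizer(input, current, line)
    return wrapped

# identifierTokenizer = notStartingWithDigit(
#     regexPatternTokenizer(TokenType.IDENTIFIER, r'\w'))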

smnp/token/tokenizers/integer.py

@@ -1,5 +0,0 @@
-from smnp.token.tools import regexPatternTokenizer, separate
-from smnp.token.type import TokenType
-
-def tokenizeInteger(input, current, line):
-    return separate(regexPatternTokenizer(TokenType.INTEGER, r'\d'))(input, current, line)

smnp/token/tokenizers/keyword.py

@@ -1,34 +1,10 @@
-from smnp.token.tools import keywordsTokenizer, keywordTokenizer, separate
+from smnp.token.tools import keywordsTokenizer, separated
 from smnp.token.type import TokenType
 from smnp.type.model import Type
 
-def tokenizeType(input, current, line):
-    types = [ type.name.lower() for type in Type ]
-    return separate(keywordsTokenizer(TokenType.TYPE, *types))(input, current, line)
-
-def tokenizeReturn(input, current, line):
-    return separate(keywordTokenizer(TokenType.RETURN, 'return'))(input, current, line)
-
-def tokenizeFunction(input, current, line):
-    return separate(keywordTokenizer(TokenType.FUNCTION, 'function'))(input, current, line)
-
-def tokenizeExtend(input, current, line):
-    return separate(keywordTokenizer(TokenType.EXTEND, "extend"))(input, current, line)
-
-def tokenizeImport(input, current, line):
-    return separate(keywordTokenizer(TokenType.IMPORT, "import"))(input, current, line)
-
-def tokenizeFrom(input, current, line):
-    return separate(keywordTokenizer(TokenType.FROM, "from"))(input, current, line)
-
-def tokenizeAs(input, current, line):
-    return separate(keywordTokenizer(TokenType.AS, "as"))(input, current, line)
+typeTokenizer = separated(keywordsTokenizer(TokenType.TYPE, *[type.name.lower() for type in Type]))
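The pattern here is worth noting: because keywordsTokenizer and separated both return closures, the seven per-keyword wrapper functions collapse into module-level values built once at import time. A rough self-contained illustration of that style (the keyword strings and matching-by-slice semantics are assumptions, since the real keywordsTokenizer body isn't shown in this diff):

def keywordTokenizer(kind, keyword):
    # Match an exact keyword at the current position.
    def tokenizer(input, current, line):
        if input[current:current + len(keyword)] == keyword:
            return len(keyword), (kind, keyword, (line, current))
        return 0, None
    return tokenizer

def keywordsTokenizer(kind, *keywords):
    # Compose per-keyword tokenizers; the first match wins.
    tokenizers = [keywordTokenizer(kind, keyword) for keyword in keywords]
    def tokenizer(input, current, line):
        for t in tokenizers:
            consumed, token = t(input, current, line)
            if consumed > 0:
                return consumed, token
        return 0, None
    return tokenizer

# Built once at import time, like typeTokenizer above ("note"/"integer" are
# placeholder type names; the real list comes from the Type enum).
typeTokenizer = keywordsTokenizer("TYPE", "note", "integer")
print(typeTokenizer("note c", 0, 1))   # (4, ('TYPE', 'note', (1, 0)))
print(typeTokenizer("chord", 0, 1))    # (0, None)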

smnp/token/tokenizers/minus.py

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeMinus(input, current, line):
-    return charTokenizer(TokenType.MINUS, '-')(input, current, line)

smnp/token/tokenizers/note.py

@@ -5,7 +5,7 @@ from smnp.token.model import Token
 from smnp.token.type import TokenType
 
-def tokenizeNote(input, current, line):
+def noteTokenizer(input, current, line):
     consumedChars = 0
     notePitch = None
     octave = None

smnp/token/tokenizers/paren.py

@@ -1,9 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeOpenParen(input, current, line):
-    return charTokenizer(TokenType.OPEN_PAREN, '(')(input, current, line)
-
-
-def tokenizeCloseParen(input, current, line):
-    return charTokenizer(TokenType.CLOSE_PAREN, ')')(input, current, line)

smnp/token/tokenizers/percent.py

@@ -1,5 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizePercent(input, current, line):
-    return charTokenizer(TokenType.PERCENT, '%')(input, current, line)

smnp/token/tokenizers/square.py

@@ -1,8 +0,0 @@
-from smnp.token.tools import charTokenizer
-from smnp.token.type import TokenType
-
-def tokenizeOpenSquare(input, current, line):
-    return charTokenizer(TokenType.OPEN_SQUARE, '[')(input, current, line)
-
-def tokenizeCloseSquare(input, current, line):
-    return charTokenizer(TokenType.CLOSE_SQUARE, ']')(input, current, line)

smnp/token/tokenizers/string.py

@@ -1,7 +1,8 @@
-from smnp.token.type import TokenType
 from smnp.token.model import Token
+from smnp.token.type import TokenType
 
-def tokenizeString(input, current, line):
+
+def stringTokenizer(input, current, line):
     if input[current] == '"':
         value = input[current]
         char = ''

smnp/token/tokenizers/whitespace.py

@@ -1,4 +1,3 @@
 from smnp.token.tools import regexPatternTokenizer
 
-def tokenizeWhitespaces(input, current, line):
-    return regexPatternTokenizer(None, r'\s')(input, current, line)
+whitespacesTokenizer = regexPatternTokenizer(None, r'\s')
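whitespacesTokenizer is now a ready-made value as well: a regexPatternTokenizer with type None, whose output the filters list in the main module later discards. Judging from its use with single-character classes (r'\s', r'\d', r'\w'), the combinator consumes a run of matching characters; a sketch under that assumption:

import re

def regexPatternTokenizer(kind, pattern):
    # Assumed semantics: consume consecutive characters that each match the
    # single-character pattern and emit one token for the whole run.
    def tokenizer(input, current, line):
        consumedChars = 0
        value = ''
        while current + consumedChars < len(input) \
                and re.match(pattern, input[current + consumedChars]):
            value += input[current + consumedChars]
            consumedChars += 1
        if consumedChars > 0:
            return consumedChars, (kind, value, (line, current))
        return 0, None
    return tokenizer

whitespacesTokenizer = regexPatternTokenizer(None, r'\s')
print(whitespacesTokenizer("  x", 0, 0))  # (2, (None, '  ', (0, 0)))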

smnp/token/tools.py

@@ -3,15 +3,6 @@ import re
 from smnp.token.model import Token
 
-
-def charTokenizer(type, char):
-    def tokenizer(input, current, line):
-        if input[current] == char:
-            return (1, Token(type, input[current], (line, current)))
-        return (0, None)
-    return tokenizer
-
-
 def regexPatternTokenizer(type, pattern):
     def tokenizer(input, current, line):
         consumedChars = 0

@@ -44,7 +35,11 @@ def keywordTokenizer(type, keyword):
     return tokenizer
 
-def separate(tokenizer, end=r"\W"):
+def defaultTokenizer(type):
+    return keywordTokenizer(type, type.key)
+
+
+def separated(tokenizer, end=r"\W"):
     def separated(input, current, line):
         consumedChars, token = tokenizer(input, current, line)
         if consumedChars > 0:
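defaultTokenizer is what lets the single-character modules above disappear: instead of pairing each TokenType with a literal by hand, it reads the literal from the type itself (type.key). And separated — the renamed separate — only accepts a match followed by a separator, which is exactly the "notes" vs [TYPE(note), IDENTIFIER(s)] problem the deleted TODO in the main module described. A self-contained sketch of how the two compose; the TokenType enum with a key property is an assumption modeled on the type.key call above, and separated's body is truncated in this diff, so its acceptance check is reconstructed from its end=r"\W" default:

import re
from enum import Enum

class TokenType(Enum):
    # Hypothetical members: the value plays the role of `key`,
    # the literal text the token matches.
    RETURN = 'return'
    MINUS = '-'

    @property
    def key(self):
        return self.value

def keywordTokenizer(type, keyword):
    def tokenizer(input, current, line):
        if input[current:current + len(keyword)] == keyword:
            return len(keyword), (type, keyword, (line, current))
        return 0, None
    return tokenizer

def defaultTokenizer(type):
    # The literal comes from the token type itself, so one generic
    # factory replaces all the per-character tokenizer modules.
    return keywordTokenizer(type, type.key)

def separated(tokenizer, end=r"\W"):
    # Accept a match only at end of input or before a separator character,
    # so 'return' does not match inside 'returns'.
    def separatedTokenizer(input, current, line):
        consumedChars, token = tokenizer(input, current, line)
        if consumedChars > 0:
            following = current + consumedChars
            if following >= len(input) or re.match(end, input[following]):
                return consumedChars, token
        return 0, None
    return separatedTokenizer

returnTokenizer = separated(defaultTokenizer(TokenType.RETURN))
print(returnTokenizer("return 1", 0, 0))  # (6, (TokenType.RETURN, 'return', (0, 0)))
print(returnTokenizer("returns", 0, 0))   # (0, None): 's' is a word character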