From 9c4046ac2a2a892111be836446ab0bec8f7cba83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Pluta?=
Date: Sat, 6 Jul 2019 13:35:21 +0200
Subject: [PATCH] Enable tokenizer to support separators between keywords and
 integers

---
 smnp/token/tokenizers/assign.py     |  4 +-
 smnp/token/tokenizers/asterisk.py   |  5 +-
 smnp/token/tokenizers/bracket.py    |  8 ++--
 smnp/token/tokenizers/comma.py      |  5 +-
 smnp/token/tokenizers/dot.py        |  4 +-
 smnp/token/tokenizers/identifier.py |  5 +-
 smnp/token/tokenizers/integer.py    |  4 +-
 smnp/token/tokenizers/keyword.py    | 16 +++-
 smnp/token/tokenizers/minus.py      |  4 +-
 smnp/token/tokenizers/paren.py      |  7 +--
 smnp/token/tokenizers/percent.py    |  4 +-
 smnp/token/tokenizers/square.py     |  6 +--
 smnp/token/tokenizers/whitespace.py |  4 +-
 smnp/token/tools.py                 | 71 ++++++++++++++++++++---------
 14 files changed, 90 insertions(+), 57 deletions(-)

diff --git a/smnp/token/tokenizers/assign.py b/smnp/token/tokenizers/assign.py
index ca61260..0467fd9 100644
--- a/smnp/token/tokenizers/assign.py
+++ b/smnp/token/tokenizers/assign.py
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeAssign(input, current, line):
-    return tokenizeChar(TokenType.ASSIGN, '=', input, current, line)
+    return charTokenizer(TokenType.ASSIGN, '=')(input, current, line)
diff --git a/smnp/token/tokenizers/asterisk.py b/smnp/token/tokenizers/asterisk.py
index 42bb212..ee03f9d 100644
--- a/smnp/token/tokenizers/asterisk.py
+++ b/smnp/token/tokenizers/asterisk.py
@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeAsterisk(input, current, line):
-    return tokenizeChar(TokenType.ASTERISK, '*', input, current, line)
+    return charTokenizer(TokenType.ASTERISK, '*')(input, current, line)
diff --git a/smnp/token/tokenizers/bracket.py b/smnp/token/tokenizers/bracket.py
index d160461..3220ef7 100644
--- a/smnp/token/tokenizers/bracket.py
+++ b/smnp/token/tokenizers/bracket.py
@@ -1,8 +1,10 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeOpenBracket(input, current, line):
-    return tokenizeChar(TokenType.OPEN_BRACKET, '{', input, current, line)
+    return charTokenizer(TokenType.OPEN_BRACKET, '{')(input, current, line)
 
+
 def tokenizeCloseBracket(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_BRACKET, '}', input, current, line)
+    return charTokenizer(TokenType.CLOSE_BRACKET, '}')(input, current, line)
diff --git a/smnp/token/tokenizers/comma.py b/smnp/token/tokenizers/comma.py
index 722dbba..184e84c 100644
--- a/smnp/token/tokenizers/comma.py
+++ b/smnp/token/tokenizers/comma.py
@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeComma(input, current, line):
-    return tokenizeChar(TokenType.COMMA, ',', input, current, line)
+    return charTokenizer(TokenType.COMMA, ',')(input, current, line)
diff --git a/smnp/token/tokenizers/dot.py b/smnp/token/tokenizers/dot.py
index f959dc2..126e12c 100644
--- a/smnp/token/tokenizers/dot.py
+++ b/smnp/token/tokenizers/dot.py
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeDot(input, current, line):
-    return tokenizeChar(TokenType.DOT, '.', input, current, line)
+    return charTokenizer(TokenType.DOT, '.')(input, current, line)
diff --git a/smnp/token/tokenizers/identifier.py b/smnp/token/tokenizers/identifier.py
index 35b8835..6728039 100644
--- a/smnp/token/tokenizers/identifier.py
+++ b/smnp/token/tokenizers/identifier.py
@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeIdentifier(input, current, line):
-    return tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)
+    # TODO: Disallow identifiers beginning with a number
+    return regexPatternTokenizer(TokenType.IDENTIFIER, r'\w')(input, current, line)
diff --git a/smnp/token/tokenizers/integer.py b/smnp/token/tokenizers/integer.py
index 48b31ed..b135444 100644
--- a/smnp/token/tokenizers/integer.py
+++ b/smnp/token/tokenizers/integer.py
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer, separate
 from smnp.token.type import TokenType
 
 def tokenizeInteger(input, current, line):
-    return tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)
+    return separate(regexPatternTokenizer(TokenType.INTEGER, r'\d'))(input, current, line)
diff --git a/smnp/token/tokenizers/keyword.py b/smnp/token/tokenizers/keyword.py
index e3feb76..e0dd453 100644
--- a/smnp/token/tokenizers/keyword.py
+++ b/smnp/token/tokenizers/keyword.py
@@ -1,34 +1,34 @@
-from smnp.token.tools import tokenizeKeywords, tokenizeKeyword
+from smnp.token.tools import keywordsTokenizer, keywordTokenizer, separate
 from smnp.token.type import TokenType
 from smnp.type.model import Type
 
 
 def tokenizeType(input, current, line):
     types = [ type.name.lower() for type in Type ]
-    return tokenizeKeywords(TokenType.TYPE, input, current, line, *types)
+    return separate(keywordsTokenizer(TokenType.TYPE, *types))(input, current, line)
 
 
 def tokenizeReturn(input, current, line):
-    return tokenizeKeyword(TokenType.RETURN, 'return', input, current, line)
+    return separate(keywordTokenizer(TokenType.RETURN, 'return'))(input, current, line)
 
 
 def tokenizeFunction(input, current, line):
-    return tokenizeKeyword(TokenType.FUNCTION, 'function', input, current, line)
+    return separate(keywordTokenizer(TokenType.FUNCTION, 'function'))(input, current, line)
 
 
 def tokenizeExtend(input, current, line):
-    return tokenizeKeyword(TokenType.EXTEND, "extend", input, current, line)
+    return separate(keywordTokenizer(TokenType.EXTEND, "extend"))(input, current, line)
 
 
 def tokenizeImport(input, current, line):
-    return tokenizeKeyword(TokenType.IMPORT, "import", input, current, line)
+    return separate(keywordTokenizer(TokenType.IMPORT, "import"))(input, current, line)
 
 
 def tokenizeFrom(input, current, line):
-    return tokenizeKeyword(TokenType.FROM, "from", input, current, line)
+    return separate(keywordTokenizer(TokenType.FROM, "from"))(input, current, line)
 
 
 def tokenizeAs(input, current, line):
-    return tokenizeKeyword(TokenType.AS, "as", input, current, line)
+    return separate(keywordTokenizer(TokenType.AS, "as"))(input, current, line)
diff --git a/smnp/token/tokenizers/minus.py b/smnp/token/tokenizers/minus.py
index 7b40a2b..38fb784 100644
--- a/smnp/token/tokenizers/minus.py
+++ b/smnp/token/tokenizers/minus.py
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeMinus(input, current, line):
-    return tokenizeChar(TokenType.MINUS, '-', input, current, line)
+    return charTokenizer(TokenType.MINUS, '-')(input, current, line)
diff --git a/smnp/token/tokenizers/paren.py b/smnp/token/tokenizers/paren.py
index 44d324c..30fa9f5 100644
--- a/smnp/token/tokenizers/paren.py
+++ b/smnp/token/tokenizers/paren.py
@@ -1,8 +1,9 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeOpenParen(input, current, line):
-    return tokenizeChar(TokenType.OPEN_PAREN, '(', input, current, line)
+    return charTokenizer(TokenType.OPEN_PAREN, '(')(input, current, line)
 
 def tokenizeCloseParen(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_PAREN, ')', input, current, line)
+    return charTokenizer(TokenType.CLOSE_PAREN, ')')(input, current, line)
diff --git a/smnp/token/tokenizers/percent.py b/smnp/token/tokenizers/percent.py
index cccb638..1d16e08 100644
--- a/smnp/token/tokenizers/percent.py
+++ b/smnp/token/tokenizers/percent.py
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizePercent(input, current, line):
-    return tokenizeChar(TokenType.PERCENT, '%', input, current, line)
+    return charTokenizer(TokenType.PERCENT, '%')(input, current, line)
diff --git a/smnp/token/tokenizers/square.py b/smnp/token/tokenizers/square.py
index 08de66a..0946b32 100644
--- a/smnp/token/tokenizers/square.py
+++ b/smnp/token/tokenizers/square.py
@@ -1,8 +1,8 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeOpenSquare(input, current, line):
-    return tokenizeChar(TokenType.OPEN_SQUARE, '[', input, current, line)
+    return charTokenizer(TokenType.OPEN_SQUARE, '[')(input, current, line)
 
 def tokenizeCloseSquare(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_SQUARE, ']', input, current, line)
+    return charTokenizer(TokenType.CLOSE_SQUARE, ']')(input, current, line)
diff --git a/smnp/token/tokenizers/whitespace.py b/smnp/token/tokenizers/whitespace.py
index a4cc2b3..a25168b 100644
--- a/smnp/token/tokenizers/whitespace.py
+++ b/smnp/token/tokenizers/whitespace.py
@@ -1,4 +1,4 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer
 
 def tokenizeWhitespaces(input, current, line):
-    return tokenizeRegexPattern(None, r'\s', input, current, line)
+    return regexPatternTokenizer(None, r'\s')(input, current, line)
diff --git a/smnp/token/tools.py b/smnp/token/tools.py
index e0ebc09..eba097c 100644
--- a/smnp/token/tools.py
+++ b/smnp/token/tools.py
@@ -3,28 +3,55 @@ import re
 from smnp.token.model import Token
 
 
-def tokenizeChar(type, char, input, current, line):
-    if input[current] == char:
-        return (1, Token(type, input[current], (line, current)))
-    return (0, None)
+def charTokenizer(type, char):
+    def tokenizer(input, current, line):
+        if input[current] == char:
+            return (1, Token(type, input[current], (line, current)))
+        return (0, None)
 
-def tokenizeRegexPattern(type, pattern, input, current, line):
-    consumedChars = 0
-    value = ''
-
-    while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
-        value += input[current+consumedChars]
-        consumedChars += 1
-    return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)
+    return tokenizer
 
-def tokenizeKeywords(type, input, current, line, *keywords):
-    for keyword in keywords:
-        result = tokenizeKeyword(type, keyword, input, current, line)
-        if result[0] > 0:
-            return result
-    return (0, None)
 
-def tokenizeKeyword(type, keyword, input, current, line):
-    if len(input) >= current+len(keyword) and input[current:current+len(keyword)] == keyword:
-        return (len(keyword), Token(type, keyword, (line, current)))
-    return (0, None)
+def regexPatternTokenizer(type, pattern):
+    def tokenizer(input, current, line):
+        consumedChars = 0
+        value = ''
+
+        while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
+            value += input[current+consumedChars]
+            consumedChars += 1
+        return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)
+
+    return tokenizer
+
+
+def keywordsTokenizer(type, *keywords):
+    def tokenizer(input, current, line):
+        for keyword in keywords:
+            result = keywordTokenizer(type, keyword)(input, current, line)
+            if result[0] > 0:
+                return result
+        return (0, None)
+
+    return tokenizer
+
+
+def keywordTokenizer(type, keyword):
+    def tokenizer(input, current, line):
+        if len(input) >= current+len(keyword) and input[current:current+len(keyword)] == keyword:
+            return (len(keyword), Token(type, keyword, (line, current)))
+        return (0, None)
+    return tokenizer
+
+
+def separate(tokenizer, end=r"\W"):
+    def separated(input, current, line):
+        consumedChars, token = tokenizer(input, current, line)
+        if consumedChars > 0:
+            if len(input) > current+consumedChars and re.match(end, input[current+consumedChars]):
+                return (consumedChars, token)
+            if len(input) == current+consumedChars:
+                return (consumedChars, token)
+        return (0, None)
+
+    return separated
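
Note on the new separate() combinator in smnp/token/tools.py: a wrapped tokenizer
only reports a match when the matched text is followed by a separator (by default
any non-word character, r"\W") or by the end of input; otherwise it returns
(0, None) so the lexer can fall through to another tokenizer, such as the
identifier one. That is what makes "returnx" lex as a single identifier instead
of the keyword "return" followed by "x". The sketch below is a minimal,
self-contained demonstration: keywordTokenizer and separate are copied verbatim
from this patch, while the inline Token class and the plain-string token type
"RETURN" are stand-ins for smnp.token.model.Token and TokenType.RETURN, used
only so the snippet runs on its own.

    import re

    class Token:
        # Stand-in for smnp.token.model.Token, just enough for this sketch.
        def __init__(self, type, value, pos):
            self.type, self.value, self.pos = type, value, pos

    def keywordTokenizer(type, keyword):
        # From the patch: matches a fixed keyword at position `current`.
        def tokenizer(input, current, line):
            if len(input) >= current+len(keyword) and input[current:current+len(keyword)] == keyword:
                return (len(keyword), Token(type, keyword, (line, current)))
            return (0, None)
        return tokenizer

    def separate(tokenizer, end=r"\W"):
        # From the patch: only accept a match that is properly terminated...
        def separated(input, current, line):
            consumedChars, token = tokenizer(input, current, line)
            if consumedChars > 0:
                # ...by a separator character (non-word by default)...
                if len(input) > current+consumedChars and re.match(end, input[current+consumedChars]):
                    return (consumedChars, token)
                # ...or by the end of the input.
                if len(input) == current+consumedChars:
                    return (consumedChars, token)
            return (0, None)
        return separated

    plain = keywordTokenizer("RETURN", "return")
    strict = separate(plain)

    print(plain("returnx", 0, 1)[0])    # 6 -- bare tokenizer matches the prefix
    print(strict("returnx", 0, 1)[0])   # 0 -- rejected: 'x' is a word character
    print(strict("return 1", 0, 1)[0])  # 6 -- accepted: followed by a space
    print(strict("return", 0, 1)[0])    # 6 -- accepted: end of input

The same wrapper guards tokenizeInteger, so a digit run immediately followed by
a letter no longer produces an INTEGER token; until the TODO in identifier.py is
resolved, such input falls through to the identifier tokenizer instead.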