Enable tokenizer to support separators between keywords and integers

Author: Bartłomiej Pluta
Date: 2019-07-06 13:35:21 +02:00
parent 675b1774fe
commit 9c4046ac2a
14 changed files with 90 additions and 57 deletions

View File

@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeAssign(input, current, line):
-    return tokenizeChar(TokenType.ASSIGN, '=', input, current, line)
+    return charTokenizer(TokenType.ASSIGN, '=')(input, current, line)
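The same mechanical change repeats in every single-character tokenizer below: the old tokenizeChar/tokenizeRegexPattern helpers took the input directly, while the new factories return a tokenizer closure that is then applied to (input, current, line). A rough sketch of the new calling convention (the sample string and variable names are invented for illustration):

    # hypothetical usage of charTokenizer from this commit
    assign = charTokenizer(TokenType.ASSIGN, '=')   # build the tokenizer once
    consumed, token = assign("x = 1", 2, 1)         # then call it with (input, current, line)
    # consumed == 1 and token is an ASSIGN token; a non-match yields (0, None)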

View File

@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeAsterisk(input, current, line):
-    return tokenizeChar(TokenType.ASTERISK, '*', input, current, line)
+    return charTokenizer(TokenType.ASTERISK, '*')(input, current, line)

View File

@@ -1,8 +1,10 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeOpenBracket(input, current, line):
-    return tokenizeChar(TokenType.OPEN_BRACKET, '{', input, current, line)
+    return charTokenizer(TokenType.OPEN_BRACKET, '{')(input, current, line)
 def tokenizeCloseBracket(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_BRACKET, '}', input, current, line)
+    return charTokenizer(TokenType.CLOSE_BRACKET, '}')(input, current, line)

View File

@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeComma(input, current, line):
-    return tokenizeChar(TokenType.COMMA, ',', input, current, line)
+    return charTokenizer(TokenType.COMMA, ',')(input, current, line)

View File

@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeDot(input, current, line):
-    return tokenizeChar(TokenType.DOT, '.', input, current, line)
+    return charTokenizer(TokenType.DOT, '.')(input, current, line)

View File

@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer
 from smnp.token.type import TokenType
 def tokenizeIdentifier(input, current, line):
-    return tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)
+    # TODO: Disallow to create identifiers beggining from a number
+    return regexPatternTokenizer(TokenType.IDENTIFIER, r'\w')(input, current, line)

View File

@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer, separate
 from smnp.token.type import TokenType
 def tokenizeInteger(input, current, line):
-    return tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)
+    return separate(regexPatternTokenizer(TokenType.INTEGER, r'\d'))(input, current, line)
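Per the commit title, integer tokens now require a separator: separate() accepts a match only when the next character is a non-word character or the input ends. The identifier tokenizer above is left unwrapped, presumably because r'\w' already consumes every following word character, so a separator necessarily follows. A sketch of the intended effect (input strings invented):

    tokenize_int = separate(regexPatternTokenizer(TokenType.INTEGER, r'\d'))
    tokenize_int("42 ", 0, 1)    # (2, Token(TokenType.INTEGER, '42', ...)) - ' ' matches \W
    tokenize_int("42abc", 0, 1)  # (0, None) - 'a' is a word character, no separator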

View File

@@ -1,34 +1,34 @@
-from smnp.token.tools import tokenizeKeywords, tokenizeKeyword
+from smnp.token.tools import keywordsTokenizer, keywordTokenizer, separate
 from smnp.token.type import TokenType
 from smnp.type.model import Type
 def tokenizeType(input, current, line):
     types = [ type.name.lower() for type in Type ]
-    return tokenizeKeywords(TokenType.TYPE, input, current, line, *types)
+    return separate(keywordsTokenizer(TokenType.TYPE, *types))(input, current, line)
 def tokenizeReturn(input, current, line):
-    return tokenizeKeyword(TokenType.RETURN, 'return', input, current, line)
+    return separate(keywordTokenizer(TokenType.RETURN, 'return'))(input, current, line)
 def tokenizeFunction(input, current, line):
-    return tokenizeKeyword(TokenType.FUNCTION, 'function', input, current, line)
+    return separate(keywordTokenizer(TokenType.FUNCTION, 'function'))(input, current, line)
 def tokenizeExtend(input, current, line):
-    return tokenizeKeyword(TokenType.EXTEND, "extend", input, current, line)
+    return separate(keywordTokenizer(TokenType.EXTEND, "extend"))(input, current, line)
 def tokenizeImport(input, current, line):
-    return tokenizeKeyword(TokenType.IMPORT, "import", input, current, line)
+    return separate(keywordTokenizer(TokenType.IMPORT, "import"))(input, current, line)
 def tokenizeFrom(input, current, line):
-    return tokenizeKeyword(TokenType.FROM, "from", input, current, line)
+    return separate(keywordTokenizer(TokenType.FROM, "from"))(input, current, line)
 def tokenizeAs(input, current, line):
-    return tokenizeKeyword(TokenType.AS, "as", input, current, line)
+    return separate(keywordTokenizer(TokenType.AS, "as"))(input, current, line)
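Wrapping every keyword tokenizer in separate() keeps a keyword from matching as a prefix of a longer word. A hedged illustration (input strings invented):

    tokenize_return = separate(keywordTokenizer(TokenType.RETURN, 'return'))
    tokenize_return("return x", 0, 1)  # (6, Token(TokenType.RETURN, 'return', ...))
    tokenize_return("returned", 0, 1)  # (0, None) - 'e' follows the keyword, so no separator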

View File

@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeMinus(input, current, line):
-    return tokenizeChar(TokenType.MINUS, '-', input, current, line)
+    return charTokenizer(TokenType.MINUS, '-')(input, current, line)

View File

@@ -1,8 +1,9 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeOpenParen(input, current, line):
-    return tokenizeChar(TokenType.OPEN_PAREN, '(', input, current, line)
+    return charTokenizer(TokenType.OPEN_PAREN, '(')(input, current, line)
 def tokenizeCloseParen(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_PAREN, ')', input, current, line)
+    return charTokenizer(TokenType.CLOSE_PAREN, ')')(input, current, line)

View File

@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizePercent(input, current, line):
-    return tokenizeChar(TokenType.PERCENT, '%', input, current, line)
+    return charTokenizer(TokenType.PERCENT, '%')(input, current, line)

View File

@@ -1,8 +1,8 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 def tokenizeOpenSquare(input, current, line):
-    return tokenizeChar(TokenType.OPEN_SQUARE, '[', input, current, line)
+    return charTokenizer(TokenType.OPEN_SQUARE, '[')(input, current, line)
 def tokenizeCloseSquare(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_SQUARE, ']', input, current, line)
+    return charTokenizer(TokenType.CLOSE_SQUARE, ']')(input, current, line)

View File

@@ -1,4 +1,4 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer
 def tokenizeWhitespaces(input, current, line):
-    return tokenizeRegexPattern(None, r'\s', input, current, line)
+    return regexPatternTokenizer(None, r'\s')(input, current, line)

View File

@@ -3,28 +3,55 @@ import re
 from smnp.token.model import Token

-def tokenizeChar(type, char, input, current, line):
-    if input[current] == char:
-        return (1, Token(type, input[current], (line, current)))
-    return (0, None)
+def charTokenizer(type, char):
+    def tokenizer(input, current, line):
+        if input[current] == char:
+            return (1, Token(type, input[current], (line, current)))
+        return (0, None)
+    return tokenizer

-def tokenizeRegexPattern(type, pattern, input, current, line):
-    consumedChars = 0
-    value = ''
-    while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
-        value += input[current+consumedChars]
-        consumedChars += 1
-    return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)
+def regexPatternTokenizer(type, pattern):
+    def tokenizer(input, current, line):
+        consumedChars = 0
+        value = ''
+        while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
+            value += input[current+consumedChars]
+            consumedChars += 1
+        return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)
+    return tokenizer

-def tokenizeKeywords(type, input, current, line, *keywords):
-    for keyword in keywords:
-        result = tokenizeKeyword(type, keyword, input, current, line)
-        if result[0] > 0:
-            return result
-    return (0, None)
+def keywordsTokenizer(type, *keywords):
+    def tokenizer(input, current, line):
+        for keyword in keywords:
+            result = keywordTokenizer(type, keyword)(input, current, line)
+            if result[0] > 0:
+                return result
+        return (0, None)
+    return tokenizer

-def tokenizeKeyword(type, keyword, input, current, line):
-    if len(input) >= current+len(keyword) and input[current:current+len(keyword)] == keyword:
-        return (len(keyword), Token(type, keyword, (line, current)))
-    return (0, None)
+def keywordTokenizer(type, keyword):
+    def tokenizer(input, current, line):
+        if len(input) >= current+len(keyword) and input[current:current+len(keyword)] == keyword:
+            return (len(keyword), Token(type, keyword, (line, current)))
+        return (0, None)
+    return tokenizer
+
+def separate(tokenizer, end=r"\W"):
+    def separated(input, current, line):
+        consumedChars, token = tokenizer(input, current, line)
+        if consumedChars > 0:
+            if len(input) > current+consumedChars and re.match(end, input[current+consumedChars]):
+                return (consumedChars, token)
+            if len(input) == current+consumedChars:
+                return (consumedChars, token)
+        return (0, None)
+    return separated
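The factories above all return closures with the same (input, current, line) -> (consumedChars, token) contract, which is what lets separate() wrap any of them. The driver loop is not part of this diff, but a minimal sketch of how that contract might be consumed could look like this (function and variable names are invented, not taken from the repository):

    def scan(line_text, line_no, tokenizers):
        # try each tokenizer at the current position; the first one that
        # consumes characters wins, otherwise the character is unexpected
        tokens, current = [], 0
        while current < len(line_text):
            for tokenize in tokenizers:
                consumed, token = tokenize(line_text, current, line_no)
                if consumed > 0:
                    tokens.append(token)  # a real lexer would drop whitespace tokens here
                    current += consumed
                    break
            else:
                raise ValueError("no tokenizer matched at position %d" % current)
        return tokens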