Enable tokenizer to support separators between keywords and integers
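What "support separators" means in practice: after this change a keyword or integer match is only accepted when it is followed by a non-word character or by end of input, so `return5` no longer lexes as the keyword `return` plus the integer `5`. A minimal behavior sketch of the `separate` combinator introduced in the tools.py hunk at the bottom (plain tuples stand in for `smnp.token.model.Token`, and the two end-of-input checks are folded into one condition; this is a simplification, not the exact code):

import re

def keywordTokenizer(type, keyword):
    # Factory style from this commit: bind the keyword now, scan later.
    def tokenizer(input, current, line):
        if len(input) >= current + len(keyword) and input[current:current + len(keyword)] == keyword:
            return (len(keyword), (type, keyword, (line, current)))  # stand-in for Token(...)
        return (0, None)
    return tokenizer

def separate(tokenizer, end=r"\W"):
    # Accept the inner match only when followed by a separator (non-word char) or end of input.
    def separated(input, current, line):
        consumedChars, token = tokenizer(input, current, line)
        if consumedChars > 0:
            if len(input) == current + consumedChars or re.match(end, input[current + consumedChars]):
                return (consumedChars, token)
        return (0, None)
    return separated

tokenizeReturn = separate(keywordTokenizer('RETURN', 'return'))
print(tokenizeReturn('return 5', 0, 1))  # (6, ('RETURN', 'return', (1, 0)))
print(tokenizeReturn('return5', 0, 1))   # (0, None) -- '5' is no separator; this lexes as an identifier instead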
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeAssign(input, current, line):
-    return tokenizeChar(TokenType.ASSIGN, '=', input, current, line)
+    return charTokenizer(TokenType.ASSIGN, '=')(input, current, line)

@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeAsterisk(input, current, line):
-    return tokenizeChar(TokenType.ASTERISK, '*', input, current, line)
+    return charTokenizer(TokenType.ASTERISK, '*')(input, current, line)

@@ -1,8 +1,10 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeOpenBracket(input, current, line):
-    return tokenizeChar(TokenType.OPEN_BRACKET, '{', input, current, line)
+    return charTokenizer(TokenType.OPEN_BRACKET, '{')(input, current, line)
 
+
 def tokenizeCloseBracket(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_BRACKET, '}', input, current, line)
+    return charTokenizer(TokenType.CLOSE_BRACKET, '}')(input, current, line)

@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeComma(input, current, line):
-    return tokenizeChar(TokenType.COMMA, ',', input, current, line)
+    return charTokenizer(TokenType.COMMA, ',')(input, current, line)

@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeDot(input, current, line):
-    return tokenizeChar(TokenType.DOT, '.', input, current, line)
+    return charTokenizer(TokenType.DOT, '.')(input, current, line)

@@ -1,5 +1,6 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeIdentifier(input, current, line):
-    return tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)
+    # TODO: Disallow identifiers beginning with a number
+    return regexPatternTokenizer(TokenType.IDENTIFIER, r'\w')(input, current, line)

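Worth noting next to the new TODO: `\w` also matches digits, so once integers demand a trailing separator (next hunk), an input like `123abc` is consumed whole by the identifier pattern instead. A quick check of the two patterns used in these hunks:

import re

# The identifier pattern accepts digits, so a digit-led run like '123abc'
# still satisfies r'\w' char by char, while r'\d' stops at the first letter.
print(bool(re.match(r'\w', '1')))  # True  -- digits are word characters
print(bool(re.match(r'\d', 'a')))  # False -- letters end a digit run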
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer, separate
 from smnp.token.type import TokenType
 
 def tokenizeInteger(input, current, line):
-    return tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)
+    return separate(regexPatternTokenizer(TokenType.INTEGER, r'\d'))(input, current, line)

@@ -1,34 +1,34 @@
-from smnp.token.tools import tokenizeKeywords, tokenizeKeyword
+from smnp.token.tools import keywordsTokenizer, keywordTokenizer, separate
 from smnp.token.type import TokenType
 from smnp.type.model import Type
 
 
 def tokenizeType(input, current, line):
     types = [ type.name.lower() for type in Type ]
-    return tokenizeKeywords(TokenType.TYPE, input, current, line, *types)
+    return separate(keywordsTokenizer(TokenType.TYPE, *types))(input, current, line)
 
 
 def tokenizeReturn(input, current, line):
-    return tokenizeKeyword(TokenType.RETURN, 'return', input, current, line)
+    return separate(keywordTokenizer(TokenType.RETURN, 'return'))(input, current, line)
 
 
 def tokenizeFunction(input, current, line):
-    return tokenizeKeyword(TokenType.FUNCTION, 'function', input, current, line)
+    return separate(keywordTokenizer(TokenType.FUNCTION, 'function'))(input, current, line)
 
 
 def tokenizeExtend(input, current, line):
-    return tokenizeKeyword(TokenType.EXTEND, "extend", input, current, line)
+    return separate(keywordTokenizer(TokenType.EXTEND, "extend"))(input, current, line)
 
 
 def tokenizeImport(input, current, line):
-    return tokenizeKeyword(TokenType.IMPORT, "import", input, current, line)
+    return separate(keywordTokenizer(TokenType.IMPORT, "import"))(input, current, line)
 
 
 def tokenizeFrom(input, current, line):
-    return tokenizeKeyword(TokenType.FROM, "from", input, current, line)
+    return separate(keywordTokenizer(TokenType.FROM, "from"))(input, current, line)
 
 
 def tokenizeAs(input, current, line):
-    return tokenizeKeyword(TokenType.AS, "as", input, current, line)
+    return separate(keywordTokenizer(TokenType.AS, "as"))(input, current, line)
 
 

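For the multi-keyword case, `keywordsTokenizer` simply tries each bound keyword in turn and returns the first hit. A standalone sketch under simplified assumptions (plain tuples instead of `Token`, made-up type names rather than the real `Type` enum):

def keywordsTokenizer(type, *keywords):
    # Try each keyword at the current position; first match wins.
    def tokenizer(input, current, line):
        for keyword in keywords:
            if input[current:current + len(keyword)] == keyword:
                return (len(keyword), (type, keyword, (line, current)))
        return (0, None)
    return tokenizer

tokenizeType = keywordsTokenizer('TYPE', 'integer', 'string', 'list')
print(tokenizeType('string x', 0, 1))  # (6, ('TYPE', 'string', (1, 0)))

In the actual code the loop delegates to `keywordTokenizer` and the whole thing is wrapped in `separate`, so something like `stringify` would not match as the type `string`.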
@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeMinus(input, current, line):
-    return tokenizeChar(TokenType.MINUS, '-', input, current, line)
+    return charTokenizer(TokenType.MINUS, '-')(input, current, line)

@@ -1,8 +1,9 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
+
 def tokenizeOpenParen(input, current, line):
-    return tokenizeChar(TokenType.OPEN_PAREN, '(', input, current, line)
+    return charTokenizer(TokenType.OPEN_PAREN, '(')(input, current, line)
 
 def tokenizeCloseParen(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_PAREN, ')', input, current, line)
+    return charTokenizer(TokenType.CLOSE_PAREN, ')')(input, current, line)

@@ -1,5 +1,5 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizePercent(input, current, line):
-    return tokenizeChar(TokenType.PERCENT, '%', input, current, line)
+    return charTokenizer(TokenType.PERCENT, '%')(input, current, line)

@@ -1,8 +1,8 @@
-from smnp.token.tools import tokenizeChar
+from smnp.token.tools import charTokenizer
 from smnp.token.type import TokenType
 
 def tokenizeOpenSquare(input, current, line):
-    return tokenizeChar(TokenType.OPEN_SQUARE, '[', input, current, line)
+    return charTokenizer(TokenType.OPEN_SQUARE, '[')(input, current, line)
 
 def tokenizeCloseSquare(input, current, line):
-    return tokenizeChar(TokenType.CLOSE_SQUARE, ']', input, current, line)
+    return charTokenizer(TokenType.CLOSE_SQUARE, ']')(input, current, line)

@@ -1,4 +1,4 @@
-from smnp.token.tools import tokenizeRegexPattern
+from smnp.token.tools import regexPatternTokenizer
 
 def tokenizeWhitespaces(input, current, line):
-    return tokenizeRegexPattern(None, r'\s', input, current, line)
+    return regexPatternTokenizer(None, r'\s')(input, current, line)

@@ -3,28 +3,55 @@ import re
 from smnp.token.model import Token
 
 
-def tokenizeChar(type, char, input, current, line):
-    if input[current] == char:
-        return (1, Token(type, input[current], (line, current)))
-    return (0, None)
-
-def tokenizeRegexPattern(type, pattern, input, current, line):
-    consumedChars = 0
-    value = ''
-
-    while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
-        value += input[current+consumedChars]
-        consumedChars += 1
-    return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)
-
-def tokenizeKeywords(type, input, current, line, *keywords):
-    for keyword in keywords:
-        result = tokenizeKeyword(type, keyword, input, current, line)
-        if result[0] > 0:
-            return result
-    return (0, None)
-
-def tokenizeKeyword(type, keyword, input, current, line):
-    if len(input) >= current+len(keyword) and input[current:current+len(keyword)] == keyword:
-        return (len(keyword), Token(type, keyword, (line, current)))
-    return (0, None)
+def charTokenizer(type, char):
+    def tokenizer(input, current, line):
+        if input[current] == char:
+            return (1, Token(type, input[current], (line, current)))
+        return (0, None)
+
+    return tokenizer
+
+
+def regexPatternTokenizer(type, pattern):
+    def tokenizer(input, current, line):
+        consumedChars = 0
+        value = ''
+
+        while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
+            value += input[current+consumedChars]
+            consumedChars += 1
+        return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)
+
+    return tokenizer
+
+
+def keywordsTokenizer(type, *keywords):
+    def tokenizer(input, current, line):
+        for keyword in keywords:
+            result = keywordTokenizer(type, keyword)(input, current, line)
+            if result[0] > 0:
+                return result
+        return (0, None)
+
+    return tokenizer
+
+
+def keywordTokenizer(type, keyword):
+    def tokenizer(input, current, line):
+        if len(input) >= current+len(keyword) and input[current:current+len(keyword)] == keyword:
+            return (len(keyword), Token(type, keyword, (line, current)))
+        return (0, None)
+    return tokenizer
+
+
+def separate(tokenizer, end=r"\W"):
+    def separated(input, current, line):
+        consumedChars, token = tokenizer(input, current, line)
+        if consumedChars > 0:
+            if len(input) > current+consumedChars and re.match(end, input[current+consumedChars]):
+                return (consumedChars, token)
+            if len(input) == current+consumedChars:
+                return (consumedChars, token)
+        return (0, None)
+
+    return separated
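Taken together, the refactoring turns every tokenizer into a factory that can be composed before it is applied. A self-contained sketch of how the new combinators fit together, with a hypothetical driver loop that is not part of this commit (plain tuples stand in for `Token`, and the two end-of-input checks in `separate` are folded into one condition):

import re

def charTokenizer(type, char):
    # Factory style: bind (type, char) now, scan (input, current, line) later.
    def tokenizer(input, current, line):
        if input[current] == char:
            return (1, (type, input[current], (line, current)))
        return (0, None)
    return tokenizer

def regexPatternTokenizer(type, pattern):
    # Greedily consume characters matching `pattern`, one at a time.
    def tokenizer(input, current, line):
        consumedChars = 0
        value = ''
        while current + consumedChars < len(input) and re.match(pattern, input[current + consumedChars]):
            value += input[current + consumedChars]
            consumedChars += 1
        return (consumedChars, (type, value, (line, current)) if consumedChars > 0 else None)
    return tokenizer

def separate(tokenizer, end=r"\W"):
    # Accept the inner match only when followed by a separator or end of input.
    def separated(input, current, line):
        consumedChars, token = tokenizer(input, current, line)
        if consumedChars > 0:
            if len(input) == current + consumedChars or re.match(end, input[current + consumedChars]):
                return (consumedChars, token)
        return (0, None)
    return separated

tokenizers = [
    separate(regexPatternTokenizer('INTEGER', r'\d')),
    charTokenizer('PLUS', '+'),
    regexPatternTokenizer(None, r'\s'),  # whitespace: consumed but dropped below
]

def tokenize(input, line=1):
    tokens, current = [], 0
    while current < len(input):
        for tokenizer in tokenizers:
            consumed, token = tokenizer(input, current, line)
            if consumed > 0:
                if token is not None and token[0] is not None:
                    tokens.append(token)
                current += consumed
                break
        else:
            raise ValueError('Unexpected character %r at column %d' % (input[current], current))
    return tokens

print(tokenize('12 + 3'))
# [('INTEGER', '12', (1, 0)), ('PLUS', '+', (1, 3)), ('INTEGER', '3', (1, 5))]

The driver, like the token-type names, is an assumption for illustration; the commit itself only converts the tokenizer functions to factories and adds `separate`.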