Refactor tokenizer
This commit is contained in:
1
smnp/token/__init__.py
Normal file
1
smnp/token/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Names exported by `from smnp.token import *`.
# NOTE(review): this file contains only this line, so nothing here binds
# `tokenize`; the function lives in smnp/token/tokenizer.py. A star-import
# would presumably fail on the missing name - confirm and import it if so.
__all__ = ["tokenize"]
|
||||
54
smnp/token/model.py
Normal file
54
smnp/token/model.py
Normal file
@@ -0,0 +1,54 @@
|
||||
class Token:
    """A single lexical token.

    The constructor stores its three arguments unchanged:

    * ``type``  - token category (the tokenizers pass a ``TokenType`` member,
      or ``None`` for whitespace),
    * ``value`` - the matched text,
    * ``pos``   - position of the token; the tokenizers pass a
      ``(line, column)`` tuple.
    """

    def __init__(self, type, value, pos):
        self.type = type
        self.value = value
        self.pos = pos

    def __str__(self):
        # Formatting (rather than ``+`` concatenation) keeps this usable for
        # debugging even when ``value`` is not a str.
        return f"Token({self.type}, '{self.value}', {self.pos})"

    def __repr__(self):
        return self.__str__()
|
||||
|
||||
class TokenList:
    """List of tokens with a movable cursor and one saved cursor position.

    Attributes:
        tokens: the underlying list of tokens.
        cursor: index of the current token.
        snap:   cursor value stored by the last ``snapshot()`` call (0 initially).
    """

    def __init__(self, tokens=None):
        if tokens is None:
            # A fresh list per instance; a shared ``[]`` default would make
            # every default-constructed TokenList append to the same list.
            tokens = []
        elif not isinstance(tokens, list):
            # Materialize generators/tuples so len(), indexing and append() work.
            tokens = list(tokens)
        self.tokens = tokens
        self.cursor = 0
        self.snap = 0

    def append(self, token):
        """Add ``token`` at the end of the list."""
        self.tokens.append(token)

    def __getitem__(self, index):
        return self.tokens[index]

    def current(self):
        """Return the token under the cursor.

        Raises:
            RuntimeError: if the cursor is past the last token.
        """
        if self.cursor >= len(self.tokens):
            raise RuntimeError(f"Cursor points to not existing token! Cursor = {self.cursor}, len = {len(self.tokens)}")
        return self.tokens[self.cursor]

    def next(self, number=1):
        """Return the token ``number`` positions after the cursor (cursor unchanged)."""
        return self.tokens[self.cursor + number]

    def prev(self, number=1):
        """Return the token ``number`` positions before the cursor (cursor unchanged).

        Plain list indexing is used, so a negative resulting index wraps around.
        """
        return self.tokens[self.cursor - number]

    def hasMore(self, count=1):
        """Tell whether a token exists ``count`` positions after the cursor."""
        return self.cursor + count < len(self.tokens)

    def hasCurrent(self):
        """Tell whether the cursor points at an existing token."""
        return self.cursor < len(self.tokens)

    def ahead(self):
        """Move the cursor one token forward."""
        self.cursor += 1

    def snapshot(self):
        """Remember the current cursor position for a later ``reset()``."""
        # Stored in ``snap``: assigning to ``self.snapshot`` would replace this
        # very method with an int and make a second call fail.
        self.snap = self.cursor

    def reset(self):
        """Move the cursor back to the remembered position and return that token."""
        self.cursor = self.snap
        return self.tokens[self.cursor]

    def __str__(self):
        return f"[Cursor: {self.cursor}\n{', '.join([str(token) for token in self.tokens])}]"

    def __repr__(self):
        return self.__str__()
|
||||
81
smnp/token/tokenizer.py
Normal file
81
smnp/token/tokenizer.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import sys
|
||||
import time
|
||||
import re
|
||||
from smnp.error.syntax import SyntaxException
|
||||
from smnp.token.type import TokenType
|
||||
from smnp.token.model import Token, TokenList
|
||||
from smnp.token.tools import tokenizeChar, tokenizeRegexPattern
|
||||
from smnp.token.tokenizers.paren import tokenizeOpenParen, tokenizeCloseParen
|
||||
from smnp.token.tokenizers.asterisk import tokenizeAsterisk
|
||||
from smnp.token.tokenizers.whitespace import tokenizeWhitespaces
|
||||
from smnp.token.tokenizers.identifier import tokenizeIdentifier
|
||||
from smnp.token.tokenizers.comma import tokenizeComma
|
||||
from smnp.token.tokenizers.string import tokenizeString
|
||||
from smnp.token.tokenizers.integer import tokenizeInteger
|
||||
from smnp.token.tokenizers.bracket import tokenizeOpenBracket, tokenizeCloseBracket
|
||||
from smnp.token.tokenizers.assign import tokenizeAssign
|
||||
from smnp.token.tokenizers.colon import tokenizeColon
|
||||
from smnp.token.tokenizers.comment import tokenizeComment
|
||||
from smnp.token.tokenizers.note import tokenizeNote
|
||||
from smnp.token.tokenizers.function import tokenizeFunction
|
||||
from smnp.token.tokenizers.ret import tokenizeReturn
|
||||
from smnp.token.tokenizers.percent import tokenizePercent
|
||||
from smnp.token.tokenizers.minus import tokenizeMinus
|
||||
from smnp.token.tokenizers.dot import tokenizeDot
|
||||
|
||||
# Tokenizers tried, in this order, by combinedTokenizer(); the first one that
# consumes at least one character wins, so the order is significant:
# the keyword tokenizers (function/return) and tokenizeInteger come before
# tokenizeIdentifier, whose r'\w' pattern would otherwise match them too.
tokenizers = (
    tokenizeOpenParen,
    tokenizeCloseParen,
    tokenizeAsterisk,
    tokenizeString,
    tokenizeFunction,
    tokenizeReturn,
    tokenizeInteger,
    tokenizeNote,
    tokenizeIdentifier,
    tokenizeComma,
    tokenizeOpenBracket,
    tokenizeCloseBracket,
    tokenizeAssign,
    tokenizeColon,
    tokenizePercent,
    tokenizeMinus,
    tokenizeDot,
    tokenizeComment,
    tokenizeWhitespaces,
)
|
||||
|
||||
# Predicates handed to filterTokens() by tokenize(); a token is kept only if
# every predicate returns a truthy value.
filters = [
    # drops tokens created with type None (tokenizeWhitespaces builds those)
    lambda token: token.type is not None,
    # drops comment tokens
    lambda token: token.type != TokenType.COMMENT
]
|
||||
|
||||
def tokenize(lines):
    """Turn an iterable of source lines into a filtered TokenList.

    Each line is scanned left to right with combinedTokenizer(); the
    resulting tokens are passed through the module-level ``filters``.

    Args:
        lines: iterable of strings; the index of each string is used as the
            line number in token positions.

    Returns:
        TokenList holding the tokens that passed all filters.

    Raises:
        SyntaxException: when no tokenizer consumes the character at the
            current position.
    """
    tokens = []
    for lineNumber, line in enumerate(lines):
        current = 0
        while current < len(line):
            consumedChars, token = combinedTokenizer(line, current, lineNumber)

            if consumedChars == 0:
                raise SyntaxException((lineNumber, current), f"Unknown symbol '{line[current]}'")

            current += consumedChars
            tokens.append(token)

    # filterTokens() may hand back a lazy generator; TokenList relies on
    # len() and indexing, so give it a real list.
    return TokenList(list(filterTokens(filters, tokens)))
|
||||
|
||||
def combinedTokenizer(line, current, lineNumber):
    """Try every entry of ``tokenizers`` at ``line[current]``, in order.

    Returns the ``(consumedChars, token)`` pair of the first tokenizer that
    consumed at least one character, or ``(0, None)`` when none did.
    """
    for attempt in tokenizers:
        consumed, produced = attempt(line, current, lineNumber)
        if consumed <= 0:
            continue
        return (consumed, produced)
    return (0, None)
|
||||
|
||||
def filterTokens(filters, tokens):
    """Return the tokens accepted by every predicate in ``filters``.

    Predicates are evaluated per token in the order given and evaluation
    stops at the first one that rejects the token.

    Args:
        filters: sequence of one-argument predicates; may be empty.
        tokens: iterable of tokens.

    Returns:
        A new list (always a list, so callers can use len() and indexing)
        with the accepted tokens in their original order.
    """
    return [token for token in tokens if all(accept(token) for accept in filters)]
|
||||
|
||||
# Only tokenize() is exported by a star-import of this module.
__all__ = ["tokenize"]
|
||||
0
smnp/token/tokenizers/__init__.py
Normal file
0
smnp/token/tokenizers/__init__.py
Normal file
5
smnp/token/tokenizers/assign.py
Normal file
5
smnp/token/tokenizers/assign.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeAssign(input, current, line):
    """Delegate to tokenizeChar for the '=' character (TokenType.ASSIGN)."""
    outcome = tokenizeChar(TokenType.ASSIGN, '=', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/asterisk.py
Normal file
5
smnp/token/tokenizers/asterisk.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeAsterisk(input, current, line):
    """Delegate to tokenizeChar for the '*' character (TokenType.ASTERISK)."""
    outcome = tokenizeChar(TokenType.ASTERISK, '*', input, current, line)
    return outcome
|
||||
8
smnp/token/tokenizers/bracket.py
Normal file
8
smnp/token/tokenizers/bracket.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeOpenBracket(input, current, line):
    """Delegate to tokenizeChar for the '{' character (TokenType.OPEN_BRACKET)."""
    outcome = tokenizeChar(TokenType.OPEN_BRACKET, '{', input, current, line)
    return outcome
|
||||
|
||||
def tokenizeCloseBracket(input, current, line):
    """Delegate to tokenizeChar for the '}' character (TokenType.CLOSE_BRACKET)."""
    outcome = tokenizeChar(TokenType.CLOSE_BRACKET, '}', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/colon.py
Normal file
5
smnp/token/tokenizers/colon.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeColon(input, current, line):
    """Delegate to tokenizeChar for the ':' character (TokenType.COLON)."""
    outcome = tokenizeChar(TokenType.COLON, ':', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/comma.py
Normal file
5
smnp/token/tokenizers/comma.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeComma(input, current, line):
    """Delegate to tokenizeChar for the ',' character (TokenType.COMMA)."""
    outcome = tokenizeChar(TokenType.COMMA, ',', input, current, line)
    return outcome
|
||||
13
smnp/token/tokenizers/comment.py
Normal file
13
smnp/token/tokenizers/comment.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from smnp.token.type import TokenType
|
||||
from smnp.token.model import Token
|
||||
|
||||
def tokenizeComment(input, current, line):
    """Tokenize a comment starting at ``input[current]``.

    When that character is '#', every character from there to the end of
    ``input`` becomes the value of a TokenType.COMMENT token and
    ``(consumed, token)`` is returned; otherwise ``(0, None)``.
    """
    if input[current] != '#':
        return (0, None)

    # Collect every remaining character of the line, '#' included.
    chars = [input[index] for index in range(current, len(input))]
    text = ''.join(chars)
    return (len(chars), Token(TokenType.COMMENT, text, (line, current)))
|
||||
5
smnp/token/tokenizers/dot.py
Normal file
5
smnp/token/tokenizers/dot.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeDot(input, current, line):
    """Delegate to tokenizeChar for the '.' character (TokenType.DOT)."""
    outcome = tokenizeChar(TokenType.DOT, '.', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/function.py
Normal file
5
smnp/token/tokenizers/function.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeKeyword
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeFunction(input, current, line):
    """Delegate to tokenizeKeyword for the 'function' keyword (TokenType.FUNCTION)."""
    outcome = tokenizeKeyword(TokenType.FUNCTION, 'function', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/identifier.py
Normal file
5
smnp/token/tokenizers/identifier.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeRegexPattern
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeIdentifier(input, current, line):
    """Delegate to tokenizeRegexPattern with the per-character pattern r'\\w'
    and TokenType.IDENTIFIER."""
    outcome = tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/integer.py
Normal file
5
smnp/token/tokenizers/integer.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeRegexPattern
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeInteger(input, current, line):
    """Delegate to tokenizeRegexPattern with the per-character pattern r'\\d'
    and TokenType.INTEGER."""
    outcome = tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/minus.py
Normal file
5
smnp/token/tokenizers/minus.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeMinus(input, current, line):
    """Delegate to tokenizeChar for the '-' character (TokenType.MINUS)."""
    outcome = tokenizeChar(TokenType.MINUS, '-', input, current, line)
    return outcome
|
||||
37
smnp/token/tokenizers/note.py
Normal file
37
smnp/token/tokenizers/note.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import re
|
||||
from smnp.token.type import TokenType
|
||||
from smnp.token.model import Token
|
||||
|
||||
# '@', one letter out of C D E F G A H B (either case), then optionally:
# one 'b' or '#', one digit, and a '.'-introduced duration made of digits
# and/or a trailing 'd'. A bare '.' with nothing after it is not consumed.
_NOTE_PATTERN = re.compile(r'@[CcDdEeFfGgAaHhBb][b#]?\d?(?:\.(?:\d+d?|d))?')


def tokenizeNote(input, current, line):
    """Tokenize a note literal starting at ``input[current]``.

    Args:
        input: the line being tokenized.
        current: index at which to try matching.
        line: line number stored in the token position.

    Returns:
        ``(consumed, Token(TokenType.NOTE, text, (line, current)))`` when a
        note starts at ``current``; ``(0, None)`` otherwise, including when
        '@' is the last character of the line or is not followed by a note
        letter.
    """
    found = _NOTE_PATTERN.match(input, current)
    if found is None:
        return (0, None)

    value = found.group()
    return (len(value), Token(TokenType.NOTE, value, (line, current)))
|
||||
8
smnp/token/tokenizers/paren.py
Normal file
8
smnp/token/tokenizers/paren.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeOpenParen(input, current, line):
    """Delegate to tokenizeChar for the '(' character (TokenType.OPEN_PAREN)."""
    outcome = tokenizeChar(TokenType.OPEN_PAREN, '(', input, current, line)
    return outcome
|
||||
|
||||
def tokenizeCloseParen(input, current, line):
    """Delegate to tokenizeChar for the ')' character (TokenType.CLOSE_PAREN)."""
    outcome = tokenizeChar(TokenType.CLOSE_PAREN, ')', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/percent.py
Normal file
5
smnp/token/tokenizers/percent.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeChar
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizePercent(input, current, line):
    """Delegate to tokenizeChar for the '%' character (TokenType.PERCENT)."""
    outcome = tokenizeChar(TokenType.PERCENT, '%', input, current, line)
    return outcome
|
||||
5
smnp/token/tokenizers/ret.py
Normal file
5
smnp/token/tokenizers/ret.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from smnp.token.tools import tokenizeKeyword
|
||||
from smnp.token.type import TokenType
|
||||
|
||||
def tokenizeReturn(input, current, line):
    """Delegate to tokenizeKeyword for the 'return' keyword (TokenType.RETURN)."""
    outcome = tokenizeKeyword(TokenType.RETURN, 'return', input, current, line)
    return outcome
|
||||
16
smnp/token/tokenizers/string.py
Normal file
16
smnp/token/tokenizers/string.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from smnp.token.type import TokenType
|
||||
from smnp.token.model import Token
|
||||
|
||||
def tokenizeString(input, current, line):
    """Tokenize a double-quoted string starting at ``input[current]``.

    The value keeps both quote characters. No escape sequences are
    recognized: the string ends at the next '"'.

    Args:
        input: the line being tokenized.
        current: index at which to try matching.
        line: line number stored in the token position.

    Returns:
        ``(consumed, Token(TokenType.STRING, text, (line, current)))`` for a
        terminated string; ``(0, None)`` when ``input[current]`` is not '"'
        or when no closing quote exists on the line. Returning "no match"
        for an unterminated string lets tokenize() raise its regular
        SyntaxException at the opening quote instead of an IndexError
        escaping from here.
    """
    if input[current] != '"':
        return (0, None)

    closing = input.find('"', current + 1)
    if closing == -1:
        # String not terminated on this line.
        return (0, None)

    value = input[current:closing + 1]
    return (len(value), Token(TokenType.STRING, value, (line, current)))
|
||||
4
smnp/token/tokenizers/whitespace.py
Normal file
4
smnp/token/tokenizers/whitespace.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from smnp.token.tools import tokenizeRegexPattern
|
||||
|
||||
def tokenizeWhitespaces(input, current, line):
    """Delegate to tokenizeRegexPattern with the per-character pattern r'\\s';
    the token type passed is None."""
    outcome = tokenizeRegexPattern(None, r'\s', input, current, line)
    return outcome
|
||||
21
smnp/token/tools.py
Normal file
21
smnp/token/tools.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import re
|
||||
from smnp.token.model import Token
|
||||
|
||||
def tokenizeChar(type, char, input, current, line):
    """Match the single character ``char`` at ``input[current]``.

    Returns ``(1, Token(type, <that character>, (line, current)))`` on a
    match and ``(0, None)`` otherwise.
    """
    found = input[current]
    if found != char:
        return (0, None)
    return (1, Token(type, found, (line, current)))
|
||||
|
||||
def tokenizeRegexPattern(type, pattern, input, current, line):
    """Consume the run of characters starting at ``input[current]`` that each
    match ``pattern``.

    ``pattern`` is applied with ``re.match`` to one character at a time.
    Returns ``(consumed, Token(type, text, (line, current)))`` when at least
    one character matched, ``(0, None)`` otherwise.
    """
    matched = []
    index = current
    while index < len(input) and re.match(pattern, input[index]):
        matched.append(input[index])
        index += 1

    token = Token(type, ''.join(matched), (line, current)) if matched else None
    return (len(matched), token)
|
||||
|
||||
def tokenizeKeyword(type, keyword, input, current, line):
    """Match the whole word ``keyword`` at ``input[current]``.

    The keyword must not be immediately followed by a word character
    (``\\w``), so text such as ``returned`` is left for the identifier
    tokenizer instead of being split into a keyword plus ``ed``.

    Args:
        type: token type to put into the created token.
        keyword: the exact text to look for.
        input: the line being tokenized.
        current: index at which to try matching.
        line: line number stored in the token position.

    Returns:
        ``(len(keyword), Token(type, keyword, (line, current)))`` on a match,
        ``(0, None)`` otherwise.
    """
    if not input.startswith(keyword, current):
        return (0, None)

    end = current + len(keyword)
    if end < len(input) and re.match(r'\w', input[end]):
        # keyword is only the prefix of a longer word
        return (0, None)

    return (len(keyword), Token(type, keyword, (line, current)))
|
||||
21
smnp/token/type.py
Normal file
21
smnp/token/type.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from enum import Enum
|
||||
|
||||
class TokenType(Enum):
    """Categories a Token can be tagged with by the tokenizers."""

    # single-character punctuation
    OPEN_PAREN = 1
    CLOSE_PAREN = 2
    ASTERISK = 3
    # double-quoted text, quotes included (see tokenizeString)
    STRING = 4
    IDENTIFIER = 5
    COMMA = 6
    INTEGER = 7
    # '{' and '}' (see tokenizeOpenBracket / tokenizeCloseBracket)
    OPEN_BRACKET = 8
    CLOSE_BRACKET = 9
    ASSIGN = 10
    COLON = 11
    # '@'-prefixed literal (see tokenizeNote)
    NOTE = 12
    # '#' up to the end of the line; removed again by the tokenizer's filters
    COMMENT = 13
    PERCENT = 14
    MINUS = 15
    # keywords 'function' and 'return'
    FUNCTION = 16
    RETURN = 17
    DOT = 18
|
||||
Reference in New Issue
Block a user