Refactor tokenizer

Bartłomiej Pluta
2019-07-03 01:55:08 +02:00
parent 8313d2dcfd
commit f826516d8f
41 changed files with 589 additions and 296 deletions

1
smnp/token/__init__.py Normal file

@@ -0,0 +1 @@
__all__ = ["tokenize"]

54
smnp/token/model.py Normal file

@@ -0,0 +1,54 @@
class Token:
    def __init__(self, type, value, pos):
        self.type = type
        self.value = value
        self.pos = pos

    def __str__(self):
        return f"Token({self.type}, '{self.value}', {self.pos})"

    def __repr__(self):
        return self.__str__()


class TokenList:
    def __init__(self, tokens=None):
        # Avoid a mutable default argument: a shared default list would
        # leak state between TokenList instances
        self.tokens = tokens if tokens is not None else []
        self.cursor = 0
        self.snap = 0

    def append(self, token):
        self.tokens.append(token)

    def __getitem__(self, index):
        return self.tokens[index]

    def current(self):
        if self.cursor >= len(self.tokens):
            raise RuntimeError(f"Cursor points to a non-existing token! Cursor = {self.cursor}, len = {len(self.tokens)}")
        return self.tokens[self.cursor]

    def next(self, number=1):
        return self.tokens[self.cursor + number]

    def prev(self, number=1):
        return self.tokens[self.cursor - number]

    def hasMore(self, count=1):
        return self.cursor + count < len(self.tokens)

    def hasCurrent(self):
        return self.cursor < len(self.tokens)

    def ahead(self):
        self.cursor += 1

    def snapshot(self):
        # Store the cursor in a separate attribute; assigning to
        # self.snapshot here would shadow this method
        self.snap = self.cursor

    def reset(self):
        self.cursor = self.snap
        return self.tokens[self.cursor]

    def __str__(self):
        return f"[Cursor: {self.cursor}\n{', '.join(str(token) for token in self.tokens)}]"

    def __repr__(self):
        return self.__str__()
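
The snapshot/reset pair is what enables backtracking during parsing: mark the cursor, try a production, rewind on failure. A minimal sketch of that usage (the token values here are made up for illustration):

from smnp.token.model import Token, TokenList
from smnp.token.type import TokenType

tokens = TokenList([
    Token(TokenType.IDENTIFIER, "x", (0, 0)),
    Token(TokenType.ASSIGN, "=", (0, 2)),
    Token(TokenType.INTEGER, "1", (0, 4)),
])

tokens.snapshot()              # remember cursor position 0
tokens.ahead()                 # speculatively consume tokens...
tokens.ahead()
tokens.reset()                 # ...and rewind after a failed parse
assert tokens.current().value == "x"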

81
smnp/token/tokenizer.py Normal file

@@ -0,0 +1,81 @@
from smnp.error.syntax import SyntaxException
from smnp.token.type import TokenType
from smnp.token.model import TokenList
from smnp.token.tokenizers.paren import tokenizeOpenParen, tokenizeCloseParen
from smnp.token.tokenizers.asterisk import tokenizeAsterisk
from smnp.token.tokenizers.whitespace import tokenizeWhitespaces
from smnp.token.tokenizers.identifier import tokenizeIdentifier
from smnp.token.tokenizers.comma import tokenizeComma
from smnp.token.tokenizers.string import tokenizeString
from smnp.token.tokenizers.integer import tokenizeInteger
from smnp.token.tokenizers.bracket import tokenizeOpenBracket, tokenizeCloseBracket
from smnp.token.tokenizers.assign import tokenizeAssign
from smnp.token.tokenizers.colon import tokenizeColon
from smnp.token.tokenizers.comment import tokenizeComment
from smnp.token.tokenizers.note import tokenizeNote
from smnp.token.tokenizers.function import tokenizeFunction
from smnp.token.tokenizers.ret import tokenizeReturn
from smnp.token.tokenizers.percent import tokenizePercent
from smnp.token.tokenizers.minus import tokenizeMinus
from smnp.token.tokenizers.dot import tokenizeDot
# Order matters: the keyword tokenizers ('function', 'return') and
# tokenizeInteger must run before tokenizeIdentifier, whose r'\w'
# pattern also matches keywords and digits
tokenizers = (
    tokenizeOpenParen,
    tokenizeCloseParen,
    tokenizeAsterisk,
    tokenizeString,
    tokenizeFunction,
    tokenizeReturn,
    tokenizeInteger,
    tokenizeNote,
    tokenizeIdentifier,
    tokenizeComma,
    tokenizeOpenBracket,
    tokenizeCloseBracket,
    tokenizeAssign,
    tokenizeColon,
    tokenizePercent,
    tokenizeMinus,
    tokenizeDot,
    tokenizeComment,
    tokenizeWhitespaces,
)

# Whitespace tokens carry type None; drop them and comments from the stream
filters = [
    lambda token: token.type is not None,
    lambda token: token.type != TokenType.COMMENT
]
def tokenize(lines):
    tokens = []
    for lineNumber, line in enumerate(lines):
        current = 0
        while current < len(line):
            consumedChars, token = combinedTokenizer(line, current, lineNumber)
            if consumedChars == 0:
                raise SyntaxException((lineNumber, current), f"Unknown symbol '{line[current]}'")
            current += consumedChars
            tokens.append(token)
    # Materialize the filtered stream: filterTokens returns a generator,
    # which would break the len()-based methods of TokenList
    return TokenList(list(filterTokens(filters, tokens)))

def combinedTokenizer(line, current, lineNumber):
    # Try each tokenizer in order; the first one to consume characters wins
    for tokenizer in tokenizers:
        consumedChars, token = tokenizer(line, current, lineNumber)
        if consumedChars > 0:
            return (consumedChars, token)
    return (0, None)

def filterTokens(filters, tokens):
    if not filters:
        return tokens
    return filterTokens(filters[1:], (token for token in tokens if filters[0](token)))
__all__ = ["tokenize"]
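
End to end, tokenize() walks each line, delegates to the first tokenizer that consumes characters, and returns a filtered TokenList. A minimal sketch of the entry point (the sample source is invented; any text matching the token types above would do):

from smnp.token.tokenizer import tokenize

source = [
    'x = 5',
    'function f() { return x }',
]

tokens = tokenize(source)
while tokens.hasCurrent():
    print(tokens.current())    # e.g. Token(TokenType.IDENTIFIER, 'x', (0, 0))
    tokens.ahead()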

5
smnp/token/tokenizers/assign.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeAssign(input, current, line):
    return tokenizeChar(TokenType.ASSIGN, '=', input, current, line)

5
smnp/token/tokenizers/asterisk.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeAsterisk(input, current, line):
    return tokenizeChar(TokenType.ASTERISK, '*', input, current, line)

8
smnp/token/tokenizers/bracket.py Normal file

@@ -0,0 +1,8 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeOpenBracket(input, current, line):
    return tokenizeChar(TokenType.OPEN_BRACKET, '{', input, current, line)

def tokenizeCloseBracket(input, current, line):
    return tokenizeChar(TokenType.CLOSE_BRACKET, '}', input, current, line)

5
smnp/token/tokenizers/colon.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeColon(input, current, line):
    return tokenizeChar(TokenType.COLON, ':', input, current, line)

5
smnp/token/tokenizers/comma.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeComma(input, current, line):
    return tokenizeChar(TokenType.COMMA, ',', input, current, line)

13
smnp/token/tokenizers/comment.py Normal file

@@ -0,0 +1,13 @@
from smnp.token.type import TokenType
from smnp.token.model import Token

def tokenizeComment(input, current, line):
    # A comment runs from '#' to the end of the line
    if input[current] == '#':
        consumedChars = 0
        value = ''
        while current+consumedChars < len(input):
            value += input[current+consumedChars]
            consumedChars += 1
        return (consumedChars, Token(TokenType.COMMENT, value, (line, current)))
    return (0, None)

5
smnp/token/tokenizers/dot.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeDot(input, current, line):
    return tokenizeChar(TokenType.DOT, '.', input, current, line)

5
smnp/token/tokenizers/function.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeKeyword
from smnp.token.type import TokenType

def tokenizeFunction(input, current, line):
    return tokenizeKeyword(TokenType.FUNCTION, 'function', input, current, line)

5
smnp/token/tokenizers/identifier.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeRegexPattern
from smnp.token.type import TokenType

def tokenizeIdentifier(input, current, line):
    return tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)

5
smnp/token/tokenizers/integer.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeRegexPattern
from smnp.token.type import TokenType

def tokenizeInteger(input, current, line):
    return tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)

5
smnp/token/tokenizers/minus.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeMinus(input, current, line):
    return tokenizeChar(TokenType.MINUS, '-', input, current, line)

37
smnp/token/tokenizers/note.py Normal file

@@ -0,0 +1,37 @@
import re
from smnp.token.type import TokenType
from smnp.token.model import Token

def tokenizeNote(input, current, line):
    # A note literal looks like @C, @d#4 or @a4.8d: '@', a pitch letter,
    # an optional accidental ('b' or '#'), an optional octave digit and
    # an optional '.<duration>[d]' suffix
    consumedChars = 0
    value = ''
    if input[current] == '@':
        consumedChars += 1
        value += input[current]
        if current+consumedChars < len(input) and input[current+consumedChars] in 'CcDdEeFfGgAaHhBb':
            value += input[current+consumedChars]
            consumedChars += 1
            if current+consumedChars < len(input) and input[current+consumedChars] in ('b', '#'):
                value += input[current+consumedChars]
                consumedChars += 1
            if current+consumedChars < len(input) and re.match(r'\d', input[current+consumedChars]):
                value += input[current+consumedChars]
                consumedChars += 1
            if current+consumedChars < len(input) and input[current+consumedChars] == '.':
                duration = input[current+consumedChars]
                consumedChars += 1
                while current+consumedChars < len(input) and re.match(r'\d', input[current+consumedChars]):
                    duration += input[current+consumedChars]
                    consumedChars += 1
                if current+consumedChars < len(input) and input[current+consumedChars] == 'd':
                    duration += input[current+consumedChars]
                    consumedChars += 1
                if len(duration) > 1:
                    value += duration
                else:
                    # A lone '.' is not a duration suffix - give it back
                    consumedChars -= 1
        return (consumedChars, Token(TokenType.NOTE, value, (line, current)))
    return (0, None)
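
For reference, a few literals this grammar accepts, assuming the nesting reconstructed above (expected results shown as comments):

from smnp.token.tokenizers.note import tokenizeNote

print(tokenizeNote('@C', 0, 0))      # (2, Token(TokenType.NOTE, '@C', (0, 0)))
print(tokenizeNote('@d#4', 0, 0))    # (4, Token(TokenType.NOTE, '@d#4', (0, 0)))
print(tokenizeNote('@a4.8d', 0, 0))  # (6, Token(TokenType.NOTE, '@a4.8d', (0, 0)))
print(tokenizeNote('x', 0, 0))       # (0, None) - not a note literal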

8
smnp/token/tokenizers/paren.py Normal file

@@ -0,0 +1,8 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizeOpenParen(input, current, line):
    return tokenizeChar(TokenType.OPEN_PAREN, '(', input, current, line)

def tokenizeCloseParen(input, current, line):
    return tokenizeChar(TokenType.CLOSE_PAREN, ')', input, current, line)

5
smnp/token/tokenizers/percent.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeChar
from smnp.token.type import TokenType

def tokenizePercent(input, current, line):
    return tokenizeChar(TokenType.PERCENT, '%', input, current, line)

5
smnp/token/tokenizers/ret.py Normal file

@@ -0,0 +1,5 @@
from smnp.token.tools import tokenizeKeyword
from smnp.token.type import TokenType

def tokenizeReturn(input, current, line):
    return tokenizeKeyword(TokenType.RETURN, 'return', input, current, line)

16
smnp/token/tokenizers/string.py Normal file

@@ -0,0 +1,16 @@
from smnp.error.syntax import SyntaxException
from smnp.token.type import TokenType
from smnp.token.model import Token

def tokenizeString(input, current, line):
    if input[current] == '"':
        value = input[current]
        char = ''
        consumedChars = 1
        while char != '"':
            if current + consumedChars >= len(input):
                # Reached the end of the line before the closing quote
                raise SyntaxException((line, current), "String literal is not terminated")
            char = input[current + consumedChars]
            value += char
            consumedChars += 1
        return (consumedChars, Token(TokenType.STRING, value, (line, current)))
    return (0, None)

4
smnp/token/tokenizers/whitespace.py Normal file

@@ -0,0 +1,4 @@
from smnp.token.tools import tokenizeRegexPattern

def tokenizeWhitespaces(input, current, line):
    # Whitespace tokens get type None and are filtered out by tokenize()
    return tokenizeRegexPattern(None, r'\s', input, current, line)

21
smnp/token/tools.py Normal file

@@ -0,0 +1,21 @@
import re
from smnp.token.model import Token

def tokenizeChar(type, char, input, current, line):
    if input[current] == char:
        return (1, Token(type, input[current], (line, current)))
    return (0, None)

def tokenizeRegexPattern(type, pattern, input, current, line):
    consumedChars = 0
    value = ''
    while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
        value += input[current+consumedChars]
        consumedChars += 1
    return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)

def tokenizeKeyword(type, keyword, input, current, line):
    # Require a non-word character (or end of line) after the keyword, so
    # that e.g. 'functional' is not tokenized as 'function' plus 'al'
    endsAtBoundary = current+len(keyword) >= len(input) or not re.match(r'\w', input[current+len(keyword)])
    if input[current:current+len(keyword)] == keyword and endsAtBoundary:
        return (len(keyword), Token(type, keyword, (line, current)))
    return (0, None)
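
A quick illustration of the three helpers' shared contract, a (consumedChars, token) pair with (0, None) on no match (expected results shown as comments):

from smnp.token.tools import tokenizeChar, tokenizeKeyword, tokenizeRegexPattern
from smnp.token.type import TokenType

line = 'return 42'
print(tokenizeKeyword(TokenType.RETURN, 'return', line, 0, 0))     # (6, Token(TokenType.RETURN, 'return', (0, 0)))
print(tokenizeRegexPattern(TokenType.INTEGER, r'\d', line, 7, 0))  # (2, Token(TokenType.INTEGER, '42', (0, 7)))
print(tokenizeChar(TokenType.DOT, '.', line, 0, 0))                # (0, None): no match, nothing consumed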

21
smnp/token/type.py Normal file

@@ -0,0 +1,21 @@
from enum import Enum

class TokenType(Enum):
    OPEN_PAREN = 1
    CLOSE_PAREN = 2
    ASTERISK = 3
    STRING = 4
    IDENTIFIER = 5
    COMMA = 6
    INTEGER = 7
    OPEN_BRACKET = 8
    CLOSE_BRACKET = 9
    ASSIGN = 10
    COLON = 11
    NOTE = 12
    COMMENT = 13
    PERCENT = 14
    MINUS = 15
    FUNCTION = 16
    RETURN = 17
    DOT = 18