From aafbd31599bcda73b314ba90624a2f88c29a7b6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Pluta?=
Date: Fri, 28 Jun 2019 20:23:42 +0200
Subject: [PATCH] Create working parser

---
Review notes (this section is ignored by git am):
- note.py: pitch lookup is case-insensitive, because the tokenizer accepts
  both @C and @c; the lookup table is built once at import time.
- parser.py: malformed input (unexpected token, missing ')' or '}', operator
  with nothing on its left, note with '.' but no duration, unknown pitch)
  now raises ParseError instead of IndexError/KeyError/ValueError or an
  endless loop in parseProgram(); every node initialises `children`;
  `re` is imported explicitly; test() takes an optional path.
- tokenizer.py: an unterminated string raises TokenizerError; a trailing '@'
  no longer raises IndexError; single-character tokenizers share one helper.
- The blob ids on the `index` lines predate these edits.

 note.py      |  54 +++++++
 parser.py    | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++
 tokenizer.py | 205 ++++++++++++++++++++++++++
 3 files changed, 670 insertions(+)
 create mode 100644 note.py
 create mode 100644 parser.py
 create mode 100644 tokenizer.py

diff --git a/note.py b/note.py
new file mode 100644
index 0000000..fd080da
--- /dev/null
+++ b/note.py
@@ -0,0 +1,54 @@
+from enum import Enum
+
+
+class NotePitch(Enum):
+    """The twelve pitches C..H, numbered 1 to 12; sharps use the -IS suffix."""
+
+    C = 1
+    CIS = 2
+    D = 3
+    DIS = 4
+    E = 5
+    F = 6
+    FIS = 7
+    G = 8
+    GIS = 9
+    A = 10
+    AIS = 11
+    H = 12
+
+    @staticmethod
+    def toPitch(string):
+        """Return the NotePitch named by `string`, e.g. 'c', 'F#' or 'eb'.
+
+        The lookup ignores case because the tokenizer accepts upper- and
+        lower-case note letters alike. Raises KeyError for a spelling that
+        has no table entry (for example 'cb').
+        """
+        return _PITCH_BY_NAME[string.lower()]
+
+
+# Note spelling -> pitch; several spellings may share one pitch. Built once at
+# import time instead of on every toPitch() call.
+_PITCH_BY_NAME = {
+    'c': NotePitch.C, 'c#': NotePitch.CIS, 'db': NotePitch.CIS,
+    'd': NotePitch.D, 'd#': NotePitch.DIS, 'eb': NotePitch.DIS,
+    'e': NotePitch.E, 'fb': NotePitch.E, 'e#': NotePitch.F,
+    'f': NotePitch.F, 'f#': NotePitch.FIS, 'gb': NotePitch.FIS,
+    'g': NotePitch.G, 'g#': NotePitch.GIS, 'ab': NotePitch.GIS,
+    'a': NotePitch.A, 'a#': NotePitch.AIS, 'b': NotePitch.AIS,
+    'h': NotePitch.H,
+}
+
+
+class Note:
+    """A note value: pitch, octave and duration."""
+
+    def __init__(self, note, octave, duration):
+        """Store the note; a str `note` is converted with NotePitch.toPitch()."""
+        self.note = NotePitch.toPitch(note) if isinstance(note, str) else note
+        self.octave = octave
+        self.duration = duration
+
+    def __str__(self):
+        return f"{self.note}[{self.octave}, {self.duration}]"
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..149bd1f
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,411 @@
+from enum import Enum
+from tokenizer import *
+from note import *
+import json
+import re
+
+
+class ParseError(Exception):
+    """Raised when the token stream is not a valid program."""
+
+
+class NodeType(Enum):
+    INTEGER = 1
+    STRING = 2
+    NOTE = 3
+    BLOCK = 4
+    ARGUMENTS = 5
+    IDENTIFIER = 6
+    ASSIGN = 7
+    PROGRAM = 8
+    ASTERISK = 9
+    COLON = 10
+    FUNCTION_CALL = 11
+    COMMA = 12
+    PERCENT = 13
+
+
+class Node:
+    """Base AST node; acts as a list of its child nodes.
+
+    Every subclass calls Node.__init__ so that len(), indexing, append() and
+    pop() work on any node, not only on the container nodes.
+    """
+
+    def __init__(self):
+        self.children = []
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __len__(self):
+        return len(self.children)
+
+    def __getitem__(self, index):
+        return self.children[index]
+
+    def append(self, node):
+        self.children.append(node)
+
+    def pop(self, index):
+        return self.children.pop(index)
+
+
+class Program(Node):
+    """Root node; its children are the top-level statements."""
+
+    def __init__(self):
+        Node.__init__(self)
+        self.type = NodeType.PROGRAM
+
+    def __str__(self):
+        return "Program:\n" + "\n".join([str(e) for e in self.children])
+
+
+class BlockNode(Node):
+    """A `{ ... }` block; its children are the enclosed statements."""
+
+    def __init__(self):
+        Node.__init__(self)
+        self.type = NodeType.BLOCK
+
+    def __str__(self):
+        return "B{\n" + "\n".join([str(e) for e in self.children]) + "\n}"
+
+
+class ArgumentsNode(Node):
+    """A `( ... )` list; its children are the elements, commas included."""
+
+    def __init__(self):
+        Node.__init__(self)
+        self.type = NodeType.ARGUMENTS
+
+    def __str__(self):
+        return "@(" + ", ".join([str(e) for e in self.children]) + ")"
+
+
+class IdentifierNode(Node):
+    def __init__(self, identifier):
+        Node.__init__(self)
+        self.type = NodeType.IDENTIFIER
+        self.identifier = identifier
+
+    def __str__(self):
+        return f"L'{self.identifier}'"
+
+
+class AssignExpression(Node):
+    def __init__(self, target, value):
+        Node.__init__(self)
+        self.type = NodeType.ASSIGN
+        self.target = target
+        self.value = value
+
+    def __str__(self):
+        return f"A[{self.target} = {self.value}]"
+
+
+class AsteriskStatementNode(Node):
+    def __init__(self, iterator, statement):
+        Node.__init__(self)
+        self.type = NodeType.ASTERISK
+        self.iterator = iterator
+        self.statement = statement
+
+    def __str__(self):
+        return f"*({self.iterator}: {self.statement})"
+
+
+class ColonNode(Node):
+    def __init__(self, a, b):
+        Node.__init__(self)
+        self.type = NodeType.COLON
+        self.a = a
+        self.b = b
+
+    def __str__(self):
+        return f":({self.a}, {self.b})"
+
+
+class ExpressionNode(Node):
+    """Base class of the literal nodes; subclasses set `value`."""
+
+    def __str__(self):
+        return f"{self.__class__.__name__}('{self.value}')"
+
+
+class IntegerLiteralNode(ExpressionNode):
+    def __init__(self, value):
+        Node.__init__(self)
+        self.type = NodeType.INTEGER
+        self.value = value
+
+    def __str__(self):
+        return f"i'{self.value}'"
+
+
+class StringLiteralNode(ExpressionNode):
+    def __init__(self, value):
+        Node.__init__(self)
+        self.type = NodeType.STRING
+        self.value = value
+
+    def __str__(self):
+        return f"s'{self.value}'"
+
+
+class NoteLiteralNode(ExpressionNode):
+    def __init__(self, value):
+        Node.__init__(self)
+        self.type = NodeType.NOTE
+        self.value = value
+
+    def __str__(self):
+        return f"n'{self.value}'"
+
+
+class FunctionCallNode(Node):
+    def __init__(self, identifier, arguments):
+        Node.__init__(self)
+        self.type = NodeType.FUNCTION_CALL
+        self.identifier = identifier
+        self.arguments = arguments
+
+    def __str__(self):
+        return f"F({self.identifier}: {self.arguments})"
+
+
+class CommaNode(Node):
+    def __init__(self):
+        Node.__init__(self)
+        self.type = NodeType.COMMA
+
+    def __str__(self):
+        return "[,]"
+
+
+class PercentNode(Node):
+    def __init__(self, value):
+        Node.__init__(self)
+        self.type = NodeType.PERCENT
+        self.value = value
+
+    def __str__(self):
+        return f"%'{self.value}'"
+
+
+# Reported in place of a token type when the token list runs out.
+_END_OF_INPUT = "end of input"
+
+# @<pitch letter>[b|#][octave digit][.duration digits]; the groups are the
+# pitch spelling, the octave digit and the duration digits.
+_NOTE_LITERAL = re.compile(r'@(.[b#]?)(\d)?(?:\.(\d*))?')
+
+
+def expectedFound(expected, found):
+    """Raise ParseError naming what was expected and what was found."""
+    raise ParseError(f"Expected: {expected}, found: {found}")
+
+
+def _consume(input, expected):
+    """Pop and return the next token, which must have type `expected`.
+
+    Raises ParseError if the token has another type or the input is exhausted.
+    """
+    if not input:
+        expectedFound(expected, _END_OF_INPUT)
+    if input[0].type != expected:
+        expectedFound(expected, input[0].type)
+    return input.pop(0)
+
+
+def _popOperand(parent, operator):
+    """Pop the node parsed just before `operator` from `parent`.
+
+    Raises ParseError, not IndexError, when the operator is the first thing
+    in its enclosing program, block or argument list.
+    """
+    if len(parent) == 0:
+        expectedFound(f"operand before {operator}", "nothing")
+    return parent.pop(-1)
+
+
+def parseInteger(input, parent):
+    return IntegerLiteralNode(int(_consume(input, TokenType.INTEGER).value))
+
+
+def parseString(input, parent):
+    # [1:-1] strips the surrounding quotes kept by the tokenizer.
+    return StringLiteralNode(_consume(input, TokenType.STRING).value[1:-1])
+
+
+def parseNote(input, parent):
+    """Parse a NOTE token into a NoteLiteralNode.
+
+    Octave and duration default to 1 when the literal omits them. Raises
+    ParseError for an unknown pitch spelling or a '.' without digits.
+    """
+    value = _consume(input, TokenType.NOTE).value
+    match = _NOTE_LITERAL.fullmatch(value)
+    if match is None:
+        raise ParseError(f"Malformed note literal '{value}'")
+    pitchName, octaveDigit, durationDigits = match.groups()
+    # None means there was no '.', '' means a '.' with nothing after it.
+    if durationDigits == '':
+        raise ParseError(f"Note literal '{value}' has no duration after '.'")
+    try:
+        pitch = NotePitch.toPitch(pitchName)
+    except KeyError:
+        raise ParseError(f"Unknown note pitch '{pitchName}' in '{value}'") from None
+    octave = int(octaveDigit) if octaveDigit else 1
+    duration = int(durationDigits) if durationDigits else 1
+    return NoteLiteralNode(Note(pitch, octave, duration))
+
+
+def parseComma(input, parent):
+    _consume(input, TokenType.COMMA)
+    return CommaNode()
+
+
+def parseArguments(input, parent):
+    """Parse `( element ... )` into an ArgumentsNode."""
+    _consume(input, TokenType.OPEN_PAREN)
+    arguments = ArgumentsNode()
+    # Stop on exhausted input too, so that a missing ')' is reported by
+    # _consume() as a ParseError instead of an IndexError here.
+    while input and input[0].type != TokenType.CLOSE_PAREN:
+        arguments.append(parseArrayElement(input, arguments)) #TODO: parseExpression
+    _consume(input, TokenType.CLOSE_PAREN)
+    return arguments
+
+
+def parseBlock(input, parent):
+    """Parse `{ statement ... }` into a BlockNode."""
+    _consume(input, TokenType.OPEN_BRACKET)
+    block = BlockNode()
+    # Stop on exhausted input too, so a missing '}' becomes a ParseError.
+    while input and input[0].type != TokenType.CLOSE_BRACKET:
+        block.append(parseToken(input, block))
+    _consume(input, TokenType.CLOSE_BRACKET)
+    return block
+
+
+def parseIdentifier(input, parent):
+    return IdentifierNode(_consume(input, TokenType.IDENTIFIER).value)
+
+
+def parseAssign(input, parent):
+    """Parse `= value`; the target is the node last appended to `parent`."""
+    _consume(input, TokenType.ASSIGN)
+    target = _popOperand(parent, TokenType.ASSIGN)
+    value = parseExpression(input, parent) #TODO: only expressions!
+    return AssignExpression(target, value)
+
+
+def parseAsterisk(input, parent):
+    """Parse `* statement`; the iterator is the node last appended to `parent`."""
+    _consume(input, TokenType.ASTERISK)
+    iterator = _popOperand(parent, TokenType.ASTERISK)
+    value = parseStatement(input, parent) #TODO: only statements! (?)
+    return AsteriskStatementNode(iterator, value)
+
+
+def parseColon(input, parent):
+    """Parse `: b`; `a` is the node last appended to `parent`."""
+    _consume(input, TokenType.COLON)
+    a = _popOperand(parent, TokenType.COLON)
+    b = parseExpression(input, parent) #TODO: only expressions!
+    return ColonNode(a, b)
+
+
+def parseFunctionCallOrIdentifier(input, parent):
+    """Parse an identifier, or a call when `(` follows it directly."""
+    identifier = parseIdentifier(input, parent)
+    # Checked after consuming the identifier so that an identifier which is
+    # the very last token does not index past the end of the list.
+    if input and input[0].type == TokenType.OPEN_PAREN:
+        return FunctionCallNode(identifier, parseArguments(input, parent))
+    return identifier
+
+
+def parsePercent(input, parent):
+    """Parse `%`, which applies to the node last appended to `parent`."""
+    _consume(input, TokenType.PERCENT)
+    return PercentNode(_popOperand(parent, TokenType.PERCENT))
+
+
+# Token type accepted by parseExpression() -> the function that parses it.
+_EXPRESSION_PARSERS = {
+    TokenType.INTEGER: parseInteger,
+    TokenType.STRING: parseString,
+    TokenType.NOTE: parseNote,
+    TokenType.IDENTIFIER: parseFunctionCallOrIdentifier,
+    TokenType.PERCENT: parsePercent,
+    TokenType.OPEN_PAREN: parseArguments,
+    TokenType.ASSIGN: parseAssign,
+    TokenType.COLON: parseColon,
+}
+
+
+def parseExpression(input, parent):
+    """Parse one expression.
+
+    Raises ParseError for a token that cannot start an expression; returning
+    None there would leave the token unconsumed and loop parseProgram() forever.
+    """
+    if not input:
+        expectedFound("expression", _END_OF_INPUT)
+    tokenType = input[0].type
+    if tokenType not in _EXPRESSION_PARSERS:
+        expectedFound("expression", tokenType)
+    return _EXPRESSION_PARSERS[tokenType](input, parent)
+
+
+def parseArrayElement(input, parent):
+    """Parse one element of an argument list: a comma or an expression."""
+    if input and input[0].type == TokenType.COMMA:
+        return parseComma(input, parent)
+    return parseExpression(input, parent)
+
+
+def parseStatement(input, parent):
+    """Parse a block, an asterisk statement or, failing those, an expression."""
+    if not input:
+        expectedFound("statement", _END_OF_INPUT)
+    tokenType = input[0].type
+    if tokenType == TokenType.OPEN_BRACKET:
+        return parseBlock(input, parent)
+    if tokenType == TokenType.ASTERISK:
+        return parseAsterisk(input, parent)
+    return parseExpression(input, parent)
+
+
+def parseToken(input, parent):
+    return parseStatement(input, parent)
+
+
+def parseProgram(input):
+    """Parse the token list `input` (consumed in place) into a Program."""
+    root = Program()
+    while len(input) > 0:
+        root.append(parseToken(input, root))
+    return root
+
+
+def test(path='test2.lit'):
+    """Parse the file at `path` and print its AST, or the error that stopped it."""
+    try:
+        with open(path, 'r') as source:
+            lines = [line.rstrip('\n') for line in source]
+
+        tokens = [token for token in tokenize(lines) if token.type != TokenType.COMMENT]
+
+        ast = parseProgram(tokens)
+
+        print(ast)
+    except (TokenizerError, ParseError) as e:
+        print(str(e))
+
+
+if __name__ == "__main__":
+    test()
diff --git a/tokenizer.py b/tokenizer.py
new file mode 100644
index 0000000..6037bc0
--- /dev/null
+++ b/tokenizer.py
@@ -0,0 +1,205 @@
+from enum import Enum
+import time
+import re
+import sys
+
+
+class TokenType(Enum):
+    OPEN_PAREN = 1
+    CLOSE_PAREN = 2
+    ASTERISK = 3
+    STRING = 4
+    IDENTIFIER = 5
+    COMMA = 6
+    INTEGER = 7
+    OPEN_BRACKET = 8
+    CLOSE_BRACKET = 9
+    ASSIGN = 10
+    COLON = 11
+    NOTE = 12
+    COMMENT = 13
+    PERCENT = 14
+
+
+class TokenizerError(Exception):
+    """Raised when part of a line cannot be turned into a token."""
+
+
+class Token:
+    """A lexeme: its type, its source text and its (line, column) position."""
+
+    def __init__(self, type, value, pos):
+        self.type = type
+        self.value = value
+        self.pos = pos
+
+    def __str__(self):
+        return f"Token({self.type}, '{self.value}', {self.pos})"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+# Every tokenizer below takes (input, current, line): the line text, the
+# column to match at and the line number. It returns (consumedChars, token),
+# or (0, None) when nothing matches at that column.
+
+# Note literal syntax: @<letter>[b|#][digit][.digits]
+_NOTE_PATTERN = re.compile(r'@[A-Ha-h][b#]?\d?(?:\.\d*)?')
+
+
+def tokenizeChar(type, char, input, current, line):
+    """Match the single character `char` as a token of type `type`."""
+    if input[current] == char:
+        return (1, Token(type, char, (line, current)))
+    return (0, None)
+
+
+def tokenizeOpenParen(input, current, line):
+    return tokenizeChar(TokenType.OPEN_PAREN, '(', input, current, line)
+
+
+def tokenizeCloseParen(input, current, line):
+    return tokenizeChar(TokenType.CLOSE_PAREN, ')', input, current, line)
+
+
+def tokenizeAsterisk(input, current, line):
+    return tokenizeChar(TokenType.ASTERISK, '*', input, current, line)
+
+
+def tokenizeString(input, current, line):
+    """Match a double-quoted string; the token value keeps both quotes.
+
+    Raises TokenizerError when the line has no closing quote.
+    """
+    if input[current] != '"':
+        return (0, None)
+    closing = input.find('"', current + 1)
+    if closing == -1:
+        raise TokenizerError(f"Line {line+1}, col {current+1}: string not terminated")
+    value = input[current:closing + 1]
+    return (len(value), Token(TokenType.STRING, value, (line, current)))
+
+
+def tokenizeRegexPattern(type, pattern, input, current, line):
+    """Match the longest run of characters that each match `pattern`."""
+    end = current
+    while end < len(input) and re.match(pattern, input[end]):
+        end += 1
+    if end == current:
+        return (0, None)
+    return (end - current, Token(type, input[current:end], (line, current)))
+
+
+def tokenizeWhitespaces(input, current, line):
+    # Typed None so that tokenize() can drop it.
+    return tokenizeRegexPattern(None, r'\s', input, current, line)
+
+
+def tokenizeIdentifier(input, current, line):
+    return tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)
+
+
+def tokenizeComma(input, current, line):
+    return tokenizeChar(TokenType.COMMA, ',', input, current, line)
+
+
+def tokenizeInteger(input, current, line):
+    return tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)
+
+
+def tokenizeOpenBracket(input, current, line):
+    return tokenizeChar(TokenType.OPEN_BRACKET, '{', input, current, line)
+
+
+def tokenizeCloseBracket(input, current, line):
+    return tokenizeChar(TokenType.CLOSE_BRACKET, '}', input, current, line)
+
+
+def tokenizeAssign(input, current, line):
+    return tokenizeChar(TokenType.ASSIGN, '=', input, current, line)
+
+
+def tokenizeColon(input, current, line):
+    return tokenizeChar(TokenType.COLON, ':', input, current, line)
+
+
+def tokenizeComment(input, current, line):
+    """Match `#` and everything after it on the line."""
+    if input[current] != '#':
+        return (0, None)
+    value = input[current:]
+    return (len(value), Token(TokenType.COMMENT, value, (line, current)))
+
+
+def tokenizeNote(input, current, line):
+    """Match a note literal such as @c, @F#4 or @eb3.8.
+
+    A lone `@` (even as the last character of the line) simply does not match.
+    """
+    match = _NOTE_PATTERN.match(input, current)
+    if match is None:
+        return (0, None)
+    value = match.group()
+    return (len(value), Token(TokenType.NOTE, value, (line, current)))
+
+
+def tokenizePercent(input, current, line):
+    return tokenizeChar(TokenType.PERCENT, '%', input, current, line)
+
+
+tokenizers = (
+    tokenizeOpenParen,
+    tokenizeCloseParen,
+    tokenizeAsterisk,
+    tokenizeString,
+    tokenizeInteger,
+    tokenizeNote,
+    tokenizeIdentifier,
+    tokenizeComma,
+    tokenizeOpenBracket,
+    tokenizeCloseBracket,
+    tokenizeAssign,
+    tokenizeColon,
+    tokenizePercent,
+    tokenizeComment,
+    tokenizeWhitespaces
+)
+
+
+def tokenize(lines):
+    """Turn an iterable of source lines into a list of tokens.
+
+    Tokenizers are tried in the order of `tokenizers`; whitespace is dropped.
+    Raises TokenizerError at the first character no tokenizer accepts.
+    """
+    tokens = []
+    for lineNumber, line in enumerate(lines):
+        current = 0
+        while current < len(line):
+            for tokenizer in tokenizers:
+                consumedChars, token = tokenizer(line, current, lineNumber)
+                if consumedChars > 0:
+                    break
+            else:
+                raise TokenizerError(f"Line {lineNumber+1}, col {current+1}: unknown symbol '{line[current]}'")
+            # Whitespace runs come back typed None and are not kept.
+            if token.type is not None:
+                tokens.append(token)
+            current += consumedChars
+    return tokens
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} <source file>")
+    try:
+        with open(sys.argv[1], 'r') as source:
+            lines = [line.rstrip('\n') for line in source]
+
+        tokens = tokenize(lines)
+
+        for token in tokens:
+            print(token)
+    except TokenizerError as e:
+        print(str(e))