Create working parser

Bartłomiej Pluta
2019-06-28 20:23:42 +02:00
commit aafbd31599
3 changed files with 597 additions and 0 deletions

36
note.py Normal file

@@ -0,0 +1,36 @@
from enum import Enum

class NotePitch(Enum):
    C = 1
    CIS = 2
    D = 3
    DIS = 4
    E = 5
    F = 6
    FIS = 7
    G = 8
    GIS = 9
    A = 10
    AIS = 11
    H = 12

    @staticmethod
    def toPitch(string):
        # German naming convention: 'h' is B natural, 'b' is B flat (= AIS);
        # enharmonic spellings ('db', 'e#', 'fb', ...) map to the same pitch.
        pitches = {'c': NotePitch.C, 'c#': NotePitch.CIS, 'db': NotePitch.CIS, 'd': NotePitch.D,
                   'd#': NotePitch.DIS, 'eb': NotePitch.DIS, 'e': NotePitch.E, 'fb': NotePitch.E, 'e#': NotePitch.F,
                   'f': NotePitch.F, 'f#': NotePitch.FIS, 'gb': NotePitch.FIS, 'g': NotePitch.G, 'g#': NotePitch.GIS,
                   'ab': NotePitch.GIS, 'a': NotePitch.A, 'a#': NotePitch.AIS, 'b': NotePitch.AIS, 'h': NotePitch.H}
        return pitches[string]

class Note:
    def __init__(self, note, octave, duration):
        # Accept either a pitch name such as 'c#' or a NotePitch member.
        if type(note) == str:
            self.note = NotePitch.toPitch(note)
        else:
            self.note = note
        self.octave = octave
        self.duration = duration

    def __str__(self):
        return f"{self.note}[{self.octave}, {self.duration}]"
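A quick usage sketch for the classes above (a hypothetical snippet, not part of the commit):

from note import Note, NotePitch

print(Note('c#', 4, 8))         # pitch name resolved via NotePitch.toPitch -> NotePitch.CIS[4, 8]
print(Note(NotePitch.A, 1, 1))  # a NotePitch member is stored as-is -> NotePitch.A[1, 1]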

362
parser.py Normal file

@@ -0,0 +1,362 @@
from enum import Enum
import re

from tokenizer import *
from note import *

class ParseError(Exception):
    pass
class NodeType(Enum):
    INTEGER = 1
    STRING = 2
    NOTE = 3
    BLOCK = 4
    ARGUMENTS = 5
    IDENTIFIER = 6
    ASSIGN = 7
    PROGRAM = 8
    ASTERISK = 9
    COLON = 10
    FUNCTION_CALL = 11
    COMMA = 12
    PERCENT = 13
class Node:
    def __init__(self):
        self.children = []

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.children)

    def __getitem__(self, index):
        return self.children[index]

    def append(self, node):
        self.children.append(node)

    def pop(self, index):
        return self.children.pop(index)
class Program(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.PROGRAM

    def __str__(self):
        return "Program:\n" + "\n".join([str(e) for e in self.children])

class BlockNode(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.BLOCK

    def __str__(self):
        return "B{\n" + "\n".join([str(e) for e in self.children]) + "\n}"

class ArgumentsNode(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.ARGUMENTS

    def __str__(self):
        return "@(" + ", ".join([str(e) for e in self.children]) + ")"

class IdentifierNode(Node):
    def __init__(self, identifier):
        Node.__init__(self)  # leaf nodes still get a (empty) children list, so the Node API works
        self.type = NodeType.IDENTIFIER
        self.identifier = identifier

    def __str__(self):
        return f"L'{self.identifier}'"

class AssignExpression(Node):
    def __init__(self, target, value):
        Node.__init__(self)
        self.type = NodeType.ASSIGN
        self.target = target
        self.value = value

    def __str__(self):
        return f"A[{self.target} = {self.value}]"

class AsteriskStatementNode(Node):
    def __init__(self, iterator, statement):
        Node.__init__(self)
        self.type = NodeType.ASTERISK
        self.iterator = iterator
        self.statement = statement

    def __str__(self):
        return f"*({self.iterator}: {self.statement})"

class ColonNode(Node):
    def __init__(self, a, b):
        Node.__init__(self)
        self.type = NodeType.COLON
        self.a = a
        self.b = b

    def __str__(self):
        return f":({self.a}, {self.b})"

class ExpressionNode(Node):
    def __str__(self):
        return f"{self.__class__.__name__}('{self.value}')"

class IntegerLiteralNode(ExpressionNode):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.INTEGER
        self.value = value

    def __str__(self):
        return f"i'{self.value}'"

class StringLiteralNode(ExpressionNode):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.STRING
        self.value = value

    def __str__(self):
        return f"s'{self.value}'"

class NoteLiteralNode(ExpressionNode):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.NOTE
        self.value = value

    def __str__(self):
        return f"n'{self.value}'"

class FunctionCallNode(Node):
    def __init__(self, identifier, arguments):
        Node.__init__(self)
        self.type = NodeType.FUNCTION_CALL
        self.identifier = identifier
        self.arguments = arguments

    def __str__(self):
        return f"F({self.identifier}: {self.arguments})"

class CommaNode(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.COMMA

    def __str__(self):
        return "[,]"

class PercentNode(Node):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.PERCENT
        self.value = value

    def __str__(self):
        return f"%'{self.value}'"
def expectedFound(expected, found):
    raise ParseError(f"Expected: {expected}, found: {found}")

def parseInteger(input, parent):
    if input[0].type != TokenType.INTEGER:
        expectedFound(TokenType.INTEGER, input[0].type)
    return IntegerLiteralNode(int(input.pop(0).value))

def parseString(input, parent):
    if input[0].type != TokenType.STRING:
        expectedFound(TokenType.STRING, input[0].type)
    return StringLiteralNode(input.pop(0).value[1:-1])  # strip the surrounding quotes

def parseNote(input, parent):
    # A note token looks like '@<pitch>[b|#][octave][.duration]', e.g. '@c#4.8'.
    if input[0].type != TokenType.NOTE:
        expectedFound(TokenType.NOTE, input[0].type)
    value = input.pop(0).value
    consumedChars = 1  # skip the leading '@'
    notePitch = value[consumedChars]
    consumedChars += 1
    octave = 1
    duration = 1
    if consumedChars < len(value) and value[consumedChars] in ('b', '#'):
        notePitch += value[consumedChars]
        consumedChars += 1
    if consumedChars < len(value) and re.match(r'\d', value[consumedChars]):
        octave = int(value[consumedChars])
        consumedChars += 1
    if consumedChars < len(value) and value[consumedChars] == '.':
        consumedChars += 1
        durationString = ''
        while consumedChars < len(value) and re.match(r'\d', value[consumedChars]):
            durationString += value[consumedChars]
            consumedChars += 1
        duration = int(durationString)
    return NoteLiteralNode(Note(notePitch, octave, duration))

def parseComma(input, parent):
    if input[0].type != TokenType.COMMA:
        expectedFound(TokenType.COMMA, input[0].type)
    input.pop(0)
    return CommaNode()
def parseArguments(input, parent):
    if input[0].type != TokenType.OPEN_PAREN:
        expectedFound(TokenType.OPEN_PAREN, input[0].type)
    input.pop(0)
    arguments = ArgumentsNode()
    while input[0].type != TokenType.CLOSE_PAREN:
        arguments.append(parseArrayElement(input, arguments))  # TODO: parseExpression
    if input[0].type != TokenType.CLOSE_PAREN:
        expectedFound(TokenType.CLOSE_PAREN, input[0].type)
    input.pop(0)
    return arguments

def parseBlock(input, parent):
    if input[0].type != TokenType.OPEN_BRACKET:
        expectedFound(TokenType.OPEN_BRACKET, input[0].type)
    input.pop(0)
    block = BlockNode()
    while input[0].type != TokenType.CLOSE_BRACKET:
        block.append(parseToken(input, block))
    if input[0].type != TokenType.CLOSE_BRACKET:
        expectedFound(TokenType.CLOSE_BRACKET, input[0].type)
    input.pop(0)
    return block
def parseIdentifier(input, parent):
    if input[0].type != TokenType.IDENTIFIER:
        expectedFound(TokenType.IDENTIFIER, input[0].type)
    return IdentifierNode(input.pop(0).value)

def parseAssign(input, parent):
    # Infix operator: the assignment target is the node most recently added to the parent.
    if input[0].type != TokenType.ASSIGN:
        expectedFound(TokenType.ASSIGN, input[0].type)
    input.pop(0)
    target = parent.pop(-1)
    value = parseExpression(input, parent)  # TODO: only expressions!
    return AssignExpression(target, value)

def parseAsterisk(input, parent):
    if input[0].type != TokenType.ASTERISK:
        expectedFound(TokenType.ASTERISK, input[0].type)
    input.pop(0)
    iterator = parent.pop(-1)
    value = parseStatement(input, parent)  # TODO: only statements! (?)
    return AsteriskStatementNode(iterator, value)

def parseColon(input, parent):
    if input[0].type != TokenType.COLON:
        expectedFound(TokenType.COLON, input[0].type)
    input.pop(0)
    a = parent.pop(-1)
    b = parseExpression(input, parent)  # TODO: only expressions!
    return ColonNode(a, b)

def parseFunctionCallOrIdentifier(input, parent):
    if input[0].type != TokenType.IDENTIFIER:
        expectedFound(TokenType.IDENTIFIER, input[0].type)
    # An identifier followed by '(' is a function call; guard against the
    # identifier being the very last token of the input.
    if len(input) > 1 and input[1].type == TokenType.OPEN_PAREN:
        identifier = parseIdentifier(input, parent)
        arguments = parseArguments(input, parent)
        return FunctionCallNode(identifier, arguments)
    return parseIdentifier(input, parent)

def parsePercent(input, parent):
    if input[0].type != TokenType.PERCENT:
        expectedFound(TokenType.PERCENT, input[0].type)
    input.pop(0)
    value = parent.pop(-1)
    return PercentNode(value)
def parseExpression(input, parent):
    type = input[0].type
    if type == TokenType.INTEGER:
        return parseInteger(input, parent)
    if type == TokenType.STRING:
        return parseString(input, parent)
    if type == TokenType.NOTE:
        return parseNote(input, parent)
    if type == TokenType.IDENTIFIER:
        return parseFunctionCallOrIdentifier(input, parent)
    if type == TokenType.PERCENT:
        return parsePercent(input, parent)
    if type == TokenType.OPEN_PAREN:
        return parseArguments(input, parent)
    if type == TokenType.ASSIGN:
        return parseAssign(input, parent)
    if type == TokenType.COLON:
        return parseColon(input, parent)
    expectedFound("an expression", type)  # fail loudly instead of returning None

def parseArrayElement(input, parent):
    if input[0].type == TokenType.COMMA:
        return parseComma(input, parent)
    return parseExpression(input, parent)

def parseStatement(input, parent):
    type = input[0].type
    if type == TokenType.OPEN_BRACKET:
        return parseBlock(input, parent)
    if type == TokenType.ASTERISK:
        return parseAsterisk(input, parent)
    return parseExpression(input, parent)

def parseToken(input, parent):
    return parseStatement(input, parent)

def parseProgram(input):
    root = Program()
    while len(input) > 0:
        root.append(parseToken(input, root))
    return root
def test():
    try:
        with open('test2.lit', 'r') as source:
            lines = [line.rstrip('\n') for line in source.readlines()]
            tokens = [token for token in tokenize(lines) if token.type != TokenType.COMMENT]
            ast = parseProgram(tokens)
            print(ast)
    except (TokenizerError, ParseError) as e:
        print(str(e))

if __name__ == "__main__":
    test()
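For reference, a minimal end-to-end sketch of tokenizer plus parser on a made-up one-liner in this syntax (not part of the commit; run from the repo directory so the local parser module is picked up):

from tokenizer import tokenize
from parser import parseProgram

tokens = tokenize(["melody = @c#4.8"])
print(parseProgram(tokens))
# Expected to print something like:
# Program:
# A[L'melody' = n'NotePitch.CIS[4, 8]']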

199
tokenizer.py Normal file

@@ -0,0 +1,199 @@
from enum import Enum
import re
import sys
class TokenType(Enum):
    OPEN_PAREN = 1
    CLOSE_PAREN = 2
    ASTERISK = 3
    STRING = 4
    IDENTIFIER = 5
    COMMA = 6
    INTEGER = 7
    OPEN_BRACKET = 8
    CLOSE_BRACKET = 9
    ASSIGN = 10
    COLON = 11
    NOTE = 12
    COMMENT = 13
    PERCENT = 14

class TokenizerError(Exception):
    pass
class Token:
    def __init__(self, type, value, pos):
        self.type = type
        self.value = value
        self.pos = pos  # (line, column) of the token's first character

    def __str__(self):
        return "Token(" + str(self.type) + ", '" + self.value + "', " + str(self.pos) + ")"

    def __repr__(self):
        return self.__str__()
# Each tokenizer returns (consumedChars, token); (0, None) means "no match here".
def tokenizeOpenParen(input, current, line):
    if input[current] == '(':
        return (1, Token(TokenType.OPEN_PAREN, input[current], (line, current)))
    return (0, None)

def tokenizeCloseParen(input, current, line):
    if input[current] == ')':
        return (1, Token(TokenType.CLOSE_PAREN, input[current], (line, current)))
    return (0, None)

def tokenizeAsterisk(input, current, line):
    if input[current] == '*':
        return (1, Token(TokenType.ASTERISK, input[current], (line, current)))
    return (0, None)
def tokenizeString(input, current, line):
    if input[current] == '"':
        value = input[current]
        char = ''
        consumedChars = 1
        while char != '"':
            if current + consumedChars >= len(input):
                raise TokenizerError(f"Line {line+1}, col {current+1}: string not terminated")
            char = input[current + consumedChars]
            value += char
            consumedChars += 1
        return (consumedChars, Token(TokenType.STRING, value, (line, current)))
    return (0, None)
def tokenizeRegexPattern(type, pattern, input, current, line):
    # Greedily consume characters matching a single-character regex pattern.
    consumedChars = 0
    value = ''
    while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
        value += input[current+consumedChars]
        consumedChars += 1
    return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)

def tokenizeWhitespaces(input, current, line):
    # Whitespace gets a token with type None; tokenize() filters these out at the end.
    return tokenizeRegexPattern(None, r'\s', input, current, line)

def tokenizeIdentifier(input, current, line):
    return tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)

def tokenizeComma(input, current, line):
    if input[current] == ',':
        return (1, Token(TokenType.COMMA, input[current], (line, current)))
    return (0, None)

def tokenizeInteger(input, current, line):
    return tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)
def tokenizeOpenBracket(input, current, line):
    if input[current] == '{':
        return (1, Token(TokenType.OPEN_BRACKET, input[current], (line, current)))
    return (0, None)

def tokenizeCloseBracket(input, current, line):
    if input[current] == '}':
        return (1, Token(TokenType.CLOSE_BRACKET, input[current], (line, current)))
    return (0, None)

def tokenizeAssign(input, current, line):
    if input[current] == '=':
        return (1, Token(TokenType.ASSIGN, input[current], (line, current)))
    return (0, None)

def tokenizeColon(input, current, line):
    if input[current] == ':':
        return (1, Token(TokenType.COLON, input[current], (line, current)))
    return (0, None)
def tokenizeComment(input, current, line):
    # '#' starts a comment that runs to the end of the line.
    if input[current] == '#':
        consumedChars = 0
        value = ''
        while current+consumedChars < len(input):
            value += input[current+consumedChars]
            consumedChars += 1
        return (consumedChars, Token(TokenType.COMMENT, value, (line, current)))
    return (0, None)
def tokenizeNote(input, current, line):
    # Note literal: '@' + pitch letter + optional 'b'/'#' + optional octave digit
    # + optional '.' followed by duration digits, e.g. '@c#4.8'.
    consumedChars = 0
    value = ''
    if input[current] == '@':
        consumedChars += 1
        value += input[current]
        if current+consumedChars < len(input) and input[current+consumedChars] in ('C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'A', 'a', 'H', 'h', 'B', 'b'):
            value += input[current+consumedChars]
            consumedChars += 1
            if current+consumedChars < len(input) and input[current+consumedChars] in ('b', '#'):
                value += input[current+consumedChars]
                consumedChars += 1
            if current+consumedChars < len(input) and re.match(r'\d', input[current+consumedChars]):
                value += input[current+consumedChars]
                consumedChars += 1
            if current+consumedChars < len(input) and input[current+consumedChars] == '.':
                value += input[current+consumedChars]
                consumedChars += 1
                while current+consumedChars < len(input) and re.match(r'\d', input[current+consumedChars]):
                    value += input[current+consumedChars]
                    consumedChars += 1
            return (consumedChars, Token(TokenType.NOTE, value, (line, current)))
    return (0, None)
def tokenizePercent(input, current, line):
    if input[current] == '%':
        return (1, Token(TokenType.PERCENT, input[current], (line, current)))
    return (0, None)
# Order matters: e.g. tokenizeInteger must come before tokenizeIdentifier,
# since the identifier pattern (\w) also matches digits.
tokenizers = (
    tokenizeOpenParen,
    tokenizeCloseParen,
    tokenizeAsterisk,
    tokenizeString,
    tokenizeInteger,
    tokenizeNote,
    tokenizeIdentifier,
    tokenizeComma,
    tokenizeOpenBracket,
    tokenizeCloseBracket,
    tokenizeAssign,
    tokenizeColon,
    tokenizePercent,
    tokenizeComment,
    tokenizeWhitespaces
)
def tokenize(lines):
    tokens = []
    for lineNumber, line in enumerate(lines):
        current = 0
        while current < len(line):
            tokenized = False
            for tokenizer in tokenizers:
                consumedChars, value = tokenizer(line, current, lineNumber)
                if consumedChars > 0:
                    tokens.append(value)
                    current += consumedChars
                    tokenized = True
                    break
            if not tokenized:
                raise TokenizerError(f"Line {lineNumber+1}, col {current+1}: unknown symbol '{line[current]}'")
    return [token for token in tokens if token.type is not None]  # drop whitespace tokens
if __name__ == "__main__":
    try:
        with open(sys.argv[1], 'r') as source:
            lines = [line.rstrip('\n') for line in source.readlines()]
            tokens = tokenize(lines)
            for token in tokens:
                print(token)
    except TokenizerError as e:
        print(str(e))
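And a quick look at the raw token stream for a made-up input line (hypothetical, not part of the commit):

from tokenizer import tokenize

for token in tokenize(['play(@c#4.8, 2) # repeat twice']):
    print(token)
# Roughly: IDENTIFIER 'play', OPEN_PAREN '(', NOTE '@c#4.8', COMMA ',',
# INTEGER '2', CLOSE_PAREN ')', COMMENT '# repeat twice'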