Create working parser
36
note.py
Normal file
@@ -0,0 +1,36 @@
from enum import Enum


class NotePitch(Enum):
    C = 1
    CIS = 2
    D = 3
    DIS = 4
    E = 5
    F = 6
    FIS = 7
    G = 8
    GIS = 9
    A = 10
    AIS = 11
    H = 12

    @staticmethod
    def toPitch(string):
        # Map note names (with enharmonic spellings) to pitches; 'h' is the
        # German name for B natural, 'b' for B flat.
        pitchMap = {'c': NotePitch.C, 'c#': NotePitch.CIS, 'db': NotePitch.CIS, 'd': NotePitch.D,
                    'd#': NotePitch.DIS, 'eb': NotePitch.DIS, 'e': NotePitch.E, 'fb': NotePitch.E, 'e#': NotePitch.F,
                    'f': NotePitch.F, 'f#': NotePitch.FIS, 'gb': NotePitch.FIS, 'g': NotePitch.G, 'g#': NotePitch.GIS,
                    'ab': NotePitch.GIS, 'a': NotePitch.A, 'a#': NotePitch.AIS, 'b': NotePitch.AIS, 'h': NotePitch.H}
        return pitchMap[string]


class Note:
    def __init__(self, note, octave, duration):
        if isinstance(note, str):
            self.note = NotePitch.toPitch(note)
        else:
            self.note = note
        self.octave = octave
        self.duration = duration

    def __str__(self):
        return f"{self.note}[{self.octave}, {self.duration}]"
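
A quick sanity check of the note API (a minimal sketch; the input values are made up):

    from note import Note, NotePitch

    n = Note('c#', 3, 8)                                # enharmonics share a pitch
    print(n)                                            # NotePitch.CIS[3, 8]
    print(NotePitch.toPitch('db') is NotePitch.CIS)     # True
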
362
parser.py
Normal file
@@ -0,0 +1,362 @@
from enum import Enum
import re

from tokenizer import *
from note import *


class ParseError(Exception):
    pass


class NodeType(Enum):
    INTEGER = 1
    STRING = 2
    NOTE = 3
    BLOCK = 4
    ARGUMENTS = 5
    IDENTIFIER = 6
    ASSIGN = 7
    PROGRAM = 8
    ASTERISK = 9
    COLON = 10
    FUNCTION_CALL = 11
    COMMA = 12
    PERCENT = 13


class Node:
    def __init__(self):
        self.children = []

    def __repr__(self):
        return self.__str__()

    def __len__(self):
        return len(self.children)

    def __getitem__(self, index):
        return self.children[index]

    def append(self, node):
        self.children.append(node)

    def pop(self, index):
        return self.children.pop(index)


class Program(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.PROGRAM

    def __str__(self):
        return "Program:\n" + "\n".join([str(e) for e in self.children])


class BlockNode(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.BLOCK

    def __str__(self):
        return "B{\n" + "\n".join([str(e) for e in self.children]) + "\n}"


class ArgumentsNode(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.ARGUMENTS

    def __str__(self):
        return "@(" + ", ".join([str(e) for e in self.children]) + ")"


class IdentifierNode(Node):
    def __init__(self, identifier):
        Node.__init__(self)  # initialize children so the Node interface stays usable
        self.type = NodeType.IDENTIFIER
        self.identifier = identifier

    def __str__(self):
        return f"L'{self.identifier}'"


class AssignExpression(Node):
    def __init__(self, target, value):
        Node.__init__(self)
        self.type = NodeType.ASSIGN
        self.target = target
        self.value = value

    def __str__(self):
        return f"A[{self.target} = {self.value}]"


class AsteriskStatementNode(Node):
    def __init__(self, iterator, statement):
        Node.__init__(self)
        self.type = NodeType.ASTERISK
        self.iterator = iterator
        self.statement = statement

    def __str__(self):
        return f"*({self.iterator}: {self.statement})"


class ColonNode(Node):
    def __init__(self, a, b):
        Node.__init__(self)
        self.type = NodeType.COLON
        self.a = a
        self.b = b

    def __str__(self):
        return f":({self.a}, {self.b})"


class ExpressionNode(Node):
    def __str__(self):
        return f"{self.__class__.__name__}('{self.value}')"


class IntegerLiteralNode(ExpressionNode):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.INTEGER
        self.value = value

    def __str__(self):
        return f"i'{self.value}'"


class StringLiteralNode(ExpressionNode):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.STRING
        self.value = value

    def __str__(self):
        return f"s'{self.value}'"


class NoteLiteralNode(ExpressionNode):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.NOTE
        self.value = value

    def __str__(self):
        return f"n'{self.value}'"


class FunctionCallNode(Node):
    def __init__(self, identifier, arguments):
        Node.__init__(self)
        self.type = NodeType.FUNCTION_CALL
        self.identifier = identifier
        self.arguments = arguments

    def __str__(self):
        return f"F({self.identifier}: {self.arguments})"


class CommaNode(Node):
    def __init__(self):
        Node.__init__(self)
        self.type = NodeType.COMMA

    def __str__(self):
        return "[,]"


class PercentNode(Node):
    def __init__(self, value):
        Node.__init__(self)
        self.type = NodeType.PERCENT
        self.value = value

    def __str__(self):
        return f"%'{self.value}'"


def expectedFound(expected, found):
    raise ParseError(f"Expected: {expected}, found: {found}")


def parseInteger(input, parent):
    if input[0].type != TokenType.INTEGER:
        expectedFound(TokenType.INTEGER, input[0].type)

    return IntegerLiteralNode(int(input.pop(0).value))


def parseString(input, parent):
    if input[0].type != TokenType.STRING:
        expectedFound(TokenType.STRING, input[0].type)

    # Strip the surrounding quotes.
    return StringLiteralNode(input.pop(0).value[1:-1])


def parseNote(input, parent):
    if input[0].type != TokenType.NOTE:
        expectedFound(TokenType.NOTE, input[0].type)

    # A note token looks like '@c#3.8': '@', a pitch letter, an optional
    # accidental, an optional octave digit, and an optional '.' followed
    # by duration digits.
    value = input.pop(0).value
    consumedChars = 1
    notePitch = value[consumedChars]
    consumedChars += 1
    octave = 1
    duration = 1
    if consumedChars < len(value) and value[consumedChars] in ('b', '#'):
        notePitch += value[consumedChars]
        consumedChars += 1
    if consumedChars < len(value) and re.match(r'\d', value[consumedChars]):
        octave = int(value[consumedChars])
        consumedChars += 1
    if consumedChars < len(value) and value[consumedChars] == '.':
        consumedChars += 1
        durationString = ''
        while consumedChars < len(value) and re.match(r'\d', value[consumedChars]):
            durationString += value[consumedChars]
            consumedChars += 1
        duration = int(durationString)

    return NoteLiteralNode(Note(notePitch, octave, duration))


def parseComma(input, parent):
    if input[0].type != TokenType.COMMA:
        expectedFound(TokenType.COMMA, input[0].type)
    input.pop(0)
    return CommaNode()


def parseArguments(input, parent):
    if input[0].type != TokenType.OPEN_PAREN:
        expectedFound(TokenType.OPEN_PAREN, input[0].type)

    input.pop(0)

    arguments = ArgumentsNode()

    while input[0].type != TokenType.CLOSE_PAREN:
        arguments.append(parseArrayElement(input, arguments))  # TODO: parseExpression

    if input[0].type != TokenType.CLOSE_PAREN:
        expectedFound(TokenType.CLOSE_PAREN, input[0].type)
    input.pop(0)

    return arguments


def parseBlock(input, parent):
    if input[0].type != TokenType.OPEN_BRACKET:
        expectedFound(TokenType.OPEN_BRACKET, input[0].type)

    input.pop(0)

    block = BlockNode()

    while input[0].type != TokenType.CLOSE_BRACKET:
        block.append(parseToken(input, block))

    if input[0].type != TokenType.CLOSE_BRACKET:
        expectedFound(TokenType.CLOSE_BRACKET, input[0].type)
    input.pop(0)

    return block


def parseIdentifier(input, parent):
    if input[0].type != TokenType.IDENTIFIER:
        expectedFound(TokenType.IDENTIFIER, input[0].type)

    return IdentifierNode(input.pop(0).value)


def parseAssign(input, parent):
    if input[0].type != TokenType.ASSIGN:
        expectedFound(TokenType.ASSIGN, input[0].type)

    input.pop(0)

    # Infix operator: the assignment target was already parsed and appended
    # to the parent, so take it back as the left-hand side.
    target = parent.pop(-1)
    value = parseExpression(input, parent)  # TODO: only expressions!

    return AssignExpression(target, value)


def parseAsterisk(input, parent):
    if input[0].type != TokenType.ASTERISK:
        expectedFound(TokenType.ASTERISK, input[0].type)

    input.pop(0)

    iterator = parent.pop(-1)
    value = parseStatement(input, parent)  # TODO: only statements! (?)

    return AsteriskStatementNode(iterator, value)


def parseColon(input, parent):
    if input[0].type != TokenType.COLON:
        expectedFound(TokenType.COLON, input[0].type)

    input.pop(0)

    a = parent.pop(-1)
    b = parseExpression(input, parent)  # TODO: only expressions!

    return ColonNode(a, b)


def parseFunctionCallOrIdentifier(input, parent):
    if input[0].type != TokenType.IDENTIFIER:
        expectedFound(TokenType.IDENTIFIER, input[0].type)

    # Look one token ahead: an identifier followed by '(' is a call.
    if len(input) > 1 and input[1].type == TokenType.OPEN_PAREN:
        identifier = parseIdentifier(input, parent)
        arguments = parseArguments(input, parent)
        return FunctionCallNode(identifier, arguments)

    return parseIdentifier(input, parent)


def parsePercent(input, parent):
    if input[0].type != TokenType.PERCENT:
        expectedFound(TokenType.PERCENT, input[0].type)

    input.pop(0)

    # Postfix operator: wrap the previously parsed node.
    value = parent.pop(-1)

    return PercentNode(value)


def parseExpression(input, parent):
    type = input[0].type
    if type == TokenType.INTEGER:
        return parseInteger(input, parent)
    if type == TokenType.STRING:
        return parseString(input, parent)
    if type == TokenType.NOTE:
        return parseNote(input, parent)
    if type == TokenType.IDENTIFIER:
        return parseFunctionCallOrIdentifier(input, parent)
    if type == TokenType.PERCENT:
        return parsePercent(input, parent)
    if type == TokenType.OPEN_PAREN:
        return parseArguments(input, parent)
    if type == TokenType.ASSIGN:
        return parseAssign(input, parent)
    if type == TokenType.COLON:
        return parseColon(input, parent)
    expectedFound("an expression", type)


def parseArrayElement(input, parent):
    type = input[0].type
    if type == TokenType.COMMA:
        return parseComma(input, parent)
    return parseExpression(input, parent)


def parseStatement(input, parent):
    type = input[0].type
    if type == TokenType.OPEN_BRACKET:
        return parseBlock(input, parent)
    if type == TokenType.ASTERISK:
        return parseAsterisk(input, parent)

    return parseExpression(input, parent)


def parseToken(input, parent):
    return parseStatement(input, parent)


def parseProgram(input):
    root = Program()
    while len(input) > 0:
        root.append(parseToken(input, root))
    return root


def test():
    try:
        with open('test2.lit', 'r') as source:
            lines = [line.rstrip('\n') for line in source.readlines()]

            tokens = [token for token in tokenize(lines) if token.type != TokenType.COMMENT]

            ast = parseProgram(tokens)

            print(ast)
    except TokenizerError as e:
        print(str(e))
    except ParseError as e:
        print(str(e))


if __name__ == "__main__":
    test()
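
For reference, a minimal sketch of driving the parser end to end (the source line 'x = 5' is an invented example, not taken from test2.lit):

    from tokenizer import tokenize, TokenType
    from parser import parseProgram

    tokens = [t for t in tokenize(['x = 5']) if t.type != TokenType.COMMENT]
    print(parseProgram(tokens))
    # Program:
    # A[L'x' = i'5']
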
199
tokenizer.py
Normal file
@@ -0,0 +1,199 @@
from enum import Enum
import re
import sys


class TokenType(Enum):
    OPEN_PAREN = 1
    CLOSE_PAREN = 2
    ASTERISK = 3
    STRING = 4
    IDENTIFIER = 5
    COMMA = 6
    INTEGER = 7
    OPEN_BRACKET = 8
    CLOSE_BRACKET = 9
    ASSIGN = 10
    COLON = 11
    NOTE = 12
    COMMENT = 13
    PERCENT = 14


class TokenizerError(Exception):
    pass


class Token:
    def __init__(self, type, value, pos):
        self.type = type
        self.value = value
        self.pos = pos

    def __str__(self):
        return "Token(" + str(self.type) + ", '" + self.value + "', " + str(self.pos) + ")"

    def __repr__(self):
        return self.__str__()


# Each tokenizer returns (consumedChars, token); consumedChars == 0 means "no match".
def tokenizeOpenParen(input, current, line):
    if input[current] == '(':
        return (1, Token(TokenType.OPEN_PAREN, input[current], (line, current)))
    return (0, None)


def tokenizeCloseParen(input, current, line):
    if input[current] == ')':
        return (1, Token(TokenType.CLOSE_PAREN, input[current], (line, current)))
    return (0, None)


def tokenizeAsterisk(input, current, line):
    if input[current] == '*':
        return (1, Token(TokenType.ASTERISK, input[current], (line, current)))
    return (0, None)


def tokenizeString(input, current, line):
    if input[current] == '"':
        value = input[current]
        char = ''
        consumedChars = 1
        while char != '"':
            if current + consumedChars >= len(input):
                raise TokenizerError(f"Line {line+1}: string not terminated")
            char = input[current + consumedChars]
            value += char
            consumedChars += 1
        return (consumedChars, Token(TokenType.STRING, value, (line, current)))
    return (0, None)


def tokenizeRegexPattern(type, pattern, input, current, line):
    # Greedily consume characters matching the single-character pattern.
    consumedChars = 0
    value = ''

    while current+consumedChars < len(input) and re.match(pattern, input[current+consumedChars]):
        value += input[current+consumedChars]
        consumedChars += 1
    return (consumedChars, Token(type, value, (line, current)) if consumedChars > 0 else None)


def tokenizeWhitespaces(input, current, line):
    # type=None tokens are filtered out at the end of tokenize().
    return tokenizeRegexPattern(None, r'\s', input, current, line)


def tokenizeIdentifier(input, current, line):
    return tokenizeRegexPattern(TokenType.IDENTIFIER, r'\w', input, current, line)


def tokenizeComma(input, current, line):
    if input[current] == ',':
        return (1, Token(TokenType.COMMA, input[current], (line, current)))
    return (0, None)


def tokenizeInteger(input, current, line):
    return tokenizeRegexPattern(TokenType.INTEGER, r'\d', input, current, line)


def tokenizeOpenBracket(input, current, line):
    if input[current] == '{':
        return (1, Token(TokenType.OPEN_BRACKET, input[current], (line, current)))
    return (0, None)


def tokenizeCloseBracket(input, current, line):
    if input[current] == '}':
        return (1, Token(TokenType.CLOSE_BRACKET, input[current], (line, current)))
    return (0, None)


def tokenizeAssign(input, current, line):
    if input[current] == '=':
        return (1, Token(TokenType.ASSIGN, input[current], (line, current)))
    return (0, None)


def tokenizeColon(input, current, line):
    if input[current] == ':':
        return (1, Token(TokenType.COLON, input[current], (line, current)))
    return (0, None)


def tokenizeComment(input, current, line):
    # A '#' comment runs to the end of the line.
    if input[current] == '#':
        consumedChars = 0
        value = ''
        while current+consumedChars < len(input):
            value += input[current+consumedChars]
            consumedChars += 1
        return (consumedChars, Token(TokenType.COMMENT, value, (line, current)))
    return (0, None)


def tokenizeNote(input, current, line):
    # Notes look like '@c', '@c#', '@c4' or '@c#4.8': '@', a pitch letter,
    # an optional accidental, an optional octave digit, and an optional '.'
    # followed by duration digits.
    consumedChars = 0
    value = ''
    if input[current] == '@':
        consumedChars += 1
        value += input[current]
        if current+consumedChars < len(input) and input[current+consumedChars] in ('C', 'c', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'A', 'a', 'H', 'h', 'B', 'b'):
            value += input[current+consumedChars]
            consumedChars += 1

            if current+consumedChars < len(input) and input[current+consumedChars] in ('b', '#'):
                value += input[current+consumedChars]
                consumedChars += 1

            if current+consumedChars < len(input) and re.match(r'\d', input[current+consumedChars]):
                value += input[current+consumedChars]
                consumedChars += 1

            if current+consumedChars < len(input) and input[current+consumedChars] == '.':
                value += input[current+consumedChars]
                consumedChars += 1
                while current+consumedChars < len(input) and re.match(r'\d', input[current+consumedChars]):
                    value += input[current+consumedChars]
                    consumedChars += 1
            return (consumedChars, Token(TokenType.NOTE, value, (line, current)))
    return (0, None)


def tokenizePercent(input, current, line):
    if input[current] == '%':
        return (1, Token(TokenType.PERCENT, input[current], (line, current)))
    return (0, None)


# Order matters: tokenizeInteger must come before tokenizeIdentifier,
# because the identifier pattern (\w) also matches digits.
tokenizers = (
    tokenizeOpenParen,
    tokenizeCloseParen,
    tokenizeAsterisk,
    tokenizeString,
    tokenizeInteger,
    tokenizeNote,
    tokenizeIdentifier,
    tokenizeComma,
    tokenizeOpenBracket,
    tokenizeCloseBracket,
    tokenizeAssign,
    tokenizeColon,
    tokenizePercent,
    tokenizeComment,
    tokenizeWhitespaces
)


def tokenize(lines):
    tokens = []
    for lineNumber, line in enumerate(lines):
        current = 0
        while current < len(line):
            tokenized = False
            # Try each tokenizer in order; the first one that consumes
            # characters wins.
            for tokenizer in tokenizers:
                consumedChars, value = tokenizer(line, current, lineNumber)
                if consumedChars > 0:
                    tokens.append(value)
                    current += consumedChars
                    tokenized = True
                    break

            if not tokenized:
                raise TokenizerError(f"Line {lineNumber+1}, col {current+1}: unknown symbol '{line[current]}'")

    # Drop whitespace placeholders (type None).
    return [token for token in tokens if token.type is not None]


if __name__ == "__main__":
    try:
        with open(sys.argv[1], 'r') as source:
            lines = [line.rstrip('\n') for line in source.readlines()]

            tokens = tokenize(lines)

            for token in tokens:
                print(token)
    except TokenizerError as e:
        print(str(e))
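
A quick tokenizer check (a sketch; the input line is invented):

    from tokenizer import tokenize

    for token in tokenize(['melody = (@c4.8, @e4.8)']):
        print(token)
    # Token(TokenType.IDENTIFIER, 'melody', (0, 0))
    # Token(TokenType.ASSIGN, '=', (0, 7))
    # Token(TokenType.OPEN_PAREN, '(', (0, 9))
    # Token(TokenType.NOTE, '@c4.8', (0, 10))
    # ...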