module Assembler
  ( tokenize
  ) where

import Data.Char as Char
import Data.Monoid as Monoid

import qualified VirtualMachine as VM (Op(..), Instruction, Command, instructionByOp)
import qualified Util as U

data Token = Operator VM.Op
           | IntLiteral Int
           | StringLiteral String
           | Identifier String
           | Colon
           | Ampersand
           | NewLine
           | WhiteSpace
           | Comment String
           deriving (Eq, Show)

type ConsumedChars = Int

-- A successful match carries the token plus the number of input characters it consumed.
data TokenizeResult = TokenizeResult Token ConsumedChars deriving (Eq, Show)

-- A tokenizer inspects the front of the input and either matches or returns Nothing.
type Tokenizer = String -> Maybe TokenizeResult

type CaseSensitive = Bool

-- Matches a fixed keyword, optionally case-insensitively.
keywordTokenizer :: CaseSensitive -> String -> Token -> Tokenizer
keywordTokenizer _ _ _ [] = Nothing
keywordTokenizer cs kwd token input
  | matches   = Just $ TokenizeResult token len
  | otherwise = Nothing
  where
    len     = length kwd
    mapper  = if cs then id else U.toLowerCase
    zipped  = zipWith (==) (mapper kwd) (mapper . take len $ input)
    matches = and zipped && len == length zipped

-- An operator mnemonic is simply its constructor name, lower-cased and used as a keyword.
operatorTokenizer :: VM.Op -> Tokenizer
operatorTokenizer op = keywordTokenizer False (U.toLowerCase . show $ op) (Operator op)

tokenizeOperators :: Tokenizer
tokenizeOperators = anyTokenizer $ map operatorTokenizer [VM.Push ..]

-- Identifiers start with a letter and continue with letters, digits, or underscores.
tokenizeIdentifier :: Tokenizer
tokenizeIdentifier [] = Nothing
tokenizeIdentifier input@(x:_)
  | null identifier || (not . Char.isAlpha) x = Nothing
  | otherwise = Just $ TokenizeResult (Identifier identifier) (length identifier)
  where identifier = takeWhile (or . sequenceA [Char.isAlphaNum, (== '_')]) input

tokenizeWhitespace :: Tokenizer
tokenizeWhitespace [] = Nothing
tokenizeWhitespace (x:_)
  | Char.isSpace x = Just $ TokenizeResult WhiteSpace 1
  | otherwise      = Nothing

tokenizeDecimal :: Tokenizer
tokenizeDecimal [] = Nothing
tokenizeDecimal input
  | null numberStr = Nothing
  | otherwise      = Just $ TokenizeResult (IntLiteral number) len
  where
    number    = read numberStr
    len       = length numberStr
    numberStr = takeWhile Char.isDigit input

-- Hex literals use a "0x" prefix; the prefix accounts for the extra two consumed characters.
tokenizeHex :: Tokenizer
tokenizeHex [] = Nothing
tokenizeHex input
  | isPrefix && len > 0 = Just $ TokenizeResult (IntLiteral number) (len + 2)
  | otherwise           = Nothing
  where
    isPrefix  = take 2 input == "0x"
    number    = read ("0x" ++ numberStr)
    len       = length numberStr
    numberStr = takeWhile Char.isHexDigit (drop 2 input)

-- Character literals become their numeric code, e.g. 'a' or an escaped form like '\n'.
tokenizeChar :: Tokenizer
tokenizeChar ('\'':'\\':x:'\'':_) = U.controlChar x >>= (\s -> return $ TokenizeResult (IntLiteral s) 4)
tokenizeChar ('\'':x:'\'':_)      = Just $ TokenizeResult (IntLiteral . Char.ord $ x) 3
tokenizeChar _                    = Nothing

-- String literals are double-quoted, single-line, and unescaped before being emitted.
tokenizeString :: Tokenizer
tokenizeString ('"':xs) = do
  string    <- extractString xs
  unescaped <- U.unescape string
  return $ TokenizeResult (StringLiteral unescaped) (length string + 2)
  where
    extractString [] = Nothing
    extractString (x:rest)
      | x == '"'  = Just []
      | x == '\n' = Nothing
      | otherwise = extractString rest >>= (\r -> return $ x : r)
tokenizeString _ = Nothing

-- Comments run from ';' to the end of the line; the newline itself is not consumed.
tokenizeComment :: Tokenizer
tokenizeComment [] = Nothing
tokenizeComment (x:xs)
  | x == ';'  = Just $ TokenizeResult (Comment comment) (len + 1)
  | otherwise = Nothing
  where
    len     = length comment
    comment = takeWhile (/= '\n') xs

type SeparatorPredicate = Char -> Bool

-- Wraps a tokenizer so its match must be followed by a separator (or end of input);
-- a single separator character, if present, is consumed along with the token.
sepTokenizer :: SeparatorPredicate -> Tokenizer -> Tokenizer
sepTokenizer sep tokenizer input = do
  (TokenizeResult token consumed) <- tokenizer input
  let next = drop consumed input
  let (isSep, consumed') = if null next
                             then (True, 0)
                             else if sep (head next)
                                    then (True, 1)
                                    else (False, 0)
  if isSep
    then return $ TokenizeResult token (consumed + consumed')
    else Nothing

-- Runs every tokenizer on the same input and keeps the first successful result.
anyTokenizer :: [Tokenizer] -> Tokenizer
anyTokenizer ts input = Monoid.getFirst . Monoid.mconcat . map Monoid.First $ sequenceA ts input

tokenize :: String -> Either String [Token]
tokenize [] = Right []
tokenize input = case tokenizers input of
  Just (TokenizeResult token chars) -> tokenize (drop chars input) >>= (\rest -> return $ token : rest)
  Nothing                           -> Left $ "Unknown token: " ++ take 20 input

-- Order matters: earlier tokenizers take precedence inside anyTokenizer.
tokenizers :: Tokenizer
tokenizers = anyTokenizer [ keywordTokenizer False "\n" NewLine
                          , tokenizeWhitespace
                          , tokenizeComment
                          , sepTokenizer Char.isSpace tokenizeOperators
                          , sepTokenizer Char.isSpace tokenizeHex
                          , sepTokenizer Char.isSpace tokenizeDecimal
                          , tokenizeIdentifier
                          , keywordTokenizer False ":" Colon
                          , keywordTokenizer False "&" Ampersand
                          , tokenizeChar
                          , tokenizeString
                          ]
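
-- Usage sketch (GHCi): how 'tokenize' splits one line of assembly. The Operator
-- result below assumes VM.Op has a 'Push' constructor with a derived Show instance
-- and that U.toLowerCase lower-cases a String; both are assumptions about modules
-- not shown here, so the exact output may differ.
--
-- >>> tokenize "start: push 42 ; entry point"
-- Right [Identifier "start",Colon,WhiteSpace,Operator Push,IntLiteral 42,Comment " entry point"]
--
-- Note that the spaces after "push" and "42" do not appear as WhiteSpace tokens:
-- sepTokenizer consumes the separator as part of the preceding token.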