-- mvm/app/Assembler/Tokenizer.hs
module Assembler.Tokenizer where
import Data.List (sortBy)
import Data.Char (ord, isDigit, isSpace, isAlpha, isAlphaNum, isHexDigit)
import Data.Monoid (First(..))
import VirtualMachine.VM (Op(..))
import Util (toLowerCase, controlChar, unescape, maybeToEither)
import Control.Monad (guard)
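
-- | A single lexical token. 'WhiteSpace' and 'Comment' tokens are produced
-- during scanning but filtered out of the final result by 'tokenize'.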
data Token = Operator Op
           | IntLiteral Int
           | StringLiteral String
           | Identifier String
           | Colon
           | Ampersand
           | Dot
           | NewLine
           | WhiteSpace
           | Comment String
           deriving (Eq, Show)
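
-- | A successful match: the recognised token together with the number of
-- input characters it consumed. A 'Tokenizer' returns 'Nothing' when it
-- does not match at the start of the input.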
type ConsumedChars = Int
data TokenizeResult = TokenizeResult Token ConsumedChars deriving (Eq, Show)
type Tokenizer = String -> Maybe TokenizeResult
type CaseSensitive = Bool
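
-- | Matches a fixed keyword at the start of the input (optionally
-- case-insensitively) and yields the supplied token.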
keywordTokenizer :: CaseSensitive -> String -> Token -> Tokenizer
keywordTokenizer _ _ _ [] = Nothing
keywordTokenizer cs kwd token input = guard matches >> return (TokenizeResult token len)
  where len = length kwd
        matches = mapper kwd == mapper (take len input)
        mapper = if cs then id else toLowerCase
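
-- | Builds a case-insensitive tokenizer for a single VM opcode, using its
-- 'show' representation as the mnemonic.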
operatorTokenizer :: Op -> Tokenizer
operatorTokenizer op = keywordTokenizer False (toLowerCase . show $ op) (Operator op)
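
-- | Tries every opcode, longest mnemonic first, so that an opcode whose
-- name is a prefix of another cannot shadow the longer one.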
tokenizeOperators :: Tokenizer
tokenizeOperators = anyTokenizer $ map operatorTokenizer $ sortBy cmp [Nop ..]
  where cmp x y = (length . show) y `compare` (length . show) x
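
-- | An identifier starts with a letter and continues with letters, digits
-- or underscores.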
tokenizeIdentifier :: Tokenizer
tokenizeIdentifier [] = Nothing
tokenizeIdentifier input@(x:_) = guard (not $ null identifier) >> guard (isAlpha x) >> return token
  where identifier = takeWhile (or . sequenceA [isAlphaNum, (=='_')]) input
        token = TokenizeResult (Identifier identifier) (length identifier)
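
-- | Consumes a single whitespace character.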
tokenizeWhitespace :: Tokenizer
tokenizeWhitespace [] = Nothing
tokenizeWhitespace (x:_) = guard (isSpace x) >> return (TokenizeResult WhiteSpace 1)
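
-- | Consumes a run of decimal digits and yields it as an 'IntLiteral'.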
tokenizeDecimal :: Tokenizer
tokenizeDecimal [] = Nothing
tokenizeDecimal input = guard (not $ null numberStr) >> return token
  where numberStr = takeWhile isDigit input
        token = TokenizeResult (IntLiteral $ read numberStr) $ length numberStr
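
-- | Consumes a hexadecimal literal of the form @0x...@; the @0x@ prefix is
-- kept so that 'read' parses the digits as hexadecimal.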
tokenizeHex :: Tokenizer
tokenizeHex ('0':'x':input) = guard (not $ null numberStr) >> return token
  where numberStr = takeWhile isHexDigit input
        token = TokenizeResult (IntLiteral $ read $ "0x" ++ numberStr) (length numberStr + 2)
tokenizeHex _ = Nothing
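
-- | Consumes a character literal such as @'a'@, or an escaped one such as
-- @'\n'@ (resolved via 'controlChar'), yielding its character code.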
tokenizeChar :: Tokenizer
tokenizeChar ('\'':'\\':x:'\'':_) = controlChar x >>= \s -> return $ TokenizeResult (IntLiteral s) 4
tokenizeChar ('\'':x:'\'':_) = return $ TokenizeResult (IntLiteral . ord $ x) 3
tokenizeChar _ = Nothing
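
-- | Consumes a double-quoted string literal. The literal may not span
-- multiple lines; escape sequences are resolved by 'unescape'.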
tokenizeString :: Tokenizer
tokenizeString ('"':xs) = do
  string <- extractString xs
  unescaped <- unescape string
  return $ TokenizeResult (StringLiteral unescaped) (length string + 2)
  where
    extractString [] = Nothing
    extractString (y:ys)
      | y == '"'  = Just []
      | y == '\n' = Nothing
      | otherwise = (y:) <$> extractString ys
tokenizeString _ = Nothing
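
-- | A comment starts with @;@ and extends to the end of the line.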
tokenizeComment :: Tokenizer
tokenizeComment (';':xs) = return $ TokenizeResult (Comment comment) (length comment + 1)
  where comment = takeWhile (/='\n') xs
tokenizeComment _ = Nothing
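
-- | Wraps a tokenizer so that its match is only accepted when it is
-- followed by a separator character (or by the end of the input).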
type SeparatorPredicate = Char -> Bool
sepTokenizer :: SeparatorPredicate -> Tokenizer -> Tokenizer
sepTokenizer _ _ [] = Nothing
sepTokenizer predicate tokenizer input = do
  result@(TokenizeResult _ consumed) <- tokenizer input
  let next = drop consumed input
  guard $ null next || (predicate . head $ next)
  return result
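
-- | Runs the given tokenizers against the same input and keeps the first
-- successful result (via the 'First' monoid).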
anyTokenizer :: [Tokenizer] -> Tokenizer
anyTokenizer _ [] = Nothing
anyTokenizer tokenizers input = getFirst . mconcat . map First $ sequenceA tokenizers input
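
-- | Splits the whole input into tokens, dropping whitespace and comments,
-- and reports an "Unknown token" error with the offending prefix when no
-- tokenizer matches. For example, @tokenize "nop ; end"@ should yield
-- @Right [Operator Nop]@.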
tokenize :: String -> Either String [Token]
tokenize [] = Right []
tokenize input = tokens
  >>= (\(TokenizeResult token chars) -> (token:) <$> tokenize (drop chars input))
  >>= (Right . filter tokenFilter)
  where tokens = maybeToEither (tokenizers input) $ "Unknown token: " ++ take 20 input
        tokenizers = anyTokenizer
          [ keywordTokenizer False "\n" NewLine
          , tokenizeWhitespace
          , tokenizeComment
          , sepTokenizer isSpace tokenizeOperators
          , sepTokenizer isSpace tokenizeHex
          , sepTokenizer isSpace tokenizeDecimal
          , tokenizeIdentifier
          , keywordTokenizer False ":" Colon
          , keywordTokenizer False "&" Ampersand
          , keywordTokenizer False "." Dot
          , tokenizeChar
          , tokenizeString
          ]
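
-- | Keeps only the tokens that are meaningful to the parser.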
tokenFilter :: Token -> Bool
tokenFilter WhiteSpace = False
tokenFilter (Comment _) = False
tokenFilter _ = True