Add unit tests for Assembler tokenizers

This commit is contained in:
2021-11-05 13:48:12 +01:00
parent 8208de8791
commit ce3af87741
4 changed files with 423 additions and 12 deletions

View File

@@ -23,7 +23,9 @@ extra-source-files: CHANGELOG.md
executable MVM executable MVM
main-is: Main.hs main-is: Main.hs
-- Modules included in this executable, other than Main. ghc-options: -Wall
-- Modules included in this executable, other than Main.
other-modules: other-modules:
VirtualMachine VirtualMachine
Assembler.Tokenizer Assembler.Tokenizer
@@ -36,5 +38,29 @@ executable MVM
base ^>=4.15.0.0, base ^>=4.15.0.0,
bytestring ^>=0.11.0.0, bytestring ^>=0.11.0.0,
containers ^>=0.6.4.1 containers ^>=0.6.4.1
hs-source-dirs: app hs-source-dirs: app
default-language: Haskell2010 default-language: Haskell2010
test-suite spec
type: exitcode-stdio-1.0
main-is: Spec.hs
hs-source-dirs:
app
test
ghc-options: -Wall
build-depends:
base ^>=4.15.0.0,
bytestring ^>=0.11.0.0,
containers ^>=0.6.4.1
, hspec ==2.*
other-modules:
Assembler.TokenizerSpec
VirtualMachine
Assembler.Tokenizer
Assembler.Parser
Util
default-language: Haskell2010
build-tool-depends: hspec-discover:hspec-discover == 2.*

View File

@@ -1,8 +1,6 @@
module Assembler.Tokenizer ( module Assembler.Tokenizer where
Token(..),
tokenize
) where
import qualified Data.List as List
import qualified Data.Char as Char import qualified Data.Char as Char
import qualified Data.Monoid as Monoid import qualified Data.Monoid as Monoid
import qualified VirtualMachine as VM import qualified VirtualMachine as VM
@@ -40,9 +38,13 @@ operatorTokenizer :: VM.Op -> Tokenizer
operatorTokenizer op input = keywordTokenizer False (U.toLowerCase . show $ op) (Operator op) input operatorTokenizer op input = keywordTokenizer False (U.toLowerCase . show $ op) (Operator op) input
tokenizeOperators :: Tokenizer tokenizeOperators :: Tokenizer
tokenizeOperators = anyTokenizer $ map operatorTokenizer [VM.Push ..] tokenizeOperators = anyTokenizer $ map operatorTokenizer ops
where
ops = List.sortBy cmp [VM.Nop ..]
cmp x y = (length . show) y `compare` (length . show) x
tokenizeIdentifier :: Tokenizer tokenizeIdentifier :: Tokenizer
tokenizeIdentifier [] = Nothing
tokenizeIdentifier input@(x:_) = if null identifier || (not . Char.isAlpha) x tokenizeIdentifier input@(x:_) = if null identifier || (not . Char.isAlpha) x
then Nothing then Nothing
else Just $ TokenizeResult (Identifier identifier) (length identifier) else Just $ TokenizeResult (Identifier identifier) (length identifier)
@@ -105,19 +107,21 @@ tokenizeComment (x:xs) = if x == ';'
type SeparatorPredicate = Char -> Bool type SeparatorPredicate = Char -> Bool
sepTokenizer :: SeparatorPredicate -> Tokenizer -> Tokenizer sepTokenizer :: SeparatorPredicate -> Tokenizer -> Tokenizer
sepTokenizer _ _ [] = Nothing
sepTokenizer pred tokenizer input = do sepTokenizer pred tokenizer input = do
(TokenizeResult token consumed) <- tokenizer input result@(TokenizeResult token consumed) <- tokenizer input
let next = drop consumed input let next = drop consumed input
let (isSep, consumed') = if null next let (isSep, _) = if null next
then (True, 0) then (True, 0)
else if pred . head $ next else if pred . head $ next
then (True, 1) then (True, 1)
else (False, 0) else (False, 0)
if isSep if isSep
then return $ TokenizeResult token (consumed + consumed') then return $ result
else Nothing else Nothing
anyTokenizer :: [Tokenizer] -> Tokenizer anyTokenizer :: [Tokenizer] -> Tokenizer
anyTokenizer _ [] = Nothing
anyTokenizer tokenizers input = Monoid.getFirst . Monoid.mconcat . map Monoid.First $ sequenceA tokenizers input anyTokenizer tokenizers input = Monoid.getFirst . Monoid.mconcat . map Monoid.First $ sequenceA tokenizers input
tokenize :: String -> Either String [Token] tokenize :: String -> Either String [Token]
@@ -138,9 +142,9 @@ tokenizers = anyTokenizer
[ keywordTokenizer False "\n" NewLine [ keywordTokenizer False "\n" NewLine
, tokenizeWhitespace , tokenizeWhitespace
, tokenizeComment , tokenizeComment
, tokenizeOperators , sepTokenizer Char.isSpace tokenizeOperators
, tokenizeHex , sepTokenizer Char.isSpace tokenizeHex
, tokenizeDecimal , sepTokenizer Char.isSpace tokenizeDecimal
, tokenizeIdentifier , tokenizeIdentifier
, keywordTokenizer False ":" Colon , keywordTokenizer False ":" Colon
, keywordTokenizer False "&" Ampersand , keywordTokenizer False "&" Ampersand

View File

@@ -0,0 +1,380 @@
module Assembler.TokenizerSpec where
import Test.Hspec
import Test.Hspec.QuickCheck
import Numeric (showHex)
import Data.Char (ord)
import Assembler.Tokenizer
import VirtualMachine
success :: Token -> Int -> Maybe TokenizeResult
success token consumed = Just $ TokenizeResult token consumed
spec :: Spec
spec = do
describe "keywordTokenizer" $ do
it "supports truncated input" $
keywordTokenizer True "hey" NewLine "hey" `shouldBe` success NewLine 3
it "supports non-truncated input" $
keywordTokenizer True "hey" NewLine "heyjude" `shouldBe` success NewLine 3
it "supports case sensitivity" $
keywordTokenizer True "hey" NewLine "heYjude" `shouldBe` Nothing
it "supports case insensitivity" $
keywordTokenizer False "hey" NewLine "heYjude" `shouldBe` success NewLine 3
it "returns correct token" $
keywordTokenizer True "hey" Colon "heyjude" `shouldBe` success Colon 3
it "returns Nothing if input does not match" $
keywordTokenizer True "hey" Colon "xheyjude" `shouldBe` Nothing
it "supports empty input" $
keywordTokenizer True "hey" Colon "" `shouldBe` Nothing
describe "operatorTokenizer" $ do
it "returns proper operator when given a truncated input" $
operatorTokenizer Push "push" `shouldBe` success (Operator Push) 4
it "returns proper operator when given a nontruncated input" $
operatorTokenizer Pop "pops" `shouldBe` success (Operator Pop) 3
it "returns Nothing if input does not match" $
operatorTokenizer Pop "poop" `shouldBe` Nothing
it "supports empty input" $
operatorTokenizer Call "" `shouldBe` Nothing
describe "tokenizeOperators" $ do
it "autodetects and returns proper operators" $ do
let ops = [Nop ..]
let input = map show ops
let expected = map (\o -> success (Operator o) (length . show $ o)) ops
map tokenizeOperators input `shouldBe` expected
it "is case insensitive" $ do
let input = ["jmp", "CALL", "pUsH"]
let expected = [ success (Operator Jmp) 3
, success (Operator Call) 4
, success (Operator Push) 4
]
map tokenizeOperators input `shouldBe` expected
it "rejects other input" $
tokenizeOperators "some unsupported input" `shouldBe` Nothing
it "supports empty input" $
tokenizeOperators "" `shouldBe` Nothing
describe "tokenizeIdentifier" $ do
it "parses correct identifier" $
tokenizeIdentifier "someId" `shouldBe` success (Identifier "someId") 6
it "parses correct identifier with numbers" $
tokenizeIdentifier "someId14" `shouldBe` success (Identifier "someId14") 8
it "parses correct identifier with underscores" $
tokenizeIdentifier "some_Id" `shouldBe` success (Identifier "some_Id") 7
it "disallows to start identifier with underscore" $
tokenizeIdentifier "_someId" `shouldBe` Nothing
it "disallows to start identifier with digit" $
tokenizeIdentifier "5someId" `shouldBe` Nothing
it "supports empty input" $
tokenizeIdentifier "" `shouldBe` Nothing
describe "tokenizeWhitespace" $ do
it "parses space" $
tokenizeWhitespace " " `shouldBe` success WhiteSpace 1
it "parses tab" $
tokenizeWhitespace "\t" `shouldBe` success WhiteSpace 1
it "parses newline" $
tokenizeWhitespace "\n" `shouldBe` success WhiteSpace 1
it "parses CR" $
tokenizeWhitespace "\r" `shouldBe` success WhiteSpace 1
it "rejects non-whitespace chars" $ do
let input = map (\x -> [x]) $ ['a' .. 'z'] ++ ['A' .. 'Z'] ++ ['0' .. '9']
let expected = take (length input) . repeat $ Nothing
map tokenizeWhitespace input `shouldBe` expected
it "supports empty input" $
tokenizeIdentifier "" `shouldBe` Nothing
describe "tokenizeDecimal" $ do
it "parses numbers from 0 to 65535" $ do
let nums = [0 .. 65535]
let input = map show nums
let expected = map (\n -> success (IntLiteral n) (length . show $ n)) nums
map tokenizeDecimal input `shouldBe` expected
it "does not support negative numbers" $ do
let nums = [-1, -2 .. -65535] :: [Integer]
let input = map show nums
let expected = take (length nums) . repeat $ Nothing
map tokenizeDecimal input `shouldBe` expected
it "rejects other input" $
tokenizeDecimal "some unsupported input" `shouldBe` Nothing
it "supports empty input" $
tokenizeDecimal "" `shouldBe` Nothing
describe "tokenizeHex" $ do
it "parses numbers from 0x0 to 0xFFFF" $ do
let nums = [0 .. 0xFFFF]
let input = map (("0x"++) . (flip showHex "")) nums
let expected = map (\n -> success (IntLiteral n) (length . ("0x"++) . (flip showHex "") $ n)) nums
map tokenizeHex input `shouldBe` expected
it "does not support negative numbers" $ do
let nums = [0 .. 0xFFFF] :: [Integer]
let input = map (("-0x"++) . (flip showHex "")) nums
let expected = take (length nums) . repeat $ Nothing
map tokenizeHex input `shouldBe` expected
it "accepts left-padded number" $
tokenizeHex "0x0010" `shouldBe` success (IntLiteral 16) 6
it "rejects other input" $
tokenizeHex "some unsupported input" `shouldBe` Nothing
it "rejects '0'" $
tokenizeHex "0" `shouldBe` Nothing
it "rejects '0x'" $
tokenizeHex "0x" `shouldBe` Nothing
it "supports empty input" $
tokenizeHex "" `shouldBe` Nothing
describe "tokenizeChar" $ do
it "parses letters literals" $ do
let chars = ['a' .. 'z'] ++ ['A' .. 'Z']
let input = map (\c -> "'" ++ [c] ++ "'") chars
let expected = map (\c -> success (IntLiteral (ord c)) 3) chars
map tokenizeChar input `shouldBe` expected
it "parses digits literals" $ do
let chars = ['0' .. '9']
let input = map (\c -> "'" ++ [c] ++ "'") chars
let expected = map (\c -> success (IntLiteral (ord c)) 3) chars
map tokenizeChar input `shouldBe` expected
it "parses regular symbols literals" $ do
let chars = "!@#$%^&*()_+-=[]{};:|,/?<>\""
let input = map (\c -> "'" ++ [c] ++ "'") chars
let expected = map (\c -> success (IntLiteral (ord c)) 3) chars
map tokenizeChar input `shouldBe` expected
it "parses escape sequences literals" $ do
let input = [ "'\\n'"
, "'\\t'"
, "'\\v'"
, "'\\b'"
, "'\\r'"
, "'\\f'"
, "'\\a'"
, "'\\\\'"
, "'\\''"
, "'\\0'"
]
let expected = map (flip success 4 . IntLiteral) [10, 9, 11, 8, 13, 12, 7, 92, 39, 0]
map tokenizeChar input `shouldBe` expected
it "consumes double-quote without escaping" $
tokenizeChar "'\"'" `shouldBe` success (IntLiteral 34) 3
it "consumes double-quote with escaping" $
tokenizeChar "'\\\"'" `shouldBe` success (IntLiteral 34) 4
it "rejects non-single char literals" $
tokenizeChar "'ab'" `shouldBe` Nothing
it "rejects non-closed char literals" $
tokenizeChar "'a" `shouldBe` Nothing
it "rejects invalid escape sequences" $
tokenizeChar "'\\x'" `shouldBe` Nothing
it "rejects empty quotes" $
tokenizeChar "''" `shouldBe` Nothing
it "rejects single quote" $
tokenizeChar "'" `shouldBe` Nothing
it "supports empty input" $
tokenizeChar "" `shouldBe` Nothing
describe "tokenizeString" $ do
it "parses correct string" $
tokenizeString "\"Hello, world!\"" `shouldBe` success (StringLiteral "Hello, world!") 15
it "parses empty string" $
tokenizeString "\"\"" `shouldBe` success (StringLiteral "") 2
it "supports digits in strings" $ do
let str = "34823843dedsef231542c4f324c24234ffsef234g"
let len = length str + 2
let input = "\"" ++ str ++ "\""
tokenizeString input `shouldBe` success (StringLiteral str) len
it "supports regular symbols in strings" $ do
let str = "!@2#$%9^&*(1)_s+2-=[2h6sh]t{};:'e|<>,./?"
let len = length str + 2
let input = "\"" ++ str ++ "\""
tokenizeString input `shouldBe` success (StringLiteral str) len
it "supports escape sequences literals" $ do
pendingWith "We need probably to fix tokenizeString since the following test fails"
-- TODO:
-- let str = "\\n\\t\\v\\b\\r\\f\\a\\\\\\\"\\0"
-- let len = length str + 2
-- let input = "\"" ++ str ++ "\""
-- let expected = map chr [10, 9, 11, 8, 13, 12, 7, 92, 34, 0]
-- tokenizeString input `shouldBe` success (StringLiteral expected) len
it "consumes single quotes without escaping" $
tokenizeString "\"'''''''''\"" `shouldBe` success (StringLiteral "'''''''''") 11
it "consumes single quotes with escaping " $
tokenizeString "\"\\'\\'\\'\\'\\'\\'\"" `shouldBe` success (StringLiteral "''''''") 14
it "rejects invalid escape sequences" $
tokenizeString "\"\\x\"" `shouldBe` Nothing
it "rejects non-closed string literals" $
tokenizeString "\"hello, world!" `shouldBe` Nothing
it "rejects single double-quote" $
tokenizeString "\"" `shouldBe` Nothing
it "rejects multilined strings" $
tokenizeString "\"first line\nsecond line\"" `shouldBe` Nothing
it "supports empty input" $
tokenizeString "" `shouldBe` Nothing
describe "tokenizeComment" $ do
it "properly consumes comment" $
tokenizeComment ";some comment\n" `shouldBe` success (Comment "some comment") 13
it "properly consumes comment with whitespace padding" $
tokenizeComment "; \t some comment \t \n \t" `shouldBe` success (Comment " \t some comment \t ") 22
it "does not treat the input as a comment if it does not start with semicolon" $
tokenizeComment "some not valid comment\n" `shouldBe` Nothing
it "expands the comment till the end of the line" $
tokenizeComment "; some comment ; push 4 push 10\nadd" `shouldBe` success (Comment " some comment ; push 4 push 10") 31
it "parses the comment at the end of the input" $
tokenizeComment "; some comment " `shouldBe` success (Comment " some comment ") 15
it "supports empty input" $
tokenizeComment "" `shouldBe` Nothing
describe "sepTokenizer" $ do
it "produces the token only when the separator is present" $ do
let input = "abc-"
let expected = success Colon 3
let tokenizer _ = expected
sepTokenizer ('-'==) tokenizer input `shouldBe` expected
it "produces the token if its the end of input even if separator is not present" $ do
let input = "abc"
let expected = success Colon 3
let tokenizer _ = expected
sepTokenizer ('-'==) tokenizer input `shouldBe` expected
it "does not produce any token when the separator is not present" $ do
let input = "abcd"
let tokenizer _ = success Colon 3
sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing
it "does not produce any token when the space is present instead" $ do
let input = "abc "
let tokenizer _ = success Colon 3
sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing
it "does not change the number of consumed chars even though it's checking the separator presence" $ do
let input = "abc-"
let expected = success Colon 3
let tokenizer _ = expected
let (Just (TokenizeResult _ consumedChars)) = sepTokenizer ('-'==) tokenizer input
consumedChars `shouldBe` 3
it "supports empty input irrespective of wrapped tokenizer" $ do
let input = ""
let tokenizer _ = success Colon 3 -- MOCK: tokenizer returns Just even though the input is empty
sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing
describe "anyTokenizer" $ do
it "returns the token if at least one subtokenizer produce that" $ do
let values = [ success Ampersand 1
, Nothing
, Nothing
, Nothing
, Nothing
]
let t = map (\x -> (\_ -> x)) values
anyTokenizer t "some not important input" `shouldBe` success Ampersand 1
it "returns the token of the first matching subtokenizer" $ do
let values = [ Nothing
, Nothing
, success (IntLiteral 4) 1
, Nothing
, Nothing
, success (StringLiteral "not me") 8
, Nothing
, success (StringLiteral "me neither") 12
, Nothing
, success Ampersand 1
, Nothing
, Nothing
, success Colon 1
, Nothing
]
let t = map (\x -> (\_ -> x)) values
anyTokenizer t "some not important input" `shouldBe` success (IntLiteral 4) 1
it "returns Nothing if no one of the tokenizers matches the input" $ do
let values = [ Nothing
, Nothing
, Nothing
, Nothing
, Nothing
]
let t = map (\x -> (\_ -> x)) values
anyTokenizer t "some not important input" `shouldBe` Nothing
it "always returns Nothing if no tokenizers are defined" $
anyTokenizer [] "some not important input" `shouldBe` Nothing
it "supports empty input irrespective of wrapped tokenizers" $ do
let input = ""
let values = [ success Ampersand 1
, success Colon 1
, success (IntLiteral 3) 1
, success (Operator Push) 4
]
let t = map (\x -> (\_ -> x)) values
anyTokenizer t input `shouldBe` Nothing
describe "tokenFilter" $ do
it "filters out whitespaces and comments" $ do
let tokens = [ Operator Push
, IntLiteral 4
, Comment "here is the identifier"
, Identifier "someId"
, WhiteSpace
, Colon
, WhiteSpace
, Ampersand
, NewLine
, WhiteSpace
, Comment "some comment"
]
let expected = [ Operator Push
, IntLiteral 4
, Identifier "someId"
, Colon
, Ampersand
, NewLine
]
filter tokenFilter tokens `shouldBe` expected
describe "tokenize" $ do
it "treats 'pop' as a operator instead of identifier" $
tokenize "pop" `shouldBe` Right [Operator Pop]
it "treats 'poop' as a identifier" $
tokenize "poop" `shouldBe` Right [Identifier "poop"]
it "treats operator as themselves instead of identifiers" $ do
let ops = [Nop ..]
let input = map show ops
let expected = map (\o -> Right [Operator o]) ops
map tokenize input `shouldBe` expected
it "treats operator-like names (with 's' appended) as identifiers" $ do
let ops = [Nop ..]
let input = map ((++"s") . show) ops
let expected = map (\i-> Right [Identifier i]) input
map tokenize input `shouldBe` expected
it "treats '\n' as a newline instead of whitespace" $
tokenize "\n" `shouldBe` Right [NewLine]
it "ignores comments" $ do
let input = "; this is some comment \n\
\ ; this is another comment"
tokenize input `shouldBe` Right [NewLine]
it "interprets 'push 5\npush5' as operator, int literal, NL and identifier" $ do
let input = "push 5\npush5"
let expected = [Operator Push, IntLiteral 5, NewLine, Identifier "push5"]
tokenize input `shouldBe` Right expected
it "treats 'someId1234' as 'someId1234' identifier instead of 'someId' identifier and 1234 int" $
tokenize "someId1234" `shouldBe` Right [Identifier "someId1234"]
it "accepts 'main: NL" $
tokenize "main: \n" `shouldBe` Right [Identifier "main", Colon, NewLine]
it "accepts 'call &sum NL" $
tokenize "call &sum \n" `shouldBe` Right [Operator Call, Ampersand, Identifier "sum", NewLine]
it "rejects '4push'" $
tokenize "4push" `shouldBe` Left "Unknown token: 4push"
it "interprets example #1" $ do
let input = "main: ; here we define some main label\n\
\ push 7 ; we push 7 to the stack\n\
\ push 0x04 ; we push 4 to the stack\n\
\ call &sum ; we call 'sum' subprogram\n\
\ halt \n\
\ \n\
\ sum: add\n\
\ ret"
let expected = [ Identifier "main", Colon, NewLine
, Operator Push, IntLiteral 7, NewLine
, Operator Push, IntLiteral 4, NewLine
, Operator Call, Ampersand, Identifier "sum", NewLine
, Operator Halt, NewLine
, NewLine
, Identifier "sum", Colon, Operator Add, NewLine
, Operator Ret
]
tokenize input `shouldBe` Right expected

1
test/Spec.hs Normal file
View File

@@ -0,0 +1 @@
{-# OPTIONS_GHC -F -pgmF hspec-discover #-}