From ce3af877418f61546499a33cd5f872ffede2ec89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Przemys=C5=82aw=20Pluta?= Date: Fri, 5 Nov 2021 13:48:12 +0100 Subject: [PATCH] Add unit tests for Assembler tokenizers --- MVM.cabal | 28 ++- app/Assembler/Tokenizer.hs | 26 ++- test/Assembler/TokenizerSpec.hs | 380 ++++++++++++++++++++++++++++++++ test/Spec.hs | 1 + 4 files changed, 423 insertions(+), 12 deletions(-) create mode 100644 test/Assembler/TokenizerSpec.hs create mode 100644 test/Spec.hs diff --git a/MVM.cabal b/MVM.cabal index 253141e..995415e 100644 --- a/MVM.cabal +++ b/MVM.cabal @@ -23,7 +23,9 @@ extra-source-files: CHANGELOG.md executable MVM main-is: Main.hs - -- Modules included in this executable, other than Main. + ghc-options: -Wall + + -- Modules included in this executable, other than Main. other-modules: VirtualMachine Assembler.Tokenizer @@ -36,5 +38,29 @@ executable MVM base ^>=4.15.0.0, bytestring ^>=0.11.0.0, containers ^>=0.6.4.1 + hs-source-dirs: app default-language: Haskell2010 + +test-suite spec + type: exitcode-stdio-1.0 + main-is: Spec.hs + hs-source-dirs: + app + test + ghc-options: -Wall + build-depends: + base ^>=4.15.0.0, + bytestring ^>=0.11.0.0, + containers ^>=0.6.4.1 + , hspec ==2.* + other-modules: + Assembler.TokenizerSpec + + VirtualMachine + Assembler.Tokenizer + Assembler.Parser + Util + + default-language: Haskell2010 + build-tool-depends: hspec-discover:hspec-discover == 2.* diff --git a/app/Assembler/Tokenizer.hs b/app/Assembler/Tokenizer.hs index 514cff4..2a6d1be 100644 --- a/app/Assembler/Tokenizer.hs +++ b/app/Assembler/Tokenizer.hs @@ -1,8 +1,6 @@ -module Assembler.Tokenizer ( - Token(..), - tokenize -) where +module Assembler.Tokenizer where +import qualified Data.List as List import qualified Data.Char as Char import qualified Data.Monoid as Monoid import qualified VirtualMachine as VM @@ -40,9 +38,13 @@ operatorTokenizer :: VM.Op -> Tokenizer operatorTokenizer op input = keywordTokenizer False (U.toLowerCase . show $ op) (Operator op) input tokenizeOperators :: Tokenizer -tokenizeOperators = anyTokenizer $ map operatorTokenizer [VM.Push ..] +tokenizeOperators = anyTokenizer $ map operatorTokenizer ops + where + ops = List.sortBy cmp [VM.Nop ..] + cmp x y = (length . show) y `compare` (length . show) x tokenizeIdentifier :: Tokenizer +tokenizeIdentifier [] = Nothing tokenizeIdentifier input@(x:_) = if null identifier || (not . Char.isAlpha) x then Nothing else Just $ TokenizeResult (Identifier identifier) (length identifier) @@ -105,19 +107,21 @@ tokenizeComment (x:xs) = if x == ';' type SeparatorPredicate = Char -> Bool sepTokenizer :: SeparatorPredicate -> Tokenizer -> Tokenizer +sepTokenizer _ _ [] = Nothing sepTokenizer pred tokenizer input = do - (TokenizeResult token consumed) <- tokenizer input + result@(TokenizeResult token consumed) <- tokenizer input let next = drop consumed input - let (isSep, consumed') = if null next + let (isSep, _) = if null next then (True, 0) else if pred . head $ next then (True, 1) else (False, 0) if isSep - then return $ TokenizeResult token (consumed + consumed') + then return $ result else Nothing anyTokenizer :: [Tokenizer] -> Tokenizer +anyTokenizer _ [] = Nothing anyTokenizer tokenizers input = Monoid.getFirst . Monoid.mconcat . map Monoid.First $ sequenceA tokenizers input tokenize :: String -> Either String [Token] @@ -138,9 +142,9 @@ tokenizers = anyTokenizer [ keywordTokenizer False "\n" NewLine , tokenizeWhitespace , tokenizeComment - , tokenizeOperators - , tokenizeHex - , tokenizeDecimal + , sepTokenizer Char.isSpace tokenizeOperators + , sepTokenizer Char.isSpace tokenizeHex + , sepTokenizer Char.isSpace tokenizeDecimal , tokenizeIdentifier , keywordTokenizer False ":" Colon , keywordTokenizer False "&" Ampersand diff --git a/test/Assembler/TokenizerSpec.hs b/test/Assembler/TokenizerSpec.hs new file mode 100644 index 0000000..f28a43c --- /dev/null +++ b/test/Assembler/TokenizerSpec.hs @@ -0,0 +1,380 @@ +module Assembler.TokenizerSpec where + +import Test.Hspec +import Test.Hspec.QuickCheck +import Numeric (showHex) +import Data.Char (ord) + +import Assembler.Tokenizer +import VirtualMachine + +success :: Token -> Int -> Maybe TokenizeResult +success token consumed = Just $ TokenizeResult token consumed + +spec :: Spec +spec = do + describe "keywordTokenizer" $ do + it "supports truncated input" $ + keywordTokenizer True "hey" NewLine "hey" `shouldBe` success NewLine 3 + it "supports non-truncated input" $ + keywordTokenizer True "hey" NewLine "heyjude" `shouldBe` success NewLine 3 + it "supports case sensitivity" $ + keywordTokenizer True "hey" NewLine "heYjude" `shouldBe` Nothing + it "supports case insensitivity" $ + keywordTokenizer False "hey" NewLine "heYjude" `shouldBe` success NewLine 3 + it "returns correct token" $ + keywordTokenizer True "hey" Colon "heyjude" `shouldBe` success Colon 3 + it "returns Nothing if input does not match" $ + keywordTokenizer True "hey" Colon "xheyjude" `shouldBe` Nothing + it "supports empty input" $ + keywordTokenizer True "hey" Colon "" `shouldBe` Nothing + + describe "operatorTokenizer" $ do + it "returns proper operator when given a truncated input" $ + operatorTokenizer Push "push" `shouldBe` success (Operator Push) 4 + it "returns proper operator when given a nontruncated input" $ + operatorTokenizer Pop "pops" `shouldBe` success (Operator Pop) 3 + it "returns Nothing if input does not match" $ + operatorTokenizer Pop "poop" `shouldBe` Nothing + it "supports empty input" $ + operatorTokenizer Call "" `shouldBe` Nothing + + describe "tokenizeOperators" $ do + it "autodetects and returns proper operators" $ do + let ops = [Nop ..] + let input = map show ops + let expected = map (\o -> success (Operator o) (length . show $ o)) ops + map tokenizeOperators input `shouldBe` expected + it "is case insensitive" $ do + let input = ["jmp", "CALL", "pUsH"] + let expected = [ success (Operator Jmp) 3 + , success (Operator Call) 4 + , success (Operator Push) 4 + ] + map tokenizeOperators input `shouldBe` expected + it "rejects other input" $ + tokenizeOperators "some unsupported input" `shouldBe` Nothing + it "supports empty input" $ + tokenizeOperators "" `shouldBe` Nothing + + describe "tokenizeIdentifier" $ do + it "parses correct identifier" $ + tokenizeIdentifier "someId" `shouldBe` success (Identifier "someId") 6 + it "parses correct identifier with numbers" $ + tokenizeIdentifier "someId14" `shouldBe` success (Identifier "someId14") 8 + it "parses correct identifier with underscores" $ + tokenizeIdentifier "some_Id" `shouldBe` success (Identifier "some_Id") 7 + it "disallows to start identifier with underscore" $ + tokenizeIdentifier "_someId" `shouldBe` Nothing + it "disallows to start identifier with digit" $ + tokenizeIdentifier "5someId" `shouldBe` Nothing + it "supports empty input" $ + tokenizeIdentifier "" `shouldBe` Nothing + + describe "tokenizeWhitespace" $ do + it "parses space" $ + tokenizeWhitespace " " `shouldBe` success WhiteSpace 1 + it "parses tab" $ + tokenizeWhitespace "\t" `shouldBe` success WhiteSpace 1 + it "parses newline" $ + tokenizeWhitespace "\n" `shouldBe` success WhiteSpace 1 + it "parses CR" $ + tokenizeWhitespace "\r" `shouldBe` success WhiteSpace 1 + it "rejects non-whitespace chars" $ do + let input = map (\x -> [x]) $ ['a' .. 'z'] ++ ['A' .. 'Z'] ++ ['0' .. '9'] + let expected = take (length input) . repeat $ Nothing + map tokenizeWhitespace input `shouldBe` expected + it "supports empty input" $ + tokenizeIdentifier "" `shouldBe` Nothing + + describe "tokenizeDecimal" $ do + it "parses numbers from 0 to 65535" $ do + let nums = [0 .. 65535] + let input = map show nums + let expected = map (\n -> success (IntLiteral n) (length . show $ n)) nums + map tokenizeDecimal input `shouldBe` expected + it "does not support negative numbers" $ do + let nums = [-1, -2 .. -65535] :: [Integer] + let input = map show nums + let expected = take (length nums) . repeat $ Nothing + map tokenizeDecimal input `shouldBe` expected + it "rejects other input" $ + tokenizeDecimal "some unsupported input" `shouldBe` Nothing + it "supports empty input" $ + tokenizeDecimal "" `shouldBe` Nothing + + describe "tokenizeHex" $ do + it "parses numbers from 0x0 to 0xFFFF" $ do + let nums = [0 .. 0xFFFF] + let input = map (("0x"++) . (flip showHex "")) nums + let expected = map (\n -> success (IntLiteral n) (length . ("0x"++) . (flip showHex "") $ n)) nums + map tokenizeHex input `shouldBe` expected + it "does not support negative numbers" $ do + let nums = [0 .. 0xFFFF] :: [Integer] + let input = map (("-0x"++) . (flip showHex "")) nums + let expected = take (length nums) . repeat $ Nothing + map tokenizeHex input `shouldBe` expected + it "accepts left-padded number" $ + tokenizeHex "0x0010" `shouldBe` success (IntLiteral 16) 6 + it "rejects other input" $ + tokenizeHex "some unsupported input" `shouldBe` Nothing + it "rejects '0'" $ + tokenizeHex "0" `shouldBe` Nothing + it "rejects '0x'" $ + tokenizeHex "0x" `shouldBe` Nothing + it "supports empty input" $ + tokenizeHex "" `shouldBe` Nothing + + describe "tokenizeChar" $ do + it "parses letters literals" $ do + let chars = ['a' .. 'z'] ++ ['A' .. 'Z'] + let input = map (\c -> "'" ++ [c] ++ "'") chars + let expected = map (\c -> success (IntLiteral (ord c)) 3) chars + map tokenizeChar input `shouldBe` expected + it "parses digits literals" $ do + let chars = ['0' .. '9'] + let input = map (\c -> "'" ++ [c] ++ "'") chars + let expected = map (\c -> success (IntLiteral (ord c)) 3) chars + map tokenizeChar input `shouldBe` expected + it "parses regular symbols literals" $ do + let chars = "!@#$%^&*()_+-=[]{};:|,/?<>\"" + let input = map (\c -> "'" ++ [c] ++ "'") chars + let expected = map (\c -> success (IntLiteral (ord c)) 3) chars + map tokenizeChar input `shouldBe` expected + it "parses escape sequences literals" $ do + let input = [ "'\\n'" + , "'\\t'" + , "'\\v'" + , "'\\b'" + , "'\\r'" + , "'\\f'" + , "'\\a'" + , "'\\\\'" + , "'\\''" + , "'\\0'" + ] + let expected = map (flip success 4 . IntLiteral) [10, 9, 11, 8, 13, 12, 7, 92, 39, 0] + map tokenizeChar input `shouldBe` expected + it "consumes double-quote without escaping" $ + tokenizeChar "'\"'" `shouldBe` success (IntLiteral 34) 3 + it "consumes double-quote with escaping" $ + tokenizeChar "'\\\"'" `shouldBe` success (IntLiteral 34) 4 + it "rejects non-single char literals" $ + tokenizeChar "'ab'" `shouldBe` Nothing + it "rejects non-closed char literals" $ + tokenizeChar "'a" `shouldBe` Nothing + it "rejects invalid escape sequences" $ + tokenizeChar "'\\x'" `shouldBe` Nothing + it "rejects empty quotes" $ + tokenizeChar "''" `shouldBe` Nothing + it "rejects single quote" $ + tokenizeChar "'" `shouldBe` Nothing + it "supports empty input" $ + tokenizeChar "" `shouldBe` Nothing + + describe "tokenizeString" $ do + it "parses correct string" $ + tokenizeString "\"Hello, world!\"" `shouldBe` success (StringLiteral "Hello, world!") 15 + it "parses empty string" $ + tokenizeString "\"\"" `shouldBe` success (StringLiteral "") 2 + it "supports digits in strings" $ do + let str = "34823843dedsef231542c4f324c24234ffsef234g" + let len = length str + 2 + let input = "\"" ++ str ++ "\"" + tokenizeString input `shouldBe` success (StringLiteral str) len + it "supports regular symbols in strings" $ do + let str = "!@2#$%9^&*(1)_s+2-=[2h6sh]t{};:'e|<>,./?" + let len = length str + 2 + let input = "\"" ++ str ++ "\"" + tokenizeString input `shouldBe` success (StringLiteral str) len + it "supports escape sequences literals" $ do + pendingWith "We need probably to fix tokenizeString since the following test fails" + -- TODO: + -- let str = "\\n\\t\\v\\b\\r\\f\\a\\\\\\\"\\0" + -- let len = length str + 2 + -- let input = "\"" ++ str ++ "\"" + -- let expected = map chr [10, 9, 11, 8, 13, 12, 7, 92, 34, 0] + -- tokenizeString input `shouldBe` success (StringLiteral expected) len + it "consumes single quotes without escaping" $ + tokenizeString "\"'''''''''\"" `shouldBe` success (StringLiteral "'''''''''") 11 + it "consumes single quotes with escaping " $ + tokenizeString "\"\\'\\'\\'\\'\\'\\'\"" `shouldBe` success (StringLiteral "''''''") 14 + it "rejects invalid escape sequences" $ + tokenizeString "\"\\x\"" `shouldBe` Nothing + it "rejects non-closed string literals" $ + tokenizeString "\"hello, world!" `shouldBe` Nothing + it "rejects single double-quote" $ + tokenizeString "\"" `shouldBe` Nothing + it "rejects multilined strings" $ + tokenizeString "\"first line\nsecond line\"" `shouldBe` Nothing + it "supports empty input" $ + tokenizeString "" `shouldBe` Nothing + + describe "tokenizeComment" $ do + it "properly consumes comment" $ + tokenizeComment ";some comment\n" `shouldBe` success (Comment "some comment") 13 + it "properly consumes comment with whitespace padding" $ + tokenizeComment "; \t some comment \t \n \t" `shouldBe` success (Comment " \t some comment \t ") 22 + it "does not treat the input as a comment if it does not start with semicolon" $ + tokenizeComment "some not valid comment\n" `shouldBe` Nothing + it "expands the comment till the end of the line" $ + tokenizeComment "; some comment ; push 4 push 10\nadd" `shouldBe` success (Comment " some comment ; push 4 push 10") 31 + it "parses the comment at the end of the input" $ + tokenizeComment "; some comment " `shouldBe` success (Comment " some comment ") 15 + it "supports empty input" $ + tokenizeComment "" `shouldBe` Nothing + + describe "sepTokenizer" $ do + it "produces the token only when the separator is present" $ do + let input = "abc-" + let expected = success Colon 3 + let tokenizer _ = expected + sepTokenizer ('-'==) tokenizer input `shouldBe` expected + it "produces the token if its the end of input even if separator is not present" $ do + let input = "abc" + let expected = success Colon 3 + let tokenizer _ = expected + sepTokenizer ('-'==) tokenizer input `shouldBe` expected + it "does not produce any token when the separator is not present" $ do + let input = "abcd" + let tokenizer _ = success Colon 3 + sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing + it "does not produce any token when the space is present instead" $ do + let input = "abc " + let tokenizer _ = success Colon 3 + sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing + it "does not change the number of consumed chars even though it's checking the separator presence" $ do + let input = "abc-" + let expected = success Colon 3 + let tokenizer _ = expected + let (Just (TokenizeResult _ consumedChars)) = sepTokenizer ('-'==) tokenizer input + consumedChars `shouldBe` 3 + it "supports empty input irrespective of wrapped tokenizer" $ do + let input = "" + let tokenizer _ = success Colon 3 -- MOCK: tokenizer returns Just even though the input is empty + sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing + + describe "anyTokenizer" $ do + it "returns the token if at least one subtokenizer produce that" $ do + let values = [ success Ampersand 1 + , Nothing + , Nothing + , Nothing + , Nothing + ] + let t = map (\x -> (\_ -> x)) values + anyTokenizer t "some not important input" `shouldBe` success Ampersand 1 + it "returns the token of the first matching subtokenizer" $ do + let values = [ Nothing + , Nothing + , success (IntLiteral 4) 1 + , Nothing + , Nothing + , success (StringLiteral "not me") 8 + , Nothing + , success (StringLiteral "me neither") 12 + , Nothing + , success Ampersand 1 + , Nothing + , Nothing + , success Colon 1 + , Nothing + ] + let t = map (\x -> (\_ -> x)) values + anyTokenizer t "some not important input" `shouldBe` success (IntLiteral 4) 1 + it "returns Nothing if no one of the tokenizers matches the input" $ do + let values = [ Nothing + , Nothing + , Nothing + , Nothing + , Nothing + ] + let t = map (\x -> (\_ -> x)) values + anyTokenizer t "some not important input" `shouldBe` Nothing + it "always returns Nothing if no tokenizers are defined" $ + anyTokenizer [] "some not important input" `shouldBe` Nothing + it "supports empty input irrespective of wrapped tokenizers" $ do + let input = "" + let values = [ success Ampersand 1 + , success Colon 1 + , success (IntLiteral 3) 1 + , success (Operator Push) 4 + ] + let t = map (\x -> (\_ -> x)) values + anyTokenizer t input `shouldBe` Nothing + + describe "tokenFilter" $ do + it "filters out whitespaces and comments" $ do + let tokens = [ Operator Push + , IntLiteral 4 + , Comment "here is the identifier" + , Identifier "someId" + , WhiteSpace + , Colon + , WhiteSpace + , Ampersand + , NewLine + , WhiteSpace + , Comment "some comment" + ] + let expected = [ Operator Push + , IntLiteral 4 + , Identifier "someId" + , Colon + , Ampersand + , NewLine + ] + filter tokenFilter tokens `shouldBe` expected + + describe "tokenize" $ do + it "treats 'pop' as a operator instead of identifier" $ + tokenize "pop" `shouldBe` Right [Operator Pop] + it "treats 'poop' as a identifier" $ + tokenize "poop" `shouldBe` Right [Identifier "poop"] + it "treats operator as themselves instead of identifiers" $ do + let ops = [Nop ..] + let input = map show ops + let expected = map (\o -> Right [Operator o]) ops + map tokenize input `shouldBe` expected + it "treats operator-like names (with 's' appended) as identifiers" $ do + let ops = [Nop ..] + let input = map ((++"s") . show) ops + let expected = map (\i-> Right [Identifier i]) input + map tokenize input `shouldBe` expected + it "treats '\n' as a newline instead of whitespace" $ + tokenize "\n" `shouldBe` Right [NewLine] + it "ignores comments" $ do + let input = "; this is some comment \n\ + \ ; this is another comment" + tokenize input `shouldBe` Right [NewLine] + it "interprets 'push 5\npush5' as operator, int literal, NL and identifier" $ do + let input = "push 5\npush5" + let expected = [Operator Push, IntLiteral 5, NewLine, Identifier "push5"] + tokenize input `shouldBe` Right expected + it "treats 'someId1234' as 'someId1234' identifier instead of 'someId' identifier and 1234 int" $ + tokenize "someId1234" `shouldBe` Right [Identifier "someId1234"] + it "accepts 'main: NL" $ + tokenize "main: \n" `shouldBe` Right [Identifier "main", Colon, NewLine] + it "accepts 'call &sum NL" $ + tokenize "call &sum \n" `shouldBe` Right [Operator Call, Ampersand, Identifier "sum", NewLine] + it "rejects '4push'" $ + tokenize "4push" `shouldBe` Left "Unknown token: 4push" + it "interprets example #1" $ do + let input = "main: ; here we define some main label\n\ + \ push 7 ; we push 7 to the stack\n\ + \ push 0x04 ; we push 4 to the stack\n\ + \ call &sum ; we call 'sum' subprogram\n\ + \ halt \n\ + \ \n\ + \ sum: add\n\ + \ ret" + let expected = [ Identifier "main", Colon, NewLine + , Operator Push, IntLiteral 7, NewLine + , Operator Push, IntLiteral 4, NewLine + , Operator Call, Ampersand, Identifier "sum", NewLine + , Operator Halt, NewLine + , NewLine + , Identifier "sum", Colon, Operator Add, NewLine + , Operator Ret + ] + tokenize input `shouldBe` Right expected \ No newline at end of file diff --git a/test/Spec.hs b/test/Spec.hs new file mode 100644 index 0000000..52ef578 --- /dev/null +++ b/test/Spec.hs @@ -0,0 +1 @@ +{-# OPTIONS_GHC -F -pgmF hspec-discover #-} \ No newline at end of file