From ce3af877418f61546499a33cd5f872ffede2ec89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Przemys=C5=82aw=20Pluta?=
 <noreply.git@mx.bartlomiejpluta.com>
Date: Fri, 5 Nov 2021 13:48:12 +0100
Subject: [PATCH] Add unit tests for Assembler tokenizers

---
 MVM.cabal                       |  28 ++-
 app/Assembler/Tokenizer.hs      |  26 ++-
 test/Assembler/TokenizerSpec.hs | 380 ++++++++++++++++++++++++++++++++
 test/Spec.hs                    |   1 +
 4 files changed, 423 insertions(+), 12 deletions(-)
 create mode 100644 test/Assembler/TokenizerSpec.hs
 create mode 100644 test/Spec.hs

diff --git a/MVM.cabal b/MVM.cabal
index 253141e..995415e 100644
--- a/MVM.cabal
+++ b/MVM.cabal
@@ -23,7 +23,9 @@ extra-source-files: CHANGELOG.md
 executable MVM
     main-is:          Main.hs
 
-    -- Modules included in this executable, other than Main.
+    ghc-options: -Wall
+    
+    -- Modules included in this executable, other than Main.    
     other-modules:
       VirtualMachine
       Assembler.Tokenizer
@@ -36,5 +38,29 @@ executable MVM
       base ^>=4.15.0.0,
       bytestring ^>=0.11.0.0,
       containers ^>=0.6.4.1
+      
     hs-source-dirs:   app
     default-language: Haskell2010
+  
+test-suite spec
+  type: exitcode-stdio-1.0
+  main-is: Spec.hs
+  hs-source-dirs:
+      app
+      test
+  ghc-options: -Wall
+  build-depends:
+      base ^>=4.15.0.0,
+      bytestring ^>=0.11.0.0,
+      containers ^>=0.6.4.1
+    , hspec ==2.*
+  other-modules:      
+      Assembler.TokenizerSpec  
+
+      VirtualMachine
+      Assembler.Tokenizer
+      Assembler.Parser
+      Util          
+        
+  default-language: Haskell2010
+  build-tool-depends: hspec-discover:hspec-discover == 2.*
diff --git a/app/Assembler/Tokenizer.hs b/app/Assembler/Tokenizer.hs
index 514cff4..2a6d1be 100644
--- a/app/Assembler/Tokenizer.hs
+++ b/app/Assembler/Tokenizer.hs
@@ -1,8 +1,6 @@
-module Assembler.Tokenizer (
-  Token(..),
-  tokenize
-) where
+module Assembler.Tokenizer where
 
+import qualified Data.List as List
 import qualified Data.Char as Char
 import qualified Data.Monoid as Monoid
 import qualified VirtualMachine as VM
@@ -40,9 +38,13 @@ operatorTokenizer :: VM.Op -> Tokenizer
 operatorTokenizer op input = keywordTokenizer False (U.toLowerCase . show $ op) (Operator op) input
 
 tokenizeOperators :: Tokenizer
-tokenizeOperators = anyTokenizer $ map operatorTokenizer [VM.Push ..]
+tokenizeOperators = anyTokenizer $ map operatorTokenizer ops
+  where
+    ops = List.sortBy cmp [VM.Nop ..]
+    cmp x y = (length . show) y `compare` (length . show) x
 
 tokenizeIdentifier :: Tokenizer
+tokenizeIdentifier [] = Nothing
 tokenizeIdentifier input@(x:_) = if null identifier || (not . Char.isAlpha) x
   then Nothing
   else Just $ TokenizeResult (Identifier identifier) (length identifier)
@@ -105,19 +107,21 @@ tokenizeComment (x:xs) = if x == ';'
 
 type SeparatorPredicate = Char -> Bool
 sepTokenizer :: SeparatorPredicate -> Tokenizer -> Tokenizer
+sepTokenizer _ _ [] = Nothing
 sepTokenizer pred tokenizer input = do
-  (TokenizeResult token consumed) <- tokenizer input
+  result@(TokenizeResult token consumed) <- tokenizer input
   let next = drop consumed input
-  let (isSep, consumed') = if null next 
+  let (isSep, _) =  if null next 
                            then (True, 0)
                            else if pred . head $ next
                              then (True, 1)
                              else (False, 0)
   if isSep
-  then return $ TokenizeResult token (consumed + consumed')
+  then return $ result
   else Nothing
 
 anyTokenizer :: [Tokenizer] -> Tokenizer
+anyTokenizer _ [] = Nothing
 anyTokenizer tokenizers input = Monoid.getFirst . Monoid.mconcat . map Monoid.First $ sequenceA tokenizers input
 
 tokenize :: String -> Either String [Token]
@@ -138,9 +142,9 @@ tokenizers = anyTokenizer
   [ keywordTokenizer False "\n" NewLine
   , tokenizeWhitespace
   , tokenizeComment
-  , tokenizeOperators
-  , tokenizeHex
-  , tokenizeDecimal
+  , sepTokenizer Char.isSpace tokenizeOperators
+  , sepTokenizer Char.isSpace tokenizeHex
+  , sepTokenizer Char.isSpace tokenizeDecimal
   , tokenizeIdentifier
   , keywordTokenizer False ":" Colon
   , keywordTokenizer False "&" Ampersand
diff --git a/test/Assembler/TokenizerSpec.hs b/test/Assembler/TokenizerSpec.hs
new file mode 100644
index 0000000..f28a43c
--- /dev/null
+++ b/test/Assembler/TokenizerSpec.hs
@@ -0,0 +1,380 @@
+module Assembler.TokenizerSpec where
+
+import Test.Hspec
+import Test.Hspec.QuickCheck
+import Numeric (showHex)
+import Data.Char (ord)
+
+import Assembler.Tokenizer
+import VirtualMachine
+
+success :: Token -> Int -> Maybe TokenizeResult
+success token consumed = Just $ TokenizeResult token consumed
+
+spec :: Spec
+spec = do
+  describe "keywordTokenizer" $ do 
+    it "supports truncated input" $
+      keywordTokenizer True "hey" NewLine "hey" `shouldBe` success NewLine 3
+    it "supports non-truncated input" $
+      keywordTokenizer True "hey" NewLine "heyjude" `shouldBe` success NewLine 3
+    it "supports case sensitivity" $
+      keywordTokenizer True "hey" NewLine "heYjude" `shouldBe` Nothing
+    it "supports case insensitivity" $
+      keywordTokenizer False "hey" NewLine "heYjude" `shouldBe` success NewLine 3    
+    it "returns correct token" $
+      keywordTokenizer True "hey" Colon "heyjude" `shouldBe` success Colon 3
+    it "returns Nothing if input does not match" $
+      keywordTokenizer True "hey" Colon "xheyjude" `shouldBe` Nothing
+    it "supports empty input" $ 
+      keywordTokenizer True "hey" Colon "" `shouldBe` Nothing
+
+  describe "operatorTokenizer" $ do
+    it "returns proper operator when given a truncated input" $
+      operatorTokenizer Push "push" `shouldBe` success (Operator Push) 4
+    it "returns proper operator when given a nontruncated input" $
+      operatorTokenizer Pop "pops" `shouldBe` success (Operator Pop) 3
+    it "returns Nothing if input does not match" $
+      operatorTokenizer Pop "poop" `shouldBe` Nothing
+    it "supports empty input" $ 
+      operatorTokenizer Call "" `shouldBe` Nothing
+
+  describe "tokenizeOperators" $ do
+    it "autodetects and returns proper operators" $ do
+      let ops = [Nop ..]
+      let input = map show ops
+      let expected = map (\o -> success (Operator o) (length . show $ o)) ops
+      map tokenizeOperators input `shouldBe` expected
+    it "is case insensitive" $ do
+      let input = ["jmp", "CALL", "pUsH"]
+      let expected = [ success (Operator Jmp) 3
+                   , success (Operator Call) 4
+                   , success (Operator Push) 4
+                   ]
+      map tokenizeOperators input `shouldBe` expected
+    it "rejects other input" $
+      tokenizeOperators "some unsupported input" `shouldBe` Nothing
+    it "supports empty input" $ 
+      tokenizeOperators "" `shouldBe` Nothing
+
+  describe "tokenizeIdentifier" $ do
+    it "parses correct identifier" $
+      tokenizeIdentifier "someId" `shouldBe` success (Identifier "someId") 6
+    it "parses correct identifier with numbers" $
+      tokenizeIdentifier "someId14" `shouldBe` success (Identifier "someId14") 8
+    it "parses correct identifier with underscores" $
+      tokenizeIdentifier "some_Id" `shouldBe` success (Identifier "some_Id") 7      
+    it "disallows to start identifier with underscore" $    
+      tokenizeIdentifier "_someId" `shouldBe` Nothing
+    it "disallows to start identifier with digit" $    
+      tokenizeIdentifier "5someId" `shouldBe` Nothing
+    it "supports empty input" $ 
+      tokenizeIdentifier "" `shouldBe` Nothing
+
+  describe "tokenizeWhitespace" $ do
+    it "parses space" $
+      tokenizeWhitespace " " `shouldBe` success WhiteSpace 1
+    it "parses tab" $
+      tokenizeWhitespace "\t" `shouldBe` success WhiteSpace 1
+    it "parses newline" $
+      tokenizeWhitespace "\n" `shouldBe` success WhiteSpace 1
+    it "parses CR" $
+      tokenizeWhitespace "\r" `shouldBe` success WhiteSpace 1
+    it "rejects non-whitespace chars" $ do
+      let input = map (\x -> [x]) $ ['a' .. 'z'] ++ ['A' .. 'Z'] ++ ['0' .. '9']
+      let expected = take (length input) . repeat $ Nothing
+      map tokenizeWhitespace input `shouldBe` expected
+    it "supports empty input" $ 
+      tokenizeIdentifier "" `shouldBe` Nothing      
+  
+  describe "tokenizeDecimal" $ do
+    it "parses numbers from 0 to 65535" $ do
+      let nums = [0 .. 65535]
+      let input = map show nums
+      let expected = map (\n -> success (IntLiteral n) (length . show $ n)) nums
+      map tokenizeDecimal input `shouldBe` expected
+    it "does not support negative numbers" $ do
+      let nums = [-1, -2 .. -65535] :: [Integer]
+      let input = map show nums
+      let expected = take (length nums) . repeat $ Nothing
+      map tokenizeDecimal input `shouldBe` expected
+    it "rejects other input" $
+      tokenizeDecimal "some unsupported input" `shouldBe` Nothing   
+    it "supports empty input" $ 
+      tokenizeDecimal "" `shouldBe` Nothing             
+    
+  describe "tokenizeHex" $ do
+    it "parses numbers from 0x0 to 0xFFFF" $ do
+      let nums = [0 .. 0xFFFF]
+      let input = map (("0x"++) . (flip showHex "")) nums
+      let expected = map (\n -> success (IntLiteral n) (length . ("0x"++) . (flip showHex "") $ n)) nums
+      map tokenizeHex input `shouldBe` expected
+    it "does not support negative numbers" $ do
+      let nums = [0 .. 0xFFFF] :: [Integer]
+      let input = map (("-0x"++) . (flip showHex "")) nums
+      let expected = take (length nums) . repeat $ Nothing
+      map tokenizeHex input `shouldBe` expected      
+    it "accepts left-padded number" $
+      tokenizeHex "0x0010" `shouldBe` success (IntLiteral 16) 6
+    it "rejects other input" $
+      tokenizeHex "some unsupported input" `shouldBe` Nothing   
+    it "rejects '0'" $
+      tokenizeHex "0" `shouldBe` Nothing
+    it "rejects '0x'" $
+      tokenizeHex "0x" `shouldBe` Nothing      
+    it "supports empty input" $ 
+      tokenizeHex "" `shouldBe` Nothing   
+
+  describe "tokenizeChar" $ do
+    it "parses letters literals" $ do
+      let chars = ['a' .. 'z'] ++ ['A' .. 'Z']
+      let input = map (\c -> "'" ++ [c] ++ "'") chars
+      let expected = map (\c -> success (IntLiteral (ord c)) 3) chars
+      map tokenizeChar input `shouldBe` expected
+    it "parses digits literals" $ do
+      let chars = ['0' .. '9']
+      let input = map (\c -> "'" ++ [c] ++ "'") chars
+      let expected = map (\c -> success (IntLiteral (ord c)) 3) chars
+      map tokenizeChar input `shouldBe` expected      
+    it "parses regular symbols literals" $ do
+      let chars = "!@#$%^&*()_+-=[]{};:|,/?<>\""
+      let input = map (\c -> "'" ++ [c] ++ "'") chars
+      let expected = map (\c -> success (IntLiteral (ord c)) 3) chars
+      map tokenizeChar input `shouldBe` expected    
+    it "parses escape sequences literals" $ do      
+      let input = [ "'\\n'"
+                  , "'\\t'"
+                  , "'\\v'"
+                  , "'\\b'"
+                  , "'\\r'"
+                  , "'\\f'"
+                  , "'\\a'"
+                  , "'\\\\'"
+                  , "'\\''"
+                  , "'\\0'"
+                  ]
+      let expected = map (flip success 4 . IntLiteral) [10, 9, 11, 8, 13, 12, 7, 92, 39, 0]
+      map tokenizeChar input `shouldBe` expected
+    it "consumes double-quote without escaping" $
+      tokenizeChar "'\"'" `shouldBe` success (IntLiteral 34) 3
+    it "consumes double-quote with escaping" $
+      tokenizeChar "'\\\"'" `shouldBe` success (IntLiteral 34) 4
+    it "rejects non-single char literals" $
+      tokenizeChar "'ab'" `shouldBe` Nothing
+    it "rejects non-closed char literals" $
+      tokenizeChar "'a" `shouldBe` Nothing
+    it "rejects invalid escape sequences" $ 
+      tokenizeChar "'\\x'" `shouldBe` Nothing
+    it "rejects empty quotes" $
+      tokenizeChar "''" `shouldBe` Nothing
+    it "rejects single quote" $
+      tokenizeChar "'" `shouldBe` Nothing
+    it "supports empty input" $
+      tokenizeChar "" `shouldBe` Nothing
+
+  describe "tokenizeString" $ do
+    it "parses correct string" $
+      tokenizeString "\"Hello, world!\"" `shouldBe` success (StringLiteral "Hello, world!") 15
+    it "parses empty string" $
+      tokenizeString "\"\"" `shouldBe` success (StringLiteral "") 2
+    it "supports digits in strings" $ do
+      let str = "34823843dedsef231542c4f324c24234ffsef234g"
+      let len = length str + 2
+      let input = "\"" ++ str ++ "\""
+      tokenizeString input `shouldBe` success (StringLiteral str) len
+    it "supports regular symbols in strings" $ do
+      let str = "!@2#$%9^&*(1)_s+2-=[2h6sh]t{};:'e|<>,./?"
+      let len = length str + 2
+      let input = "\"" ++ str ++ "\""
+      tokenizeString input `shouldBe` success (StringLiteral str) len  
+    it "supports escape sequences literals" $ do
+      pendingWith "We need probably to fix tokenizeString since the following test fails"
+      -- TODO:
+      -- let str = "\\n\\t\\v\\b\\r\\f\\a\\\\\\\"\\0"
+      -- let len = length str + 2
+      -- let input = "\"" ++ str ++ "\""
+      -- let expected = map chr [10, 9, 11, 8, 13, 12, 7, 92, 34, 0]
+      -- tokenizeString input `shouldBe` success (StringLiteral expected) len  
+    it "consumes single quotes without escaping" $
+      tokenizeString "\"'''''''''\"" `shouldBe` success (StringLiteral "'''''''''") 11
+    it "consumes single quotes with escaping " $
+      tokenizeString "\"\\'\\'\\'\\'\\'\\'\"" `shouldBe` success (StringLiteral "''''''") 14
+    it "rejects invalid escape sequences" $
+      tokenizeString "\"\\x\"" `shouldBe` Nothing
+    it "rejects non-closed string literals" $
+      tokenizeString "\"hello, world!" `shouldBe` Nothing
+    it "rejects single double-quote" $
+      tokenizeString "\"" `shouldBe` Nothing
+    it "rejects multilined strings" $
+      tokenizeString "\"first line\nsecond line\"" `shouldBe` Nothing
+    it "supports empty input" $
+      tokenizeString "" `shouldBe` Nothing      
+    
+  describe "tokenizeComment" $ do
+    it "properly consumes comment" $ 
+      tokenizeComment ";some comment\n" `shouldBe` success (Comment "some comment") 13
+    it "properly consumes comment with whitespace padding" $ 
+      tokenizeComment ";  \t  some comment  \t \n  \t" `shouldBe` success (Comment "  \t  some comment  \t ") 22    
+    it "does not treat the input as a comment if it does not start with semicolon" $
+      tokenizeComment "some not valid comment\n" `shouldBe` Nothing
+    it "expands the comment till the end of the line" $ 
+      tokenizeComment "; some comment ; push 4 push 10\nadd" `shouldBe` success (Comment " some comment ; push 4 push 10") 31
+    it "parses the comment at the end of the input" $ 
+      tokenizeComment "; some comment " `shouldBe` success (Comment " some comment ") 15
+    it "supports empty input" $
+      tokenizeComment "" `shouldBe` Nothing
+
+  describe "sepTokenizer" $ do
+    it "produces the token only when the separator is present" $ do
+      let input = "abc-"
+      let expected = success Colon 3
+      let tokenizer _ = expected
+      sepTokenizer ('-'==) tokenizer input `shouldBe` expected
+    it "produces the token if its the end of input even if separator is not present" $ do
+      let input = "abc"
+      let expected = success Colon 3
+      let tokenizer _ = expected
+      sepTokenizer ('-'==) tokenizer input `shouldBe` expected
+    it "does not produce any token when the separator is not present" $ do
+      let input = "abcd"
+      let tokenizer _ = success Colon 3
+      sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing
+    it "does not produce any token when the space is present instead" $ do
+      let input = "abc "
+      let tokenizer _ = success Colon 3
+      sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing     
+    it "does not change the number of consumed chars even though it's checking the separator presence" $ do
+      let input = "abc-"
+      let expected = success Colon 3
+      let tokenizer _ = expected
+      let (Just (TokenizeResult _ consumedChars)) = sepTokenizer ('-'==) tokenizer input
+      consumedChars `shouldBe` 3
+    it "supports empty input irrespective of wrapped tokenizer" $ do
+      let input = ""      
+      let tokenizer _ = success Colon 3 -- MOCK: tokenizer returns Just even though the input is empty
+      sepTokenizer ('-'==) tokenizer input `shouldBe` Nothing  
+    
+  describe "anyTokenizer" $ do
+    it "returns the token if at least one subtokenizer produce that" $ do
+      let values = [ success Ampersand 1
+                   , Nothing
+                   , Nothing
+                   , Nothing
+                   , Nothing 
+                   ]
+      let t = map (\x -> (\_ -> x)) values
+      anyTokenizer t "some not important input" `shouldBe` success Ampersand 1
+    it "returns the token of the first matching subtokenizer" $ do
+      let values = [ Nothing
+                   , Nothing
+                   , success (IntLiteral 4) 1
+                   , Nothing
+                   , Nothing 
+                   , success (StringLiteral "not me") 8
+                   , Nothing
+                   , success (StringLiteral "me neither") 12
+                   , Nothing
+                   , success Ampersand 1
+                   , Nothing
+                   , Nothing
+                   , success Colon 1
+                   , Nothing
+                   ]
+      let t = map (\x -> (\_ -> x)) values
+      anyTokenizer t "some not important input" `shouldBe` success (IntLiteral 4) 1
+    it "returns Nothing if no one of the tokenizers matches the input" $ do
+      let values = [ Nothing
+                   , Nothing
+                   , Nothing
+                   , Nothing
+                   , Nothing 
+                   ]
+      let t = map (\x -> (\_ -> x)) values
+      anyTokenizer t "some not important input" `shouldBe` Nothing
+    it "always returns Nothing if no tokenizers are defined" $
+      anyTokenizer [] "some not important input" `shouldBe` Nothing  
+    it "supports empty input irrespective of wrapped tokenizers" $ do
+      let input = ""
+      let values = [ success Ampersand 1
+                   , success Colon 1
+                   , success (IntLiteral 3) 1
+                   , success (Operator Push) 4
+                   ]
+      let t = map (\x -> (\_ -> x)) values
+      anyTokenizer t input `shouldBe` Nothing     
+
+  describe "tokenFilter" $ do
+    it "filters out whitespaces and comments" $ do
+      let tokens = [ Operator Push
+                   , IntLiteral 4
+                   , Comment "here is the identifier"                   
+                   , Identifier "someId"
+                   , WhiteSpace                   
+                   , Colon
+                   , WhiteSpace                   
+                   , Ampersand
+                   , NewLine
+                   , WhiteSpace
+                   , Comment "some comment"                   
+                   ]
+      let expected = [ Operator Push
+                     , IntLiteral 4
+                     , Identifier "someId"
+                     , Colon
+                     , Ampersand
+                     , NewLine
+                     ]                   
+      filter tokenFilter tokens `shouldBe` expected
+
+  describe "tokenize" $ do
+    it "treats 'pop' as a operator instead of identifier" $
+      tokenize "pop" `shouldBe` Right [Operator Pop]
+    it "treats 'poop' as a identifier" $
+      tokenize "poop" `shouldBe` Right [Identifier "poop"] 
+    it "treats operator as themselves instead of identifiers" $ do
+      let ops = [Nop ..]
+      let input = map show ops
+      let expected = map (\o -> Right [Operator o]) ops 
+      map tokenize input `shouldBe` expected
+    it "treats operator-like names (with 's' appended) as identifiers" $ do
+      let ops = [Nop ..]
+      let input = map ((++"s") . show) ops
+      let expected = map (\i-> Right [Identifier i]) input 
+      map tokenize input `shouldBe` expected      
+    it "treats '\n' as a newline instead of whitespace" $
+      tokenize "\n" `shouldBe` Right [NewLine]
+    it "ignores comments" $ do
+      let input = "; this is some comment \n\
+                 \ ; this is another comment"
+      tokenize input `shouldBe` Right [NewLine]
+    it "interprets 'push 5\npush5' as operator, int literal, NL and identifier" $ do
+      let input = "push 5\npush5"
+      let expected = [Operator Push, IntLiteral 5, NewLine, Identifier "push5"]
+      tokenize input `shouldBe` Right expected
+    it "treats 'someId1234' as 'someId1234' identifier instead of 'someId' identifier and 1234 int" $
+      tokenize "someId1234" `shouldBe` Right [Identifier "someId1234"]
+    it "accepts 'main: NL" $
+      tokenize "main: \n" `shouldBe` Right [Identifier "main", Colon, NewLine]
+    it "accepts 'call &sum NL" $
+      tokenize "call &sum \n" `shouldBe` Right [Operator Call, Ampersand, Identifier "sum", NewLine]      
+    it "rejects '4push'" $
+      tokenize "4push" `shouldBe` Left "Unknown token: 4push"    
+    it "interprets example #1" $ do
+      let input = "main: ; here we define some main label\n\
+                \      push 7    ; we push 7 to the stack\n\
+                \      push 0x04 ; we push 4 to the stack\n\
+                \      call &sum ; we call 'sum' subprogram\n\
+                \      halt \n\
+                \ \n\
+                \ sum: add\n\
+                \      ret"
+      let expected = [ Identifier "main", Colon, NewLine
+                     , Operator Push, IntLiteral 7, NewLine
+                     , Operator Push, IntLiteral 4, NewLine
+                     , Operator Call, Ampersand, Identifier "sum", NewLine
+                     , Operator Halt, NewLine
+                     , NewLine
+                     , Identifier "sum", Colon, Operator Add, NewLine
+                     , Operator Ret
+                     ]    
+      tokenize input `shouldBe` Right expected        
\ No newline at end of file
diff --git a/test/Spec.hs b/test/Spec.hs
new file mode 100644
index 0000000..52ef578
--- /dev/null
+++ b/test/Spec.hs
@@ -0,0 +1 @@
+{-# OPTIONS_GHC -F -pgmF hspec-discover #-}
\ No newline at end of file