From 043516a55da3b67445c62665809e8c791a7af6d5 Mon Sep 17 00:00:00 2001 From: wojpok Date: Tue, 30 Dec 2025 12:06:31 +0100 Subject: [PATCH 01/14] effects and interface --- src/Parser/Lexer.fram | 195 +++++++++++++++++++++++++++++++++++++++++ src/Parser/Tokens.fram | 90 +++++++++++++++++++ 2 files changed, 285 insertions(+) create mode 100644 src/Parser/Lexer.fram create mode 100644 src/Parser/Tokens.fram diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram new file mode 100644 index 0000000..3871323 --- /dev/null +++ b/src/Parser/Lexer.fram @@ -0,0 +1,195 @@ +import open Tokens +import Map as M +import List as L +import open Testing +import String as S + +# ----------------------------------------------------------------------------- +# Possible additions to stdlib +method mem (str : String) (chr : Char) = + match str.findChar chr with + | Some _ => True + | None => False + end + +method startsWith {?pos : Int} (str : String) (prefix : String) = + if str.length < prefix.length then + False + else + (let rec iter i = + if i >= prefix.length then + True + else if (prefix.get i != str.get i) then + False + else + iter (i + 1) + in + iter (pos.unwrapOr 0)) + +# ----------------------------------------------------------------------------- +# Fatal errors + +data Fatal E = {raise : {X} -> String ->[E] X} + +parameter E_Fatal : effect +parameter ~err : Fatal E_Fatal + +let fatalH = + handler Fatal + { effect raise msg = Err msg } + return x => Ok x + end + +let raise str = ~err.raise str + +# ----------------------------------------------------------------------------- +# Reading buffer + +data Caret = {idx : Int, str : String} +let mkCaret str = Caret {idx = 0, str} + +method moveBy (Caret {idx, str}) len = Caret {idx = idx + len, str} + +method getChar (Caret {idx, str}) = + if idx < str.length then + Some (str.get idx) + else if idx == str.length then + None + else + ~err.raise "getChar - out of position" + +data LexBuf E = + { seek : Unit ->[E] Unit + , curr : Unit ->[E] Option Char + , tryMatch : String ->[E] Bool + } + +let lexBufH caret = + handler LexBuf + { effect seek () / r = fn (c : Caret) => + r () (c.moveBy 1) + , effect curr () / r = fn (c : Caret) => + r c.getChar c + , effect tryMatch pattern / r = fn (c : Caret) => + if c.str.startsWith {pos=c.idx} pattern then + (let c = c.moveBy (pattern.length) in + r True c) + else + r False c + } + return x => fn c => (x, c) + finally c => c caret + end + +parameter E_LexBuf +parameter ~lb : LexBuf E_LexBuf + +# ----------------------------------------------------------------------------- +# Position tracker + +data Pos = {line : Int, col : Int} + +method moveBy (Pos {line, col}) len = Pos {line, col = col + len} +method newLine (Pos {line}) = Pos {line = line + 1, col = 0} + +data PosTracker E = + { moveBy : Int ->[E] Unit + , newLine : Unit ->[E] Unit + } + +let posTrackerH pos = + handler PosTracker + { effect moveBy len / r = fn (pos : Pos) => + r () (pos.moveBy len) + , effect newLine () / r = fn (pos : Pos) => + r () (pos.newLine) + } + return x => fn p => (x, p) + finally c => c pos + end + +parameter E_PosTracker +parameter ~pt : PosTracker E_PosTracker + +# ----------------------------------------------------------------------------- +# Bracket Stack + + +data BracketType = + | BT_Regular + | BT_Interp + +data CBracketTracker E = + { push : BracketType ->[E] Unit + , pop : Unit ->[E] BracketType + } + +data BracketStack = + { openBrackets : Int + , bracketStack : List Int + } + +method pushRegular (BracketStack {openBrackets, bracketStack}) = + BracketStack {openBrackets = openBrackets + 1, bracketStack} + +method pushInterp (BracketStack {openBrackets, bracketStack}) = + BracketStack {openBrackets = 0, bracketStack = openBrackets :: bracketStack} + +method popBracket (BracketStack {openBrackets, bracketStack}) = + if openBrackets == 0 then + match bracketStack with + | openBrackets :: bracketStack => + (BT_Interp, BracketStack {openBrackets, bracketStack}) + | _ => raise "Bracket mismatch" + end + else + (BT_Regular, BracketStack {openBrackets = openBrackets - 1, bracketStack}) + +let emptyStack = BracketStack {openBrackets = 0, bracketStack = []} + +let cbracketTrackerH stack = + handler CBracketTracker + { effect push ctype / r = fn (stack : BracketStack) => + match ctype with + | BT_Regular => r () stack.pushRegular + | BT_Interp => r () stack.pushInterp + end + , effect pop () / r = fn (stack : BracketStack) => + let (ty, stack) = stack.popBracket in + r ty stack + } + return x => fn s => (x, s) + finally c => c stack + end + +parameter E_Brackets : effect +parameter ~br : CBracketTracker E_Brackets + +# ----------------------------------------------------------------------------- +# Public interface + +abstr data LexerState = + { caret : Caret + , pos : Pos + , stack : BracketStack + } + +pub let mkLexerState str = LexerState + { caret = Caret {str, idx = 0} + , pos = Pos {line = 0, col = 0} + , stack = emptyStack + } + +let getTok (st : LexerState) = + let res = + handle ~err / E_Fatal with fatalH in + handle ~lb / E_LexBuf with lexBufH st.caret in + handle ~pt / E_PosTracker with posTrackerH st.pos in + handle ~br / E_Brackets with cbracketTrackerH st.stack in + () + in + match res with + | Err s => Err s + | Ok (((tok, stack), pos), caret) => + Ok (tok, LexerState {stack, pos, caret}) + end diff --git a/src/Parser/Tokens.fram b/src/Parser/Tokens.fram new file mode 100644 index 0000000..5f2c286 --- /dev/null +++ b/src/Parser/Tokens.fram @@ -0,0 +1,90 @@ +{# This file is part of Fram, released under MIT license. + # See LICENSE for details. + #} + +## # Tokens + +pub data Token = + # Identifiers + | LID of String + | UID of String + | TLID of String + | QLID of String + + # Operators + | OP_0 of String + | OP_20 of String + | OP_30 of String + | OP_40 of String + | OP_50 of String + | OP_60 of String + | OP_70 of String + | OP_80 of String + | OP_90 of String + | OP_100 of String + + # Data + | NUM of Int + | NUM64 of Int64 + | STR of String + | CSTR of String + | BSTR of String + | ESTR of String + | CHR of Char + + # Brackets + | BR_OPN + | BR_CLS + | SBR_OPN + | SBR_CLS + | CBR_OPN + | CBR_CLS + | ATTR_OPEN + + # Special symbols + | ARROW + | EFF_ARROW + | ARROW2 + | BAR + | COLON + | COMMA + | DOT + | EQ + | SEMICOLON2 + | SLASH + | GT_DOT + + # keywords + | KW_ABSTR + | KW_AS + | KW_DATA + | KW_EFFECT + | KW_ELSE + | KW_END + | KW_EXTERN + | KW_FINALLY + | KW_FN + | KW_HANDLE + | KW_HANDLER + | KW_IF + | KW_IMPORT + | KW_IN + | KW_LABEL + | KW_LET + | KW_MATCH + | KW_METHOD + | KW_MODULE + | KW_OF + | KW_OPEN + | KW_PARAMETER + | KW_PUB + | KW_REC + | KW_RETURN + | KW_SECTION + | KW_THEN + | KW_TYPE + | KW_WITH + | UNDERSCORE + | EOF + +pub data Tok = {token : Token, pos : Int} From df090a35f62d5384eb6176ab3cd810c0999bc2ae Mon Sep 17 00:00:00 2001 From: wojpok Date: Tue, 30 Dec 2025 12:22:56 +0100 Subject: [PATCH 02/14] basic operator parser --- src/Parser/Lexer.fram | 153 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 3871323..1bd4388 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -165,6 +165,159 @@ let cbracketTrackerH stack = parameter E_Brackets : effect parameter ~br : CBracketTracker E_Brackets +# ----------------------------------------------------------------------------- +# High level utility functions + +let popChar () = + match ~lb.curr () with + | None => None + | Some ch => + ~lb.seek (); + if ch == '\n' then + ~pt.newLine () + else + ~pt.moveBy 1; + Some ch + end + +# ----------------------------------------------------------------------------- +# Char type predicates + +let isWhite (chr : Char) = + ('\x09' <= chr && chr <= '\r') || chr == ' ' || chr == '\t' + +let isDigit (chr : Char) = + ('0' <= chr && chr <= '9') + +let isHexDigit (chr : Char) = + isDigit chr || ('a' <= chr && chr <= 'f') || ('A' <= chr && chr <= 'F') + +let isOpChar = "<>&$?!@^+-~*%;,=|:./".mem + +let isLidStart (chr : Char) = + ('a' <= chr && chr <= 'z') || chr =='-' + +let isUidStart (chr : Char) = + ('A' <= chr && chr <= 'Z') + +let isVarChar (char : Char) = + isLidStart char || isUidStart char || isDigit char || (char == '\'') + +# ----------------------------------------------------------------------------- +# Lexer + +let Map {module SMap} = M.make {Key = String} + +let keywords = + let kws = + [ ("abstr", KW_ABSTR) + , ("as", KW_AS) + , ("data", KW_DATA) + , ("effect", KW_EFFECT) + , ("else", KW_ELSE) + , ("end", KW_END) + , ("extern", KW_EXTERN) + , ("finally", KW_FINALLY) + , ("fn", KW_FN) + , ("handle", KW_HANDLE) + , ("handler", KW_HANDLER) + , ("if", KW_IF) + , ("import", KW_IMPORT) + , ("in", KW_IN) + , ("label", KW_LABEL) + , ("let", KW_LET) + , ("match", KW_MATCH) + , ("method", KW_METHOD) + , ("module", KW_MODULE) + , ("of", KW_OF) + , ("open", KW_OPEN) + , ("parameter", KW_PARAMETER) + , ("pub", KW_PUB) + , ("rec", KW_REC) + , ("return", KW_RETURN) + , ("section", KW_SECTION) + , ("then", KW_THEN) + , ("type", KW_TYPE) + , ("with", KW_WITH) + , ("_", UNDERSCORE) + ] + in + kws.foldLeft (fn (m : SMap.T _) (kw, tok) => m.add kw tok) SMap.empty + +let tokenizeIdent ident = + match keywords.find ident with + | Some kw => kw + | None => LID ident + end + +let operators = + let ops = + [ ("->", ARROW) + , ("|", BAR) + , ("=>", ARROW2) + , ("->>", EFF_ARROW) + , ("|", BAR) + , (":", COLON) + , (",", COMMA) + , (".", DOT) + , ("=", EQ) + , (";;", SEMICOLON2) + , ("/", SLASH) + , (">.", GT_DOT) + ] + in + ops.foldLeft (fn (m : SMap.T _) (kw, op) => m.add kw op) SMap.empty + +let tokenizeOp str = + match operators.find str with + | Some x => x + | None => + let isLong = str.length >= 2 in + let longMatch chr = isLong && (str.get 1 == chr) + let fst = str.get 0 in + if (fst == '?' || fst == '~') && not isLong then + raise "Disallowed operator" + else if fst == ';' then + OP_0 str + else if fst == '<' && longMatch '-' then + OP_20 str + else if fst == ':' && longMatch '=' then + OP_20 str + else if fst == ',' then + OP_30 str + else if fst == '|' && longMatch '|' then + OP_40 str + else if fst == '&' && longMatch '&' then + OP_50 str + else if "!=<>|&$?".mem fst then + OP_60 str + else if "@:^".mem fst then + OP_70 str + else if "+-~".mem fst then + OP_80 str + else if fst == '*' && longMatch '*' then + OP_100 str + else if "*/%.".mem fst then + OP_90 str + else + raise "Internal operator error" + end + +let rec token () = + match popChar () with + | None => EOF + | Some chr => + if isWhite chr || chr == '\n' then token () + # else if chr == '{' && tryGetChar '#' then KW_EFFECT + else if chr == '(' then BR_OPN + else if chr == ')' then BR_CLS + else if chr == '[' then SBR_OPN + else if chr == ']' then SBR_CLS + else if chr == '{' then CBR_OPN + else if chr == '}' then CBR_CLS + else raise "TODO" + end + # ----------------------------------------------------------------------------- # Public interface From efcb47849ded3b2ee2083332be436e3296475415 Mon Sep 17 00:00:00 2001 From: wojpok Date: Tue, 30 Dec 2025 12:39:47 +0100 Subject: [PATCH 03/14] basic tests --- src/Parser/Lexer.fram | 5 +- src/TestAll.fram | 1 + src/Tests/Lexer.fram | 116 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 src/TestAll.fram create mode 100644 src/Tests/Lexer.fram diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 1bd4388..88d4c7e 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -1,7 +1,6 @@ import open Tokens import Map as M import List as L -import open Testing import String as S # ----------------------------------------------------------------------------- @@ -333,13 +332,13 @@ pub let mkLexerState str = LexerState , stack = emptyStack } -let getTok (st : LexerState) = +pub let getTok (st : LexerState) = let res = handle ~err / E_Fatal with fatalH in handle ~lb / E_LexBuf with lexBufH st.caret in handle ~pt / E_PosTracker with posTrackerH st.pos in handle ~br / E_Brackets with cbracketTrackerH st.stack in - () + token () in match res with | Err s => Err s diff --git a/src/TestAll.fram b/src/TestAll.fram new file mode 100644 index 0000000..a832f7c --- /dev/null +++ b/src/TestAll.fram @@ -0,0 +1 @@ +import Tests/Lexer diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram new file mode 100644 index 0000000..1d491a8 --- /dev/null +++ b/src/Tests/Lexer.fram @@ -0,0 +1,116 @@ +import open Testing +import open Parser/Tokens +import open Parser/Lexer + +method equal (t1 : Token) (t2 : Token) = + match (t1, t2) with + # Identifiers + | LID t1, LID t2 => t1 == t2 + | UID t1, UID t2 => t1 == t2 + | TLID t1, TLID t2 => t1 == t2 + | QLID t1, QLID t2 => t1 == t2 + + # Operators + | OP_0 o1, OP_0 o2 => o1 == o2 + | OP_20 o1, OP_20 o2 => o1 == o2 + | OP_30 o1, OP_30 o2 => o1 == o2 + | OP_40 o1, OP_40 o2 => o1 == o2 + | OP_50 o1, OP_50 o2 => o1 == o2 + | OP_60 o1, OP_60 o2 => o1 == o2 + | OP_70 o1, OP_70 o2 => o1 == o2 + | OP_80 o1, OP_80 o2 => o1 == o2 + | OP_90 o1, OP_90 o2 => o1 == o2 + | OP_100 o1, OP_100 o2 => o1 == o2 + + # Data + | NUM v1, NUM v2 => v1 == v2 + | NUM64 v1, NUM64 v2 => v1 == v2 + | STR v1, STR v2 => v1 == v2 + | CSTR v1, CSTR v2 => v1 == v2 + | BSTR v1, BSTR v2 => v1 == v2 + | ESTR v1, ESTR v2 => v1 == v2 + | CHR v1, CHR v2 => v1 == v2 + + # Brackets + | BR_OPN, BR_OPN => True + | BR_CLS, BR_CLS => True + | SBR_OPN, SBR_OPN => True + | SBR_CLS, SBR_CLS => True + | CBR_OPN, CBR_OPN => True + | CBR_CLS, CBR_CLS => True + | ATTR_OPEN, ATTR_OPEN => True + + # Special symbols + | ARROW, ARROW => True + | EFF_ARROW, EFF_ARROW => True + | ARROW2, ARROW2 => True + | BAR, BAR => True + | COLON, COLON => True + | COMMA, COMMA => True + | DOT, DOT => True + | EQ, EQ => True + | SEMICOLON2, SEMICOLON2 => True + | SLASH, SLASH => True + | GT_DOT, GT_DOT => True + + # keywords + | KW_ABSTR , KW_ABSTR => True + | KW_AS , KW_AS => True + | KW_DATA , KW_DATA => True + | KW_EFFECT , KW_EFFECT => True + | KW_ELSE , KW_ELSE => True + | KW_END , KW_END => True + | KW_EXTERN, KW_EXTERN=> True + | KW_FINALLY , KW_FINALLY => True + | KW_FN , KW_FN => True + | KW_HANDLE , KW_HANDLE => True + | KW_HANDLER , KW_HANDLER => True + | KW_IF , KW_IF => True + | KW_IMPORT, KW_IMPORT=> True + | KW_IN , KW_IN => True + | KW_LABEL , KW_LABEL => True + | KW_LET , KW_LET => True + | KW_MATCH , KW_MATCH => True + | KW_METHOD , KW_METHOD => True + | KW_MODULE , KW_MODULE => True + | KW_OF , KW_OF => True + | KW_OPEN, KW_OPEN=> True + | KW_PARAMETER , KW_PARAMETER => True + | KW_PUB, KW_PUB=> True + | KW_REC, KW_REC=> True + | KW_RETURN , KW_RETURN => True + | KW_SECTION , KW_SECTION => True + | KW_THEN , KW_THEN => True + | KW_TYPE, KW_TYPE=> True + | KW_WITH, KW_WITH=> True + | UNDERSCORE, UNDERSCORE=> True + | EOF, EOF=> True + + | _, _ => False + end + +let unwrapToks str = + let st = mkLexerState str in + let rec iter st = + match getTok st with + | Err _ => [] # No EOF! + | Ok (EOF, _) => [EOF] + | Ok (tok, st) => tok :: iter st + end + in + iter st + +let () = + testCase "empty" (fn _ => + expectEq (unwrapToks "") [EOF]; + + expectEq (unwrapToks " \n\n \t \t \r ") [EOF]); + + testCase "parenthesis" (fn _ => + expectEq (unwrapToks "(") [BR_OPN, EOF]; + + expectEq (unwrapToks ")") [BR_CLS, EOF]; + + expectEq (unwrapToks "()") [BR_OPN, BR_CLS, EOF]; + + expectEq (unwrapToks "\n\n\t ) \n\n (") [BR_CLS, BR_OPN, EOF]) From 643785123b05a1bff2973f5b5fca3f935280aedc Mon Sep 17 00:00:00 2001 From: wojpok Date: Tue, 30 Dec 2025 18:35:10 +0100 Subject: [PATCH 04/14] block comments --- Bug.fram | 3 ++ src/Parser/Lexer.fram | 97 +++++++++++++++++++++++++++++++------------ src/Tests/Lexer.fram | 20 ++++++++- 3 files changed, 93 insertions(+), 27 deletions(-) create mode 100644 Bug.fram diff --git a/Bug.fram b/Bug.fram new file mode 100644 index 0000000..a10cded --- /dev/null +++ b/Bug.fram @@ -0,0 +1,3 @@ + +let rec iter (i : Int) (j : Int) = (if True then iter i else iter 0) j + diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 88d4c7e..11c7df8 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -3,6 +3,10 @@ import Map as M import List as L import String as S +# dubugs +let print = extern dbl_printStrLn : String -> Unit +let show {X, method show : X ->> String} (e : X) = print e.show + # ----------------------------------------------------------------------------- # Possible additions to stdlib method mem (str : String) (chr : Char) = @@ -11,19 +15,20 @@ method mem (str : String) (chr : Char) = | None => False end -method startsWith {?pos : Int} (str : String) (prefix : String) = - if str.length < prefix.length then +method contains {?pos : Int} (str : String) (prefix : String) = + let startPos = pos.unwrapOr 0 in + if str.length - startPos < prefix.length then False else - (let rec iter i = - if i >= prefix.length then + (let rec iter i1 i2 = + if i1 - startPos >= prefix.length then True - else if (prefix.get i != str.get i) then + else if (str.get i1 != prefix.get i2) then False else - iter (i + 1) + iter (i1 + 1) (i2 + 1) in - iter (pos.unwrapOr 0)) + iter startPos 0) # ----------------------------------------------------------------------------- # Fatal errors @@ -58,9 +63,10 @@ method getChar (Caret {idx, str}) = ~err.raise "getChar - out of position" data LexBuf E = - { seek : Unit ->[E] Unit - , curr : Unit ->[E] Option Char - , tryMatch : String ->[E] Bool + { seek : Unit ->[E] Unit + , curr : Unit ->[E] Option Char + , tryMatch : String ->[E] Bool + , caret : Unit ->[E] Caret } let lexBufH caret = @@ -70,12 +76,13 @@ let lexBufH caret = , effect curr () / r = fn (c : Caret) => r c.getChar c , effect tryMatch pattern / r = fn (c : Caret) => - if c.str.startsWith {pos=c.idx} pattern then + if c.str.contains {pos=c.idx} pattern then (let c = c.moveBy (pattern.length) in r True c) else r False c - } + , effect caret () / r = fn c => r c c + } return x => fn c => (x, c) finally c => c caret end @@ -83,6 +90,10 @@ let lexBufH caret = parameter E_LexBuf parameter ~lb : LexBuf E_LexBuf +let getCaret () = ~lb.caret () +let currChar () = ~lb.curr () +let tryMatch pttrn = ~lb.tryMatch pttrn + # ----------------------------------------------------------------------------- # Position tracker @@ -179,6 +190,23 @@ let popChar () = Some ch end +let takeWhile pred = + let caret = getCaret () in + let rec iter (count : Int) = + let cond = + match ~lb.curr () with + | None => False + | Some x => pred x + end + in + if cond then + (~lb.seek (); iter (count + 1)) + else + count + in + let len = iter 0 in + caret.str.substring caret.idx len + # ----------------------------------------------------------------------------- # Char type predicates @@ -202,6 +230,9 @@ let isUidStart (chr : Char) = let isVarChar (char : Char) = isLidStart char || isUidStart char || isDigit char || (char == '\'') +let isCommentName (char : Char) = + not ("\x7f{}".mem char) && not ('\0' <= char && char <= ' ') + # ----------------------------------------------------------------------------- # Lexer @@ -302,20 +333,34 @@ let tokenizeOp str = raise "Internal operator error" end -let rec token () = - match popChar () with - | None => EOF - | Some chr => - if isWhite chr || chr == '\n' then token () - # else if chr == '{' && tryGetChar '#' then KW_EFFECT - else if chr == '(' then BR_OPN - else if chr == ')' then BR_CLS - else if chr == '[' then SBR_OPN - else if chr == ']' then SBR_CLS - else if chr == '{' then CBR_OPN - else if chr == '}' then CBR_CLS - else raise "TODO" - end +rec + let token () = + match popChar () with + | None => EOF + | Some chr => + if isWhite chr || chr == '\n' then token () + else if chr == '{' && tryMatch "#" then + (let name = takeWhile isCommentName in + blockComment (name + "#}")) + else if chr == '(' then BR_OPN + else if chr == ')' then BR_CLS + else if chr == '[' then SBR_OPN + else if chr == ']' then SBR_CLS + else if chr == '{' then CBR_OPN + else if chr == '}' then CBR_CLS + else raise "TODO" + end + + let blockComment closing = + if currChar () == Some '\n' then + (~lb.seek (); ~pt.newLine (); blockComment closing) + else if tryMatch closing then + token () + else if currChar () == None then + raise "Unexpected EOF while parsing block comment" + else + (~lb.seek (); blockComment closing) +end # ----------------------------------------------------------------------------- # Public interface diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index 1d491a8..2e522cc 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -89,6 +89,13 @@ method equal (t1 : Token) (t2 : Token) = | _, _ => False end +method show (t : Token) = + match t with + | EOF => "EOF" + | CBR_CLS => "CBR_CLS" + | _ => "?" + end + let unwrapToks str = let st = mkLexerState str in let rec iter st = @@ -113,4 +120,15 @@ let () = expectEq (unwrapToks "()") [BR_OPN, BR_CLS, EOF]; - expectEq (unwrapToks "\n\n\t ) \n\n (") [BR_CLS, BR_OPN, EOF]) + expectEq (unwrapToks "\n\n\t ) \n\n (") [BR_CLS, BR_OPN, EOF]); + + testCase "blockComments" (fn _ => + expectEqS (unwrapToks " {# #} ") [EOF]; + + expectEqS (unwrapToks " {## #} ##} ") [EOF]; + + expectEqS (unwrapToks " {#aaaa#} ") []; + + expectEqS (unwrapToks "{#aa\naa#}") [EOF]; + + expectEqS (unwrapToks "{# #} {## ##} {### ###}") [EOF]) From 76821c111b3ddb46465a0f9ad0ad5aa94a0d5604 Mon Sep 17 00:00:00 2001 From: wojpok Date: Tue, 30 Dec 2025 18:41:42 +0100 Subject: [PATCH 05/14] operators --- src/Parser/Lexer.fram | 3 +++ src/Tests/Lexer.fram | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 11c7df8..ef16517 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -348,6 +348,9 @@ rec else if chr == ']' then SBR_CLS else if chr == '{' then CBR_OPN else if chr == '}' then CBR_CLS + else if isOpChar chr then + (let op = chr.toString + takeWhile isOpChar in + tokenizeOp op) else raise "TODO" end diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index 2e522cc..e2c1459 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -131,4 +131,9 @@ let () = expectEqS (unwrapToks "{#aa\naa#}") [EOF]; - expectEqS (unwrapToks "{# #} {## ##} {### ###}") [EOF]) + expectEqS (unwrapToks "{# #} {## ##} {### ###}") [EOF]); + + testCase "operators" (fn _ => + expectEqS + (unwrapToks " + - * ^ && ||||| ") + [OP_80 "+", OP_80 "-", OP_90 "*", OP_70 "^", OP_50 "&&", OP_40 "|||||", EOF]) \ No newline at end of file From 71357e2afc8171d13a3a45f08ffe90eb63c27200 Mon Sep 17 00:00:00 2001 From: wojpok Date: Sat, 3 Jan 2026 11:52:33 +0100 Subject: [PATCH 06/14] basic strings --- src/Parser/Lexer.fram | 26 +++++++++++++++++++++++++- src/Tests/Lexer.fram | 8 +++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index ef16517..de29ce2 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -351,18 +351,42 @@ rec else if isOpChar chr then (let op = chr.toString + takeWhile isOpChar in tokenizeOp op) + else if chr == '"' then + stringToken True [] else raise "TODO" end let blockComment closing = if currChar () == Some '\n' then - (~lb.seek (); ~pt.newLine (); blockComment closing) + (~lb.seek (); + ~pt.newLine (); + blockComment closing) else if tryMatch closing then token () else if currChar () == None then raise "Unexpected EOF while parsing block comment" else (~lb.seek (); blockComment closing) + + let stringToken isOpening buffer = + match popChar () with + | None => raise "Unclosed string token" + | Some chr => + if chr == '\n' then + (~pt.newLine (); + stringToken isOpening ('\n' :: buffer)) + else if chr == '"' then + (let str = charListToStr buffer.rev in + if isOpening then STR str else ESTR str) + else if chr == '\\' && tryMatch "{" then + (~br.push BT_Interp; + let str = charListToStr buffer.rev in + if isOpening then BSTR str else CSTR str) + else if chr == '\\' then + raise "TODO - string escape" + else + stringToken isOpening (chr :: buffer) + end end # ----------------------------------------------------------------------------- diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index e2c1459..a7d280e 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -93,6 +93,7 @@ method show (t : Token) = match t with | EOF => "EOF" | CBR_CLS => "CBR_CLS" + | STR s => "STR \{s.show}" | _ => "?" end @@ -136,4 +137,9 @@ let () = testCase "operators" (fn _ => expectEqS (unwrapToks " + - * ^ && ||||| ") - [OP_80 "+", OP_80 "-", OP_90 "*", OP_70 "^", OP_50 "&&", OP_40 "|||||", EOF]) \ No newline at end of file + [OP_80 "+", OP_80 "-", OP_90 "*", OP_70 "^", OP_50 "&&", OP_40 "|||||", EOF]); + + testCase "basic strings" (fn _ => + expectEqS + (unwrapToks " \"my string\" \"foo\nbar\" ") + [STR "my string", STR "foo\nbar", EOF]) \ No newline at end of file From cce6faf7b4986216b480182fd30a767f399dd6c7 Mon Sep 17 00:00:00 2001 From: wojpok Date: Sat, 3 Jan 2026 12:08:13 +0100 Subject: [PATCH 07/14] string interpolation --- src/Parser/Lexer.fram | 13 +++++++++++-- src/Tests/Lexer.fram | 21 ++++++++++++++++++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index de29ce2..138f4a5 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -346,8 +346,17 @@ rec else if chr == ')' then BR_CLS else if chr == '[' then SBR_OPN else if chr == ']' then SBR_CLS - else if chr == '{' then CBR_OPN - else if chr == '}' then CBR_CLS + else if chr == '{' then + (~br.push BT_Regular; + CBR_OPN) + else if chr == '@' && tryMatch "{" then + (~br.push BT_Regular; + ATTR_OPEN) + else if chr == '}' then + (match ~br.pop () with + | BT_Regular => CBR_CLS + | BT_Interp => stringToken False [] + end) else if isOpChar chr then (let op = chr.toString + takeWhile isOpChar in tokenizeOp op) diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index a7d280e..72db171 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -92,8 +92,12 @@ method equal (t1 : Token) (t2 : Token) = method show (t : Token) = match t with | EOF => "EOF" + | CBR_OPN => "CBR_OPN" | CBR_CLS => "CBR_CLS" | STR s => "STR \{s.show}" + | BSTR s => "BSTR \{s.show}" + | CSTR s => "CSTR \{s.show}" + | ESTR s => "ESTR \{s.show}" | _ => "?" end @@ -142,4 +146,19 @@ let () = testCase "basic strings" (fn _ => expectEqS (unwrapToks " \"my string\" \"foo\nbar\" ") - [STR "my string", STR "foo\nbar", EOF]) \ No newline at end of file + [STR "my string", STR "foo\nbar", EOF]); + + testCase "interpolation" (fn _ => + expectEqS + (unwrapToks " {} ") + [CBR_OPN, CBR_CLS, EOF]; + + expectEqS + (unwrapToks "{}}") + [CBR_OPN, CBR_CLS]; # FAIL + + expectEqS + (unwrapToks " \"interp=\\{ {} }other=\\{++}\" ") + [BSTR "interp=", CBR_OPN, CBR_CLS, CSTR "other=", OP_80 "++", ESTR "", EOF]); + + () \ No newline at end of file From c774e59f6e33b85a4820978d132c4de3f4595c4b Mon Sep 17 00:00:00 2001 From: wojpok Date: Sat, 3 Jan 2026 12:46:17 +0100 Subject: [PATCH 08/14] chars --- src/Parser/Lexer.fram | 48 ++++++++++++++++++++++++++++++++++++++++++- src/Tests/Lexer.fram | 21 ++++++++++++++++--- 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 138f4a5..20c4c37 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -190,6 +190,10 @@ let popChar () = Some ch end +let forcePopChar () = + let ~onError () = raise "EOF" in + popChar () >.unwrapErr + let takeWhile pred = let caret = getCaret () in let rec iter (count : Int) = @@ -233,6 +237,19 @@ let isVarChar (char : Char) = let isCommentName (char : Char) = not ("\x7f{}".mem char) && not ('\0' <= char && char <= ' ') +# ----------------------------------------------------------------------------- +# Utils + +let parseHexDigit (dgt : Char) = + if '0' <= dgt && dgt <= '9' then + dgt.code - '0'.code + else if 'a' <= dgt && dgt <= 'f' then + dgt.code - 'a'.code + 10 + else if 'A' <= dgt && dgt <= 'F' then + dgt.code - 'A'.code + 10 + else + raise "Invalid hexadecimal digit" + # ----------------------------------------------------------------------------- # Lexer @@ -333,6 +350,27 @@ let tokenizeOp str = raise "Internal operator error" end +let parseEscape () = + let c = forcePopChar () in + if "\"\'\\".mem c then + c + else if c == '0' then '\0' + else if c == 'n' then '\n' + else if c == 'b' then '\b' + else if c == 't' then '\t' + else if c == 'r' then '\r' + else if c == 'v' then '\v' + else if c == 'a' then '\a' + else if c == 'f' then '\f' + else if "xX".mem c then + (let ch1 = forcePopChar () in + let ch2 = forcePopChar () in + let num = parseHexDigit ch1 * 16 + parseHexDigit ch2 in + let ~onError () = raise "impossible" in + chr num) + else + raise "Invalid escape: \{c.show}" + rec let token () = match popChar () with @@ -360,6 +398,13 @@ rec else if isOpChar chr then (let op = chr.toString + takeWhile isOpChar in tokenizeOp op) + else if chr == '\'' then + (let c = forcePopChar () in + let c = if c != '\\' then c else parseEscape () in + if forcePopChar () != '\'' then + raise "Unclosed Char" + else + CHR c) else if chr == '"' then stringToken True [] else raise "TODO" @@ -392,7 +437,8 @@ rec let str = charListToStr buffer.rev in if isOpening then BSTR str else CSTR str) else if chr == '\\' then - raise "TODO - string escape" + (let chr = parseEscape () in + stringToken isOpening (chr :: buffer)) else stringToken isOpening (chr :: buffer) end diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index 72db171..493a8bf 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -98,14 +98,15 @@ method show (t : Token) = | BSTR s => "BSTR \{s.show}" | CSTR s => "CSTR \{s.show}" | ESTR s => "ESTR \{s.show}" + | CHR c => "CHR \{c.show}" | _ => "?" end -let unwrapToks str = +let unwrapToks {~testLogger} str = let st = mkLexerState str in let rec iter st = match getTok st with - | Err _ => [] # No EOF! + | Err e => log e; [] # No EOF! | Ok (EOF, _) => [EOF] | Ok (tok, st) => tok :: iter st end @@ -160,5 +161,19 @@ let () = expectEqS (unwrapToks " \"interp=\\{ {} }other=\\{++}\" ") [BSTR "interp=", CBR_OPN, CBR_CLS, CSTR "other=", OP_80 "++", ESTR "", EOF]); - + + testCase "escapes" (fn _ => + expectEqS + (unwrapToks " \"\\a \\n \\t \\xFF \\xbC\" ") + [STR "\a \n \t \xff \xbc", EOF]); + + testCase "chars" (fn _ => + expectEqS + (unwrapToks " \'\\n\' ") + [CHR '\n', EOF]; + + expectEqS + (unwrapToks " \'a\' \'\\n\' \' \' \'\\xbc\' ") + [CHR 'a', CHR '\n', CHR ' ', CHR '\xBC', EOF]); + () \ No newline at end of file From 7a0e20c32cd2b6a027c54a1a9f3a142a28ff03cc Mon Sep 17 00:00:00 2001 From: wojpok Date: Tue, 6 Jan 2026 17:38:12 +0100 Subject: [PATCH 09/14] numbers --- src/Parser/Lexer.fram | 69 +++++++++++++++++++++++++++++++++++++++++++ src/Tests/Lexer.fram | 11 +++++++ 2 files changed, 80 insertions(+) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 20c4c37..c273c3f 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -250,6 +250,24 @@ let parseHexDigit (dgt : Char) = else raise "Invalid hexadecimal digit" +let parseDecDigit (dgt : Char) = + if '0' <= dgt && dgt <= '9' then + dgt.code - '0'.code + else + raise "Invalid decimal digit" + +let parseOctDigit (dgt : Char) = + if '0' <= dgt && dgt <= '7' then + dgt.code - '0'.code + else + raise "Invalid octal digit" + +let parseBinDigit (dgt : Char) = + if '0' == dgt || '1' == dgt then + dgt.code - '0'.code + else + raise "Invalid binary digit" + # ----------------------------------------------------------------------------- # Lexer @@ -350,6 +368,54 @@ let tokenizeOp str = raise "Internal operator error" end +let readNum (num : String) (base : Int) (parser : Char ->[E_Fatal] Int) = + if num.length == 0 then + raise "Invalid number"; + let rec iter (i : Int) (acc : Int) = + if i >= num.length then + NUM acc + else + (let digit = num.get i in + let acc = acc * base + parser digit in + if acc < 0 then + raise "Number is too big!" + else + iter (i + 1) acc) + in + iter 0 0 + +let readNum64 (num : String) (base : Int) (parser : Char ->[E_Fatal] Int) = + let base = base.toInt64 in + if num.length == 0 then + raise "Invalid number"; + let rec iter (i : Int) (acc : Int64) = + if i >= num.length then + NUM64 acc + else + (let digit = num.get i in + let acc = acc * base + (parser digit).toInt64 in + if acc < 0L then + raise "Number is too big!" + else + iter (i + 1) acc) + in + iter 0 0L + +let tokenizeNumber (num : String) = + let (base, parser) = + if num.contains "0b" then (2, parseBinDigit) + else if num.contains "0o" then (8, parseOctDigit) + else if num.contains "0x" then (16, parseHexDigit) + else (10, parseDecDigit) + let num = if base == 10 then num else num.substring 2 (num.length - 2) + let (num, reader) = + if num.get (num.length - 1) == 'L' then + (num.substring 0 (num.length - 1), readNum64) + else + (num, readNum) + in + reader num base parser + let parseEscape () = let c = forcePopChar () in if "\"\'\\".mem c then @@ -398,6 +464,9 @@ rec else if isOpChar chr then (let op = chr.toString + takeWhile isOpChar in tokenizeOp op) + else if isDigit chr then + (let num = chr.toString + takeWhile isVarChar in + tokenizeNumber num) else if chr == '\'' then (let c = forcePopChar () in let c = if c != '\\' then c else parseEscape () in diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index 493a8bf..8dc9081 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -176,4 +176,15 @@ let () = (unwrapToks " \'a\' \'\\n\' \' \' \'\\xbc\' ") [CHR 'a', CHR '\n', CHR ' ', CHR '\xBC', EOF]); + testCase "numbers" (fn _ => + expectEqS + (unwrapToks "42 0xFf 0o75 0b1010") + [NUM 42, NUM 255, NUM 61, NUM 10, EOF]; + + expectEqS + (unwrapToks "42L 0xFFL 0o75L 0b1010L") + [NUM64 42L, NUM64 255L, NUM64 61L, NUM64 10L, EOF]; + + expectEqS (unwrapToks "1000000000000000000000") []); + () \ No newline at end of file From 8844eae22c49ec62f81a0197c5f5cce59def9a8a Mon Sep 17 00:00:00 2001 From: wojpok Date: Tue, 6 Jan 2026 17:51:45 +0100 Subject: [PATCH 10/14] identifiers --- src/Parser/Lexer.fram | 20 +++++++++++++++++++- src/Tests/Lexer.fram | 5 +++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index c273c3f..8d74f2a 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -30,6 +30,12 @@ method contains {?pos : Int} (str : String) (prefix : String) = in iter startPos 0) +method andmap opt f = + match opt with + | None => False + | Some x => f x + end + # ----------------------------------------------------------------------------- # Fatal errors @@ -461,9 +467,21 @@ rec | BT_Regular => CBR_CLS | BT_Interp => stringToken False [] end) - else if isOpChar chr then + # TODO Find better solution + else if ("?~".mem chr && (currChar ()).andmap isOpChar) + || (not ("?~".mem chr) && isOpChar chr) then (let op = chr.toString + takeWhile isOpChar in tokenizeOp op) + else if isLidStart chr then + (let tok = chr.toString + takeWhile isVarChar in + tokenizeIdent tok) + else if isUidStart chr then + (let uid = chr.toString + takeWhile isVarChar in + UID uid) + else if chr == '~' && (currChar ()).andmap isVarChar then + (TLID (takeWhile isVarChar)) + else if chr == '?' && (currChar ()).andmap isVarChar then + (QLID (takeWhile isVarChar)) else if isDigit chr then (let num = chr.toString + takeWhile isVarChar in tokenizeNumber num) diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index 8dc9081..b11d27d 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -187,4 +187,9 @@ let () = expectEqS (unwrapToks "1000000000000000000000") []); + testCase "identifiers" (fn _ => + expectEqS + (unwrapToks "foo Bar ~baz ?qux") + [LID "foo", UID "Bar", TLID "baz", QLID "qux", EOF]); + () \ No newline at end of file From 8698cf1551e314b539822d044a41ab5e3c8a0665 Mon Sep 17 00:00:00 2001 From: wojpok Date: Sun, 11 Jan 2026 16:22:55 +0100 Subject: [PATCH 11/14] reworked position trackers --- src/Parser/Lexer.fram | 72 +++++++++++++++++++++--------------------- src/Parser/Tokens.fram | 14 +++++++- src/Tests/Lexer.fram | 2 +- 3 files changed, 50 insertions(+), 38 deletions(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 8d74f2a..59d0448 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -55,12 +55,17 @@ let raise str = ~err.raise str # ----------------------------------------------------------------------------- # Reading buffer -data Caret = {idx : Int, str : String} -let mkCaret str = Caret {idx = 0, str} +type CaretPos = Pos +data Caret = {pos : CaretPos, str : String} +let mkCaret str = Caret {pos = Pos {idx = 0, line = 0, col = 0}, str} -method moveBy (Caret {idx, str}) len = Caret {idx = idx + len, str} +method moveBy (Caret {pos = Pos {idx, line, col}, str}) len = + Caret {pos = Pos {idx = idx + len, line, col = col + len}, str} -method getChar (Caret {idx, str}) = +method moveNewLine (Caret {pos = Pos {idx, line, col}, str}) = + Caret {pos = Pos {idx = idx + 1, line = line + 1, col = 0}, str} + +method getChar (Caret {pos = Pos {idx}, str}) = if idx < str.length then Some (str.get idx) else if idx == str.length then @@ -82,7 +87,7 @@ let lexBufH caret = , effect curr () / r = fn (c : Caret) => r c.getChar c , effect tryMatch pattern / r = fn (c : Caret) => - if c.str.contains {pos=c.idx} pattern then + if c.str.contains {pos=c.pos.idx} pattern then (let c = c.moveBy (pattern.length) in r True c) else @@ -103,22 +108,22 @@ let tryMatch pttrn = ~lb.tryMatch pttrn # ----------------------------------------------------------------------------- # Position tracker -data Pos = {line : Int, col : Int} - -method moveBy (Pos {line, col}) len = Pos {line, col = col + len} -method newLine (Pos {line}) = Pos {line = line + 1, col = 0} +data LexStart = {pos : Pos, file : String} data PosTracker E = - { moveBy : Int ->[E] Unit - , newLine : Unit ->[E] Unit + { setStart : Pos ->[E] Unit + , getStart : Unit ->[E] Pos + , setFilename : String ->[E] Unit } let posTrackerH pos = handler PosTracker - { effect moveBy len / r = fn (pos : Pos) => - r () (pos.moveBy len) - , effect newLine () / r = fn (pos : Pos) => - r () (pos.newLine) + { effect setStart pos / r = fn (lex : LexStart) => + r () (LexStart {pos, file = lex.file}) + , effect getStart () / r = fn (lex : LexStart) => + r lex.pos lex + , effect setFilename file / r = fn (lex : LexStart) => + r () (LexStart {pos = lex.pos, file}) } return x => fn p => (x, p) finally c => c pos @@ -130,7 +135,6 @@ parameter ~pt : PosTracker E_PosTracker # ----------------------------------------------------------------------------- # Bracket Stack - data BracketType = | BT_Regular | BT_Interp @@ -189,10 +193,6 @@ let popChar () = | None => None | Some ch => ~lb.seek (); - if ch == '\n' then - ~pt.newLine () - else - ~pt.moveBy 1; Some ch end @@ -215,7 +215,7 @@ let takeWhile pred = count in let len = iter 0 in - caret.str.substring caret.idx len + caret.str.substring caret.pos.idx len # ----------------------------------------------------------------------------- # Char type predicates @@ -500,7 +500,6 @@ rec let blockComment closing = if currChar () == Some '\n' then (~lb.seek (); - ~pt.newLine (); blockComment closing) else if tryMatch closing then token () @@ -514,8 +513,7 @@ rec | None => raise "Unclosed string token" | Some chr => if chr == '\n' then - (~pt.newLine (); - stringToken isOpening ('\n' :: buffer)) + stringToken isOpening ('\n' :: buffer) else if chr == '"' then (let str = charListToStr buffer.rev in if isOpening then STR str else ESTR str) @@ -536,26 +534,28 @@ end abstr data LexerState = { caret : Caret - , pos : Pos + , start : LexStart , stack : BracketStack } -pub let mkLexerState str = LexerState - { caret = Caret {str, idx = 0} - , pos = Pos {line = 0, col = 0} - , stack = emptyStack - } +pub let mkLexerState str file = + let pos = Pos {idx = 0, line = 0, col = 0} in + LexerState + { caret = Caret {str, pos} + , start = LexStart {file, pos} + , stack = emptyStack + } -pub let getTok (st : LexerState) = +pub let getTok (LexerState {caret, start, stack}) = let res = handle ~err / E_Fatal with fatalH in - handle ~lb / E_LexBuf with lexBufH st.caret in - handle ~pt / E_PosTracker with posTrackerH st.pos in - handle ~br / E_Brackets with cbracketTrackerH st.stack in + handle ~lb / E_LexBuf with lexBufH caret in + handle ~pt / E_PosTracker with posTrackerH (LexStart {pos = caret.pos, file = start.file}) in + handle ~br / E_Brackets with cbracketTrackerH stack in token () in match res with | Err s => Err s - | Ok (((tok, stack), pos), caret) => - Ok (tok, LexerState {stack, pos, caret}) + | Ok (((tok, stack), start), caret) => + Ok (tok, LexerState {stack, start, caret}) end diff --git a/src/Parser/Tokens.fram b/src/Parser/Tokens.fram index 5f2c286..3353a71 100644 --- a/src/Parser/Tokens.fram +++ b/src/Parser/Tokens.fram @@ -87,4 +87,16 @@ pub data Token = | UNDERSCORE | EOF -pub data Tok = {token : Token, pos : Int} +pub data Pos = + { idx : Int + , line : Int + , col : Int + } + +pub data PosSpan = + { file : String + , first : Pos + , last : Pos + } + +pub data Tok = {token : Token, pos : PosSpan} diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index b11d27d..8faef8c 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -103,7 +103,7 @@ method show (t : Token) = end let unwrapToks {~testLogger} str = - let st = mkLexerState str in + let st = mkLexerState str "test" in let rec iter st = match getTok st with | Err e => log e; [] # No EOF! From 9ad85f07c2c9d1260406924c3f96ec3c7e7f3470 Mon Sep 17 00:00:00 2001 From: wojpok Date: Sun, 11 Jan 2026 16:56:04 +0100 Subject: [PATCH 12/14] attemt at position tracking --- src/Parser/Lexer.fram | 13 ++++++---- src/Tests/Lexer.fram | 55 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 59d0448..45693a4 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -83,7 +83,7 @@ data LexBuf E = let lexBufH caret = handler LexBuf { effect seek () / r = fn (c : Caret) => - r () (c.moveBy 1) + r () (if c.getChar == Some '\n' then c.moveNewLine else c.moveBy 1) , effect curr () / r = fn (c : Caret) => r c.getChar c , effect tryMatch pattern / r = fn (c : Caret) => @@ -550,12 +550,17 @@ pub let getTok (LexerState {caret, start, stack}) = let res = handle ~err / E_Fatal with fatalH in handle ~lb / E_LexBuf with lexBufH caret in - handle ~pt / E_PosTracker with posTrackerH (LexStart {pos = caret.pos, file = start.file}) in + handle ~pt / E_PosTracker with posTrackerH + (LexStart {pos = caret.pos, file = start.file}) in handle ~br / E_Brackets with cbracketTrackerH stack in token () in match res with | Err s => Err s - | Ok (((tok, stack), start), caret) => - Ok (tok, LexerState {stack, start, caret}) + | Ok (((token, stack), start), caret) => + Ok ( Tok { token + , pos = PosSpan { first = start.pos + , last = caret.pos + , file = start.file}} + , LexerState {stack, start, caret}) end diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index 8faef8c..e3a8684 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -1,6 +1,7 @@ import open Testing import open Parser/Tokens import open Parser/Lexer +import List method equal (t1 : Token) (t2 : Token) = match (t1, t2) with @@ -102,17 +103,25 @@ method show (t : Token) = | _ => "?" end -let unwrapToks {~testLogger} str = +let unwrapToksWithPos str = let st = mkLexerState str "test" in let rec iter st = match getTok st with - | Err e => log e; [] # No EOF! - | Ok (EOF, _) => [EOF] - | Ok (tok, st) => tok :: iter st + | Err e => [] # No EOF! + | Ok (tok, st) => + tok :: + match tok with + | Tok {token = EOF} => [] + | _ => iter st + end end in iter st +# This annotation speeds up tests massively! +let (unwrapToks : String ->[] List Token) = fn str => + List.map (fn (t : Tok) => t.token) (unwrapToksWithPos str) + let () = testCase "empty" (fn _ => expectEq (unwrapToks "") [EOF]; @@ -192,4 +201,40 @@ let () = (unwrapToks "foo Bar ~baz ?qux") [LID "foo", UID "Bar", TLID "baz", QLID "qux", EOF]); - () \ No newline at end of file + () + + +let largeCode = " +import open /List + +let foo = bar baz (-0L) in + \"my code \\{ LT \\}\" +" + +data TokMatch = TokMatch of Token, Int, Int, Int, Int + +method equal (tok : Tok) ((TokMatch t ls cs le ce)) = + tok.token == t + && tok.pos.first.line == ls + && tok.pos.first.col == cs + && tok.pos.last.line == le + && tok.pos.last.col == ce + +let _ = testCase "positions" (fn _ => + let toks = unwrapToksWithPos largeCode in + assertDoesNotCallOnError (fn _ => + assertEq + (toks.nthErr 0) + (TokMatch KW_IMPORT 0 0 1 6); + + assertEq + (toks.nthErr 1) + (TokMatch KW_OPEN 1 6 1 11); + + assertEq + (toks.nthErr 2) + (TokMatch SLASH 1 11 1 13); + + assertEq + (toks.nthErr 3) + (TokMatch (UID "List") 1 13 1 17))) From 2b93913599c2b711941989e5004c381eb44929df Mon Sep 17 00:00:00 2001 From: wojpok Date: Sun, 18 Jan 2026 13:02:51 +0100 Subject: [PATCH 13/14] fixed up positioning --- src/Parser/Lexer.fram | 5 +++ src/Tests/Lexer.fram | 81 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index 45693a4..bcece1a 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -217,6 +217,10 @@ let takeWhile pred = let len = iter 0 in caret.str.substring caret.pos.idx len +let markTokStart () = + let pos = ~lb.caret () >.pos in + ~pt.setStart pos + # ----------------------------------------------------------------------------- # Char type predicates @@ -445,6 +449,7 @@ let parseEscape () = rec let token () = + markTokStart (); match popChar () with | None => EOF | Some chr => diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index e3a8684..8536b81 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -103,6 +103,15 @@ method show (t : Token) = | _ => "?" end +method show (Pos {idx, line, col}) = + "Pos {idx=\{idx}, line=\{line}, col=\{col}}" + +method show (PosSpan {first, last}) = + "PosSpan {first=\{first.show}, last=\{last.show}}" + +method show (Tok {token, pos}) = + "Tok {token=\{token.show}, pos=\{pos.show}}" + let unwrapToksWithPos str = let st = mkLexerState str "test" in let rec iter st = @@ -208,8 +217,8 @@ let largeCode = " import open /List let foo = bar baz (-0L) in - \"my code \\{ LT \\}\" -" + \"my code \\{ LT }\" + " data TokMatch = TokMatch of Token, Int, Int, Int, Int @@ -225,16 +234,74 @@ let _ = testCase "positions" (fn _ => assertDoesNotCallOnError (fn _ => assertEq (toks.nthErr 0) - (TokMatch KW_IMPORT 0 0 1 6); - + (TokMatch KW_IMPORT 1 0 1 6); + assertEq (toks.nthErr 1) - (TokMatch KW_OPEN 1 6 1 11); + (TokMatch KW_OPEN 1 7 1 11); assertEq (toks.nthErr 2) - (TokMatch SLASH 1 11 1 13); + (TokMatch SLASH 1 12 1 13); assertEq (toks.nthErr 3) - (TokMatch (UID "List") 1 13 1 17))) + (TokMatch (UID "List") 1 13 1 17); + + assertEq + (toks.nthErr 4) + (TokMatch KW_LET 3 0 3 3); + + assertEq + (toks.nthErr 5) + (TokMatch (LID "foo") 3 4 3 7); + + assertEq + (toks.nthErr 6) + (TokMatch EQ 3 8 3 9); + + assertEq + (toks.nthErr 7) + (TokMatch (LID "bar") 3 10 3 13); + + assertEq + (toks.nthErr 8) + (TokMatch (LID "baz") 3 14 3 17); + + assertEq + (toks.nthErr 9) + (TokMatch BR_OPN 3 18 3 19); + + assertEq + (toks.nthErr 10) + (TokMatch (OP_80 "-") 3 19 3 20); + + assertEq + (toks.nthErr 11) + (TokMatch (NUM64 0L) 3 20 3 22); + + assertEq + (toks.nthErr 12) + (TokMatch BR_CLS 3 22 3 23); + + assertEq + (toks.nthErr 13) + (TokMatch KW_IN 3 24 3 26); + + assertEq + (toks.nthErr 14) + (TokMatch (BSTR "my code ") 4 2 4 13); + + assertEq + (toks.nthErr 15) + (TokMatch (UID "LT") 4 14 4 16); + + assertEq + (toks.nthErr 16) + (TokMatch (ESTR "") 4 17 4 19); + + assertEq + (toks.nthErr 17) + (TokMatch EOF 5 2 5 2); + + ())) From 5253a8f2a69d62086bb9751e5d9207104cc62927 Mon Sep 17 00:00:00 2001 From: wojpok Date: Sun, 18 Jan 2026 13:08:59 +0100 Subject: [PATCH 14/14] single line comments --- src/Parser/Lexer.fram | 10 ++++++++++ src/Tests/Lexer.fram | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/src/Parser/Lexer.fram b/src/Parser/Lexer.fram index bcece1a..bc99d20 100644 --- a/src/Parser/Lexer.fram +++ b/src/Parser/Lexer.fram @@ -454,6 +454,8 @@ rec | None => EOF | Some chr => if isWhite chr || chr == '\n' then token () + else if chr == '#' then + singleComment () else if chr == '{' && tryMatch "#" then (let name = takeWhile isCommentName in blockComment (name + "#}")) @@ -513,6 +515,14 @@ rec else (~lb.seek (); blockComment closing) + let singleComment () = + if currChar () == None then # EOF + token () + else if currChar () == Some '\n' then + (let _ = popChar () in token ()) + else + (let _ = popChar () in singleComment ()) + let stringToken isOpening buffer = match popChar () with | None => raise "Unclosed string token" diff --git a/src/Tests/Lexer.fram b/src/Tests/Lexer.fram index 8536b81..904f8f4 100644 --- a/src/Tests/Lexer.fram +++ b/src/Tests/Lexer.fram @@ -209,6 +209,15 @@ let () = expectEqS (unwrapToks "foo Bar ~baz ?qux") [LID "foo", UID "Bar", TLID "baz", QLID "qux", EOF]); + + testCase "single line comments" (fn _ => + expectEqS + (unwrapToks "#asdasdas") + [EOF]; + + expectEqS + (unwrapToks "#anycomment\n10") + [NUM 10, EOF]); ()