diff --git a/src/dparse/ast.d b/src/dparse/ast.d
index eba8d1a3..c18b06e4 100644
--- a/src/dparse/ast.d
+++ b/src/dparse/ast.d
@@ -73,6 +73,9 @@ shared static this()
     typeMap[typeid(TypeofExpression)] = 46;
     typeMap[typeid(UnaryExpression)] = 47;
     typeMap[typeid(XorExpression)] = 48;
+    typeMap[typeid(InterpolatedStringLiteralExpression)] = 49;
+    typeMap[typeid(InterpolatedStringLiteralPlain)] = 50;
+    typeMap[typeid(InterpolatedStringLiteralVariable)] = 51;
 }
 
 /// Describes which syntax was used in a list of declarations in the containing AST node
@@ -167,6 +170,19 @@ abstract class ASTVisitor
         case 46: visit(cast(TypeofExpression) n); break;
         case 47: visit(cast(UnaryExpression) n); break;
         case 48: visit(cast(XorExpression) n); break;
+        // skip 49, 50, 51 (used for InterpolatedStringLiteralPart)
+        default: assert(false, __MODULE__ ~ " has a bug");
+        }
+    }
+
+    /// ditto
+    void dynamicDispatch(const InterpolatedStringLiteralPart n)
+    {
+        switch (typeMap.get(typeid(n), 0))
+        {
+        case 49: visit(cast(InterpolatedStringLiteralExpression) n); break;
+        case 50: visit(cast(InterpolatedStringLiteralPlain) n); break;
+        case 51: visit(cast(InterpolatedStringLiteralVariable) n); break;
         default: assert(false, __MODULE__ ~ " has a bug");
         }
     }
@@ -289,6 +305,10 @@ abstract class ASTVisitor
     /** */ void visit(const Initialize initialize) { initialize.accept(this); }
     /** */ void visit(const Initializer initializer) { initializer.accept(this); }
     /** */ void visit(const InterfaceDeclaration interfaceDeclaration) { interfaceDeclaration.accept(this); }
+    /** */ void visit(const InterpolatedStringLiteral interpolatedStringLiteral) { interpolatedStringLiteral.accept(this); }
+    /** */ void visit(const InterpolatedStringLiteralExpression interpolatedStringLiteralExpression) { interpolatedStringLiteralExpression.accept(this); }
+    /** */ void visit(const InterpolatedStringLiteralPlain interpolatedStringLiteralPlain) { interpolatedStringLiteralPlain.accept(this); }
+    /** */ void visit(const InterpolatedStringLiteralVariable interpolatedStringLiteralVariable) { interpolatedStringLiteralVariable.accept(this); }
     /** */ void visit(const Invariant invariant_) { invariant_.accept(this); }
     /** */ void visit(const IsExpression isExpression) { isExpression.accept(this); }
     /** */ void visit(const KeyValuePair keyValuePair) { keyValuePair.accept(this); }
@@ -2318,6 +2338,90 @@ final class InterfaceDeclaration : BaseNode
     mixin OpEquals;
 }
 
+///
+final class InterpolatedStringLiteral : BaseNode
+{
+    import dparse.parser : ParserConfig;
+
+    /// no-op, if you want to visit the nested expressions, use `acceptExpressions`
+    /// Note that those will have a different source `tokens` array and thus
+    /// will have different indices and such. Code accessing token neighbors by
+    /// pointer manipulation will break.
+    override void accept(ASTVisitor visitor) const
+    {
+    }
+
+    /// Dynamically parses the interpolated string and calls the visitor on all
+    /// of its parts. Note that all the AST nodes visited from this function
+    /// will have a different `tokens` source array, so offsetting the pointers
+    /// past the array boundaries may cause out-of-bounds memory reads if you
+    /// don't check for them beforehand.
+    void acceptExpressions(
+        ParserConfig parserConfig,
+        LexerConfig lexerConfig,
+        StringCache* stringCache,
+        ASTVisitor visitor
+    ) const
+    {
+        import dparse.istring : parseIStringParts;
+
+        foreach (part; parseIStringParts(parserConfig, lexerConfig, stringCache, literal))
+        {
+            assert(part !is null);
+            visitor.dynamicDispatch(part);
+        }
+    }
+
+    /// The raw token
+    inout(Token) literal() inout @safe pure nothrow @nogc return
+    {
+        if (!tokens.length)
+            return Token.init;
+        return tokens[0];
+    }
+
+    mixin OpEquals;
+}
+
+///
+abstract class InterpolatedStringLiteralPart : BaseNode
+{
+    /// Index within the istringLiteral, so the value is always at least the
+    /// `istringLiteral` token index + 2, since interpolated strings start with
+    /// `i"` and all parts indices are within the quotes.
+    size_t index;
+}
+
+///
+final class InterpolatedStringLiteralPlain : InterpolatedStringLiteralPart
+{
+    override void accept(ASTVisitor visitor) const
+    {
+    }
+
+    /** */ string sourceText;
+}
+
+///
+final class InterpolatedStringLiteralVariable : InterpolatedStringLiteralPart
+{
+    override void accept(ASTVisitor visitor) const
+    {
+    }
+
+    /** */ string identifier;
+}
+
+///
+final class InterpolatedStringLiteralExpression : InterpolatedStringLiteralPart
+{
+    override void accept(ASTVisitor visitor) const
+    {
+    }
+
+    /** */ Expression expression;
+}
+
 ///
 final class Invariant : BaseNode
 {
@@ -2798,7 +2902,7 @@ final class PrimaryExpression : ExpressionNode
             typeofExpression, typeidExpression, arrayLiteral, assocArrayLiteral,
             expression, dot, identifierOrTemplateInstance, isExpression,
             functionLiteralExpression,traitsExpression, mixinExpression,
-            importExpression, vector, arguments));
+            importExpression, vector, arguments, interpolatedStringLiteral));
     }
     /** */ Token dot;
     /** */ Token primary;
@@ -2818,6 +2922,7 @@ final class PrimaryExpression : ExpressionNode
     /** */ Type type;
     /** */ Token typeConstructor;
     /** */ Arguments arguments;
+    /** */ InterpolatedStringLiteral interpolatedStringLiteral;
     mixin OpEquals;
 }
 
diff --git a/src/dparse/istring.d b/src/dparse/istring.d
new file mode 100644
index 00000000..124e0e9a
--- /dev/null
+++ b/src/dparse/istring.d
@@ -0,0 +1,370 @@
+module dparse.istring;
+
+import dparse.ast;
+import dparse.lexer;
+import dparse.parser;
+
+import std.array;
+import std.string;
+import dparse.rollback_allocator;
+
+/// Splits the interpolated string into its parts (not recursive):
+/// - `InterpolatedStringLiteralPlain` plain text part / no interpolation
+/// - `InterpolatedStringLiteralVariable` a `$identifier` variable
+/// - `InterpolatedStringLiteralExpression` a `$(...)` expression
+///
+/// Params:
+///   parserConfig = uses the allocator to create the returned class instances,
+///     also used in whole for controlling the parser of embedded expressions.
+///   lexerConfig = for embedded expressions, lexer config
+///   stringCache = for embedded expressions, string cache
+///   istring = the `tok!"istringLiteral"` token containing e.g. `i"hello $name"`
+InterpolatedStringLiteralPart[] parseIStringParts(
+    ParserConfig parserConfig,
+    LexerConfig lexerConfig,
+    StringCache* stringCache,
+    return Token istring
+)
+{
+    auto tokens = tokenizeIString(istring);
+    auto ret = new typeof(return)(tokens.length);
+
+    auto allocator = parserConfig.allocator;
+
+    foreach (i, ref retNode; ret)
+    {
+        auto token = tokens[i];
+        switch (token.type)
+        {
+        case tok!"stringLiteral":
+            auto node = allocator.make!InterpolatedStringLiteralPlain();
+            node.sourceText = token.text;
+            node.tokens = tokens[i .. i + 1];
+            retNode = node;
+            break;
+        case tok!"identifier":
+            auto node = allocator.make!InterpolatedStringLiteralVariable();
+            node.identifier = token.text;
+            node.tokens = tokens[i .. i + 1];
+            retNode = node;
+            break;
+        case tok!"specialTokenSequence":
+            auto node = allocator.make!InterpolatedStringLiteralExpression();
+            node.tokens = getTokensForParser(token.text, lexerConfig, stringCache);
+            scope parser = new Parser();
+            with (parserConfig)
+            {
+                parser.fileName = fileName;
+                parser.tokens = node.tokens;
+                parser.messageFunction = messageFunction;
+                parser.messageDelegate = messageDelegate;
+                parser.allocator = parserConfig.allocator;
+            }
+            node.expression = parser.parseExpression();
+            retNode = node;
+            break;
+        default:
+            assert(false);
+        }
+        retNode.index = token.index;
+    }
+
+    return ret;
+}
+
+/// Transforms `i"hello $name $(something + 2)"` into
+/// - tok!"stringLiteral"(index:2, text:"hello ")
+/// - tok!"identifier"(index:9, text:"name")
+/// - tok!"stringLiteral"(index:13, text:" ")
+/// - tok!"specialTokenSequence"(index:16, text:"something + 2")
+/// all tokens are offset by the input `istring.{index, line, column}`
+Token[] tokenizeIString(return Token istring)
+in (istring.text.startsWith(`i"`))
+in (istring.text.endsWith(`"`))
+{
+    import std.experimental.lexer : LexerRange;
+
+    enum State
+    {
+        plain,
+        escape,
+        dollar,
+        identifier,
+        expression
+    }
+
+    auto indexOffset = istring.index;
+
+    auto bytes = cast(const(ubyte)[]) istring.text[0 .. $ - 1]; // remove trailing `"`
+    auto range = LexerRange(bytes, 0, istring.column, istring.line);
+    // skip `i"`
+    range.popFront();
+    range.popFront();
+
+    void popFrontWhitespaceAware()
+    {
+        switch (range.bytes[range.index])
+        {
+        case '\r':
+            range.popFront();
+            if (!(range.index >= range.bytes.length) && range.bytes[range.index] == '\n')
+            {
+                range.popFront();
+                range.incrementLine();
+            }
+            else
+                range.incrementLine();
+            return;
+        case '\n':
+            range.popFront();
+            range.incrementLine();
+            return;
+        case 0xe2:
+            auto lookahead = range.peek(3);
+            if (lookahead.length == 3 && lookahead[1] == 0x80
+                && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
+            {
+                range.index+=3;
+                range.column+=3;
+                range.incrementLine();
+                return;
+            }
+            else
+            {
+                range.popFront();
+                return;
+            }
+        default:
+            range.popFront();
+            return;
+        }
+    }
+
+    State state;
+    int depth;
+    auto ret = appender!(Token[]);
+    auto startMark = range;
+
+    while (!range.empty)
+    {
+        char c = cast(char) range.front;
+        final switch (state)
+        {
+        case State.dollar:
+            auto dollarIndex = range.index - 1;
+            bool skipConsume = false;
+            if (c == '(')
+            {
+                state = State.expression;
+                depth = 1;
+            }
+            else if (DLexer.isIdentifierSeparating(istring.text, range.index)
+                || (c >= '0' && c <= '9'))
+                goto case State.plain;
+            else
+            {
+                state = State.identifier;
+                skipConsume = true;
+            }
+
+            if (startMark.index != dollarIndex)
+            {
+                Token t;
+                t.type = tok!"stringLiteral";
+                t.text = istring.text[startMark.index .. dollarIndex];
+                t.index = startMark.index + indexOffset;
+                t.column = startMark.column;
+                t.line = startMark.line;
+                ret ~= t;
+            }
+
+            if (!skipConsume)
+                popFrontWhitespaceAware();
+            startMark = range;
+            continue;
+        case State.plain:
+            if (c == '\\')
+                state = State.escape;
+            else if (c == '$')
+                state = State.dollar;
+            else
+                state = State.plain;
+            break;
+        case State.escape:
+            state = State.plain;
+            break;
+        case State.identifier:
+            if (!DLexer.isIdentifierSeparating(istring.text, range.index))
+                break;
+            Token t;
+            t.type = tok!"identifier";
+            t.text = istring.text[startMark.index .. range.index];
+            t.index = startMark.index + indexOffset;
+            t.column = startMark.column;
+            t.line = startMark.line;
+            startMark = range;
+            ret ~= t;
+            goto case State.plain;
+        case State.expression:
+            if (c == '(')
+                depth++;
+            else if (c == ')')
+            {
+                depth--;
+                if (depth == 0)
+                {
+                    Token t;
+                    t.type = tok!"specialTokenSequence";
+                    t.text = istring.text[startMark.index .. range.index];
+                    t.index = startMark.index + indexOffset;
+                    t.column = startMark.column;
+                    t.line = startMark.line;
+                    range.popFront();
+                    startMark = range;
+                    ret ~= t;
+                    state = State.plain;
+                    continue;
+                }
+            }
+            break;
+        }
+
+        popFrontWhitespaceAware();
+    }
+
+    final switch (state)
+    {
+    case State.dollar:
+    case State.plain:
+    case State.escape:
+        if (startMark.index == bytes.length)
+            break;
+        Token t;
+        t.type = tok!"stringLiteral";
+        t.text = istring.text[startMark.index .. bytes.length];
+        t.index = startMark.index + indexOffset;
+        t.column = startMark.column;
+        t.line = startMark.line;
+        ret ~= t;
+        break;
+    case State.identifier:
+        Token t;
+        t.type = tok!"identifier";
+        t.text = istring.text[startMark.index .. bytes.length];
+        t.index = startMark.index + indexOffset;
+        t.column = startMark.column;
+        t.line = startMark.line;
+        ret ~= t;
+        break;
+    case State.expression:
+        Token t;
+        t.type = tok!"specialTokenSequence";
+        t.text = istring.text[startMark.index .. bytes.length];
+        t.index = startMark.index + indexOffset;
+        t.column = startMark.column;
+        t.line = startMark.line;
+        ret ~= t;
+        break;
+    }
+
+    return ret.data;
+}
+
+///
+unittest
+{
+    import std.conv;
+
+    Token input;
+    input.text = `i"hello $name $(something + 2)"`;
+    input.line = 5;
+    input.column = 10;
+    input.index = 100;
+    auto tokens = tokenizeIString(input);
+
+    auto all = "tokens:\n" ~ tokens.to!(string[]).join("\n");
+
+    assert(tokens.length == 4, all);
+
+    assert(tokens[0].type == tok!"stringLiteral", all);
+    assert(tokens[0].index == 102, all);
+    assert(tokens[0].column == 12, all);
+    assert(tokens[0].line == 5, all);
+    assert(tokens[0].text == "hello ", all);
+
+    assert(tokens[1].type == tok!"identifier", all);
+    assert(tokens[1].index == 109, all);
+    assert(tokens[1].column == 19, all);
+    assert(tokens[1].line == 5, all);
+    assert(tokens[1].text == "name", all);
+
+    assert(tokens[2].type == tok!"stringLiteral", all);
+    assert(tokens[2].index == 113, all);
+    assert(tokens[2].column == 23, all);
+    assert(tokens[2].line == 5, all);
+    assert(tokens[2].text == " ", all);
+
+    assert(tokens[3].type == tok!"specialTokenSequence", all);
+    assert(tokens[3].index == 116, all);
+    assert(tokens[3].column == 26, all);
+    assert(tokens[3].line == 5, all);
+    assert(tokens[3].text == "something + 2", all);
+}
+
+unittest
+{
+    auto test(string content)
+    {
+        Token input;
+        input.text = content;
+        input.line = 1;
+        input.column = 1;
+        input.index = 0;
+        auto tokens = tokenizeIString(input);
+        char[] ret = new char[content.length];
+        ret[] = ' ';
+        foreach (t; tokens)
+        {
+            assert(t.text.length);
+            ret[t.index .. t.index + t.text.length] =
+                t.type == tok!"identifier" ? 'i' :
+                t.type == tok!"stringLiteral" ? '.' :
+                t.type == tok!"specialTokenSequence" ? '*' : '?';
+        }
+        return ret;
+    }
+
+    // dfmt off
+
+    assert(test(`i"$name"`)
+        ==      `   iiii `);
+
+    assert(test(`i"plain"`)
+        ==      `  ..... `);
+
+    assert(test(`i"$(expression)"`)
+        ==      `    **********  `);
+
+    assert(test(`i"$(expression"`)
+        ==      `    ********** `);
+
+    assert(test(`i"$name "`)
+        ==      `   iiii. `);
+
+    assert(test(`i"$ plain"`)
+        ==      `  ....... `);
+
+    assert(test(`i"$0 plain"`)
+        ==      `  ........ `);
+
+    assert(test(`i"$$0 plain"`)
+        ==      `  ......... `);
+
+    assert(test(`i"$.1 plain"`)
+        ==      `  ......... `);
+
+    assert(test(`i"I have $$money"`)
+        ==      `  ........ iiiii `);
+
+    // dfmt on
+}
diff --git a/src/dparse/lexer.d b/src/dparse/lexer.d
index 325f533f..f94c83b0 100644
--- a/src/dparse/lexer.d
+++ b/src/dparse/lexer.d
@@ -47,7 +47,7 @@ private immutable dynamicTokens = [
     "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
     "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
     "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
-    "dstringLiteral", "stringLiteral", "wstringLiteral"
+    "dstringLiteral", "stringLiteral", "wstringLiteral", "istringLiteral",
 ];
 
 private immutable pseudoTokenHandlers = [
@@ -68,6 +68,7 @@ private immutable pseudoTokenHandlers = [
     "7", "lexDecimal",
     "8", "lexDecimal",
     "9", "lexDecimal",
+    "i\"", "lexInterpolatedString",
     "q\"", "lexDelimitedString",
     "q{", "lexTokenString",
     "r\"", "lexWysiwygString",
@@ -1381,6 +1382,59 @@ private pure nothrow @safe:
         }
     }
 
+    /// Lexes an interpolated string literal (`i"..."`), embedded `$(...)`
+    /// expressions included, into a single `istringLiteral` token.
+    void lexInterpolatedString(ref Token token)
+    {
+        mixin (tokenStart);
+        range.popFront();
+        range.popFront();
+
+        // True only while the byte consumed immediately before the current
+        // one was an unescaped `$`; a `(` opens an expression only then.
+        bool lastDollar = false;
+        while (true)
+        {
+            if (range.index >= range.bytes.length)
+            {
+                error(token, "Error: unterminated interpolated string literal");
+                return;
+            }
+
+            immutable afterDollar = lastDollar;
+            lastDollar = false;
+
+            if (afterDollar && range.front == '(')
+            {
+                range.popFront();
+                tokenStringImpl(tok!"(", tok!")");
+            }
+            else if (range.front == '"')
+            {
+                range.popFront();
+                break;
+            }
+            else if (range.front == '\\')
+            {
+                if (!lexEscapeSequence())
+                {
+                    token = Token.init;
+                    return;
+                }
+            }
+            else
+            {
+                lastDollar = range.front == '$';
+                popFrontWhitespaceAware();
+            }
+        }
+
+        IdType charType;
+        lexStringSuffix(charType);
+        token = Token(tok!"istringLiteral", cache.intern(range.slice(mark)), line,
+            column, index);
+    }
+
     void lexNormalDelimitedString(ref Token token, size_t mark, size_t line, size_t column,
         size_t index, ubyte open, ubyte close)
     {
@@ -1458,15 +1512,14 @@ private pure nothrow @safe:
         token = Token(type, cache.intern(range.slice(mark)), line, column, index);
     }
 
-    void lexTokenString(ref Token token)
+    /// Pops from the source input range, nesting at tokens with the
+    /// type of `open` / `close`. Once a token with `close` type is reached,
+    /// eats that token and terminates, or when end of input is reached.
+    /// Assumes one opening token was already read beforehand and appended to
+    /// the string, e.g. `q{`.
+    /// Returns: false on unterminated string.
+    bool tokenStringImpl(IdType open, IdType close)
     {
-        mixin (tokenStart);
-        assert (range.bytes[range.index] == 'q');
-        range.popFront();
-        assert (range.bytes[range.index] == '{');
-        range.popFront();
-        auto app = appender!string();
-        app.put("q{");
         int depth = 1;
 
         immutable WhitespaceBehavior oldWhitespace = config.whitespaceBehavior;
@@ -1482,25 +1535,18 @@ private pure nothrow @safe:
         advance(_front);
 
         if (range.index >= range.bytes.length)
-        {
-            error(token, "Error: unterminated token string literal");
-            return;
-        }
+            return false;
 
         while (depth > 0 && !empty)
         {
             auto t = front();
-            if (t.text is null)
-                app.put(str(t.type));
-            else
-                app.put(t.text);
-            if (t.type == tok!"}")
+            if (t.type == close)
             {
                 depth--;
                 if (depth > 0)
                     popFront();
             }
-            else if (t.type == tok!"{")
+            else if (t.type == open)
             {
                 depth++;
                 popFront();
@@ -1508,11 +1554,27 @@ private pure nothrow @safe:
             else
                 popFront();
         }
+
+        return depth == 0;
+    }
+
+    void lexTokenString(ref Token token)
+    {
+        mixin (tokenStart);
+        assert (range.bytes[range.index] == 'q');
+        range.popFront();
+        assert (range.bytes[range.index] == '{');
+        range.popFront();
+
+        if (!tokenStringImpl(tok!"{", tok!"}"))
+        {
+            error(token, "Error: unterminated token string literal");
+            return;
+        }
+
         IdType type = tok!"stringLiteral";
         auto b = lexStringSuffix(type);
-        if (b != 0)
-            app.put(b);
-        token = Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
+        token = Token(type, cache.intern(range.slice(mark)), line,
             column, index);
     }
 
@@ -1583,6 +1645,7 @@ private pure nothrow @safe:
         case '\'':
         case '"':
         case '?':
+        case '$':
         case '\\':
         case 'a':
         case 'b':
@@ -1816,16 +1879,19 @@ private pure nothrow @safe:
             && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
     }
 
-    bool isSeparating(size_t offset) @nogc
+    /// Returns: `true` if the byte at `code[offset]` separates identifiers.
+    /// Offsets at or past the end of `code` count as separating; bytes the
+    /// table marks as "maybe" separate only when they begin U+2028 or U+2029.
+    public static bool isIdentifierSeparating(scope const(char)[] code, size_t offset) @nogc
     {
         enum : ubyte
         {
             n, y, m // no, yes, maybe
         }
 
-        if (range.index + offset >= range.bytes.length)
+        if (offset >= code.length)
             return true;
-        auto c = range.bytes[range.index + offset];
+        auto c = code[offset];
         static immutable ubyte[256] LOOKUP_TABLE = [
             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
             y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y,
@@ -1851,15 +1917,23 @@ private pure nothrow @safe:
             return true;
         if (result == m)
         {
-            auto r = range;
-            range.popFrontN(offset);
-            return (r.canPeek(2) && (r.peek(2) == "\u2028"
-                || r.peek(2) == "\u2029"));
+            // "\u2028" and "\u2029" are three bytes long in UTF-8, so the
+            // slice compared against them must be three bytes long as well.
+            return (offset + 3 <= code.length) && (
+                code[offset .. offset + 3] == "\u2028" ||
+                code[offset .. offset + 3] == "\u2029"
+            );
         }
         assert (false);
     }
 
-
+    bool isSeparating(size_t offset) @nogc
+    {
+        return isIdentifierSeparating(
+            cast(const(char)[]) range.bytes,
+            range.index + offset
+        );
+    }
 
     enum tokenStart = q{
         size_t index = range.index;
@@ -2531,3 +2605,41 @@ void main() {
     checkInvalidTrailingString(getTokensForParser(`x = q"foo`, cf, &ca));
     checkInvalidTrailingString(getTokensForParser("x = '", cf, &ca));
 }
+
+unittest
+{
+    void checkExactlyOneToken(string srcAndDst)
+    {
+        import std.conv;
+
+        LexerConfig cf;
+        StringCache ca = StringCache(16);
+
+        auto l = DLexer(srcAndDst, cf, &ca);
+        assert(l.front().type == tok!"istringLiteral", str(l.front.type));
+        assert(l.front().text == srcAndDst, l.front.text);
+        l.popFront();
+        assert(l.messages.empty);
+        assert(l.empty);
+    }
+
+    checkExactlyOneToken(`i"hello"`);
+    checkExactlyOneToken(`i"$(i"world")"`);
+    checkExactlyOneToken(`i"$(i")")"`);
+    checkExactlyOneToken(`i"$(i"(")"`);
+    checkExactlyOneToken(`i"$(i"\"")"`);
+    checkExactlyOneToken(`i"\\$(i"\"")"`);
+    checkExactlyOneToken(`i"$$(i"\"")"`);
+    checkExactlyOneToken(`i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"$(i"")")")")")")")")")")")")")")")")")")")")")")")")"`);
+    checkExactlyOneToken(`i"$(i"i")$("i")"`);
+
+    // `(` starts an embedded expression only directly after an unescaped `$`
+    checkExactlyOneToken(`i"$a ("`);
+    checkExactlyOneToken(`i"$ ("`);
+    checkExactlyOneToken(`i"$\n("`);
+
+    // U+2028 / U+2029 are three UTF-8 bytes each and separate identifiers
+    assert(DLexer.isIdentifierSeparating("a\u2028b", 1));
+    assert(DLexer.isIdentifierSeparating("a\u2029b", 1));
+    assert(!DLexer.isIdentifierSeparating("ab", 1));
+}
diff --git a/src/dparse/parser.d b/src/dparse/parser.d
index 572dd6d8..927fdb99 100644
--- a/src/dparse/parser.d
+++ b/src/dparse/parser.d
@@ -4489,6 +4489,23 @@ class Parser
         return node;
     }
 
+    /**
+     * Parses an InterpolatedStringLiteral
+     *
+     * $(GRAMMAR $(RULEDEF InterpolatedStringLiteral):
+     *     $(LITERAL istringLiteral)
+     *     ;)
+     */
+    InterpolatedStringLiteral parseInterpolatedStringLiteral()
+    {
+        mixin(traceEnterAndExit!(__FUNCTION__));
+        auto startIndex = index;
+        auto node = allocator.make!InterpolatedStringLiteral;
+        expect(tok!"istringLiteral");
+        node.tokens = tokens[startIndex .. index];
+        return node;
+    }
+
     /**
      * Parses an InExpression
      *
@@ -5801,6 +5818,7 @@ class Parser
      *     | $(LITERAL FloatLiteral)
      *     | $(LITERAL StringLiteral)+
      *     | $(LITERAL CharacterLiteral)
+     *     | $(LITERAL InterpolatedStringLiteral)
      *     ;)
      */
     PrimaryExpression parsePrimaryExpression()
@@ -5919,6 +5937,9 @@ class Parser
         case tok!"import":
             mixin(parseNodeQ!(`node.importExpression`, `ImportExpression`));
             break;
+        case tok!"istringLiteral":
+            mixin(parseNodeQ!(`node.interpolatedStringLiteral`, `InterpolatedStringLiteral`));
+            break;
         case tok!"this":
         case tok!"super":
         foreach (L; Literals) { case L: }