diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 5e073ac..b52ab78 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -26,7 +26,7 @@ jobs:
- os: macOS-latest
arch: x86
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.version }}
@@ -41,9 +41,13 @@ jobs:
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
+ - uses: actions/cache@v4
+ with:
+ path: test/data/w3c
+ key: w3c-xmlconf-v20130923
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- - uses: codecov/codecov-action@v1
+ - uses: codecov/codecov-action@v5
with:
- file: lcov.info
+ files: lcov.info
diff --git a/.gitignore b/.gitignore
index b000475..929dfc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
*Manifest.toml
-*generated_xsd.jl
-*.xml
*.gz
+*.tar
*.DS_Store
+*.claude
+test/data/w3c/
+benchmarks/data/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..13d6e29
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,158 @@
+# Changelog
+
+All notable changes to XML.jl will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+- New streaming tokenizer (`XMLTokenizer` module) for fine-grained XML token iteration.
+- XPath support via `xpath(node, path)`.
+- `test/test_libxml2_testcases.jl`: 243 test cases borrowed from the [libxml2](https://github.com/GNOME/libxml2) test suite covering CDATA, comments, processing instructions, attributes, namespaces, DTD internal subsets, entity references, whitespace handling, Unicode, error cases, and real-world document patterns.
+- `AbstractTrees` package extension: loading both `XML` and `AbstractTrees` enables `print_tree`, `PreOrderDFS`, `Leaves`, etc. on `Node` and `LazyNode`.
+
+### Fixed
+- **Tokenizer: multi-byte UTF-8 in attribute values** — Parsing attribute values containing multi-byte UTF-8 characters (e.g., `é`) could produce a `StringIndexError` because `attr_value()` used byte arithmetic (`ncodeunits - 1`) instead of `prevind` to strip quotes. The same issue existed in `_read_attr_value!`.
+- **Tokenizer: quotes inside DTD comments** — A `"` or `'` character inside a `<!-- ... -->` comment within a DTD internal subset caused the tokenizer to misinterpret it as a quoted string delimiter, leading to an "Unterminated quoted string" error. The DOCTYPE body parser now correctly skips comment content.
+
+## [0.3.8]
+
+### Fixed
+- `XML.write` now respects `xml:space="preserve"` and suppresses indentation for elements with this attribute ([#49]).
+
+## [0.3.7]
+
+### Fixed
+- Resolved remaining issues from [#45] and fixed [#46] (whitespace preservation edge cases) ([#47]).
+
+## [0.3.6]
+
+### Added
+- `XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation ([#45]).
+
+### Fixed
+- `String` type ambiguity on Julia nightly resolved ([#38]).
+
+## [0.3.5]
+
+### Fixed
+- `depth` and `parent` functions corrected to work properly with the DOM tree API ([#37]).
+- `escape` updated to no longer be idempotent — every `&` is now escaped, matching spec behavior ([#32], addressing [#31]).
+- `pushfirst!` support added for `Node` children ([#29]).
+
+## [0.3.4]
+
+### Fixed
+- Fixed [#26].
+- CI updated to use `julia-actions/cache@v4` and `lts` Julia version.
+
+## [0.3.3]
+
+### Added
+- `h` constructor for concise element creation (e.g., `h.div("hello"; class="main")`).
+
+### Fixed
+- Path definition error in README example ([#20]).
+
+## [0.3.2]
+
+### Fixed
+- Minor typos.
+
+## [0.3.1]
+
+### Added
+- Julia 1.6 compatibility ([#16]).
+
+### Changed
+- Smarter escaping logic.
+
+## [0.3.0]
+
+### Changed
+- Attribute internal representation changed from `Dict` to `OrderedDict` (later reverted to `Vector{Pair}`).
+
+## [0.2.3]
+
+### Fixed
+- Parse method fix.
+
+## [0.2.2]
+
+### Added
+- DTD parsing via `parse_dtd`.
+- `is_simple` and `simple_value` exports.
+- `setindex!` methods for modifying attributes.
+- `unescape` function.
+
+### Fixed
+- DOCTYPE parsing made case-insensitive.
+
+## [0.2.1]
+
+### Fixed
+- Write output fixes.
+
+## [0.2.0]
+
+### Changed
+- Major rewrite: introduced `NodeType` enum, `Node{S}` parametric struct, callable `NodeType` constructors, and `XML.write`.
+- Processing instruction support.
+- Benchmarks added.
+
+## [0.1.3]
+
+### Changed
+- Improved print output for `AbstractXMLNode`.
+
+## [0.1.2]
+
+### Added
+- AbstractTrees 0.4 compatibility ([#5]).
+
+## [0.1.1]
+
+### Added
+- `Node` implementation with `print_tree`.
+- Color output in REPL display.
+- Stopped stripping whitespace from text nodes.
+
+## [0.1.0]
+
+- Initial release.
+
+[Unreleased]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.8...HEAD
+[0.3.8]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.7...v0.3.8
+[0.3.7]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.6...v0.3.7
+[0.3.6]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.5...v0.3.6
+[0.3.5]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.4...v0.3.5
+[0.3.4]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.3...v0.3.4
+[0.3.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.2...v0.3.3
+[0.3.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.1...v0.3.2
+[0.3.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.0...v0.3.1
+[0.3.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.3...v0.3.0
+[0.2.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.2...v0.2.3
+[0.2.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.1...v0.2.2
+[0.2.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.0...v0.2.1
+[0.2.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.3...v0.2.0
+[0.1.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.2...v0.1.3
+[0.1.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.1...v0.1.2
+[0.1.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.0...v0.1.1
+[0.1.0]: https://github.com/JuliaComputing/XML.jl/releases/tag/v0.1.0
+
+[#5]: https://github.com/JuliaComputing/XML.jl/pull/5
+[#16]: https://github.com/JuliaComputing/XML.jl/pull/16
+[#20]: https://github.com/JuliaComputing/XML.jl/pull/20
+[#26]: https://github.com/JuliaComputing/XML.jl/issues/26
+[#29]: https://github.com/JuliaComputing/XML.jl/pull/29
+[#31]: https://github.com/JuliaComputing/XML.jl/issues/31
+[#32]: https://github.com/JuliaComputing/XML.jl/pull/32
+[#37]: https://github.com/JuliaComputing/XML.jl/pull/37
+[#38]: https://github.com/JuliaComputing/XML.jl/pull/38
+[#43]: https://github.com/JuliaComputing/XML.jl/issues/43
+[#45]: https://github.com/JuliaComputing/XML.jl/pull/45
+[#46]: https://github.com/JuliaComputing/XML.jl/issues/46
+[#47]: https://github.com/JuliaComputing/XML.jl/pull/47
+[#49]: https://github.com/JuliaComputing/XML.jl/pull/49
diff --git a/Project.toml b/Project.toml
index 49b96c0..a42a821 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,12 +1,14 @@
name = "XML"
uuid = "72c71f33-b9b6-44de-8c94-c961784809e2"
+version = "0.4.0"
authors = ["Josh Day and contributors"]
-version = "0.3.8"
-[deps]
-Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
-OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+[weakdeps]
+AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
+
+[extensions]
+XMLAbstractTreesExt = "AbstractTrees"
[compat]
-OrderedCollections = "1.4, 1.5"
-julia = "1.6"
+AbstractTrees = "0.4"
+julia = "1.9"
diff --git a/README.md b/README.md
index ddb1156..067c06d 100644
--- a/README.md
+++ b/README.md
@@ -4,39 +4,8 @@
Read and write XML in pure Julia.
-
-
-# Introduction
-
-This package offers fast data structures for reading and writing XML files with a consistent interface:
-
-
-
-### `Node`/`LazyNode` Interface:
-
-```
-nodetype(node) → XML.NodeType (an enum type)
-tag(node) → String or Nothing
-attributes(node) → OrderedDict{String, String} or Nothing
-value(node) → String or Nothing
-children(node) → Vector{typeof(node)}
-is_simple(node) → Bool (whether node is simple .e.g. item )
-simple_value(node) → e.g. "item" from item )
-```
-
-### Extended Interface for `LazyNode`
-
-```
-depth(node) → Int
-next(node) → typeof(node)
-prev(node) → typeof(node)
-parent(node) → typeof(node)
-```
-
-
-
# Quickstart
```julia
@@ -58,79 +27,76 @@ doc[end][2] # Second child of root
# Node Element (6 children)
```
-
-
-# Data Structures that Represent XML Nodes
+
-## Preliminary: `NodeType`
+# `Node` Interface
-- Each item in an XML DOM is classified by its `NodeType`.
-- Every `XML.jl` struct defines a `nodetype(x)` method that returns its `NodeType`.
+Every node in the XML DOM is represented by `Node`, a single type parametrized on its string storage.
-| NodeType | XML Representation | `Node` Constructor |
-|----------|--------------------|------------------|
-| `Document` | An entire document | `Document(children...)`
-| `DTD` | `` | `DTD(...) `
-| `Declaration` | `` | `Declaration(; attrs...)`
-| `ProcessingInstruction` | `` | `ProcessingInstruction(tag; attrs...)`
-| `Comment` | `` | `Comment(text)`
-| `CData` | `` | `CData(text)`
-| `Element` | ` children... ` | `Element(tag, children...; attrs...)`
-| `Text` | the `text` part of `text ` | `Text(text)`
+```
+nodetype(node) -> XML.NodeType (an enum)
+tag(node) -> String or Nothing
+attributes(node) -> XML.Attributes{String} or Nothing
+value(node) -> String or Nothing
+children(node) -> Vector{Node}
+is_simple(node) -> Bool (e.g. text )
+simple_value(node) -> e.g. "text" from text
+```
-## `Node`: Probably What You're Looking For
+## `NodeType`
-- `read`-ing a `Node` loads the entire XML DOM in memory.
-- See the table above for convenience constructors.
-- `Node`s have some additional methods that aid in construction/mutation:
+Each item in an XML DOM is classified by its `NodeType`:
-```julia
-# Add a child:
-push!(parent::Node, child::Node)
-
-# Replace a child:
-parent[2] = child
-
-# Add/change an attribute:
-node["key"] = value
+| NodeType | XML Representation | Constructor |
+|----------|--------------------|-------------|
+| `Document` | An entire document | `Document(children...)` |
+| `DTD` | `` | `DTD(...)` |
+| `Declaration` | `` | `Declaration(; attrs...)` |
+| `ProcessingInstruction` | `` | `ProcessingInstruction(tag; attrs...)` |
+| `Comment` | `` | `Comment(text)` |
+| `CData` | `` | `CData(text)` |
+| `Element` | ` children... ` | `Element(tag, children...; attrs...)` |
+| `Text` | the `text` part of `text ` | `Text(text)` |
-node["key"]
-```
+
-- `Node` is an immutable type. However, you can easily create a copy with one or more field values changed by using the `Node(::Node, children...; attrs...)` constructor where `children` are appended to the source node's children and `attrs` are appended to the node's attributes.
+## Mutation
```julia
-node = XML.Element("tag", "child")
-# Node Element (1 child)
+push!(parent, child) # Add a child
+parent[2] = child # Replace a child
+node["key"] = "value" # Add/change an attribute
+node["key"] # Get an attribute
+```
-simple_value(node)
-# "child"
+
-node2 = Node(node, "added"; id="my-id")
-# Node Element (2 children)
+## Tree Navigation
-node2.children
-# 2-element Vector{Node}:
-# Node Text "child"
-# Node Text "added"
+```julia
+depth(child, root) # Depth of child relative to root
+parent(child, root) # Parent of child within root's tree
+siblings(child, root) # Siblings of child within root's tree
```
-### Writing `Element` `Node`s with `XML.h`
+
+
+## Writing Elements with `XML.h`
Similar to [Cobweb.jl](https://github.com/JuliaComputing/Cobweb.jl#-creating-nodes-with-cobwebh), `XML.h` enables you to write elements with a simpler syntax:
```julia
using XML: h
-julia> node = h.parent(
- h.child("first child content", id="id1"),
- h.child("second child content", id="id2")
- )
+node = h.parent(
+ h.child("first child content", id="id1"),
+ h.child("second child content", id="id2")
+)
# Node Element (2 children)
-julia> print(XML.write(node))
+print(XML.write(node))
#
# first child content
# second child content
@@ -139,111 +105,228 @@ julia> print(XML.write(node))
-## `XML.LazyNode`: For Fast Iteration through an XML File
-
-A lazy data structure that just keeps track of the position in the raw data (`Vector{UInt8}`) to read from.
-
-- You can iterate over a `LazyNode` to "read" through an XML file:
-
-```julia
-doc = read(filename, LazyNode)
-
-foreach(println, doc)
-# LazyNode Declaration
-# LazyNode Element
-# LazyNode Element
-# LazyNode Element
-# LazyNode Text "Gambardella, Matthew"
-# LazyNode Element
-# ⋮
-```
-
-
-
# Reading
```julia
-# Reading from file:
+# From a file:
read(filename, Node)
-read(filename, LazyNode)
-
-# Parsing from string:
-parse(Node, str)
-parse(LazyNode, str)
+# From a string:
+parse(str, Node)
```
-
+
# Writing
```julia
XML.write(filename::String, node) # write to file
+XML.write(io::IO, node) # write to stream
+XML.write(node) # return String
+```
+
+`XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation.
+
+
-XML.write(io::IO, node) # write to stream
+# XPath
-XML.write(node) # String
+Query nodes using a subset of XPath 1.0 via `xpath(node, path)`:
+
+```julia
+doc = parse("""
+
+ hello
+ world
+
+""", Node)
+
+root = doc[end]
+
+xpath(root, "//b") # All descendants
+xpath(root, "a[@id='2']/b") # inside
+xpath(root, "a[1]") # First child
+xpath(root, "//b/text()") # Text nodes inside all s
```
+### Supported syntax
+
+| Expression | Description |
+|------------|-------------|
+| `/` | Root / path separator |
+| `tag` | Child element by name |
+| `*` | Any child element |
+| `//` | Descendant-or-self (recursive) |
+| `.` | Current node |
+| `..` | Parent node |
+| `[n]` | Positional predicate (1-based) |
+| `[@attr]` | Has-attribute predicate |
+| `[@attr='v']` | Attribute-value predicate |
+| `text()` | Text node children |
+| `node()` | All node children |
+| `@attr` | Attribute value (returns strings) |
-
+
-# Performance
+# Streaming Tokenizer
-- XML.jl performs comparatively to [EzXML.jl](https://github.com/JuliaIO/EzXML.jl), which wraps the C library [libxml2](https://gitlab.gnome.org/GNOME/libxml2/-/wikis/home).
-- See the `benchmarks/suite.jl` for the code to produce these results.
-- The following output was generated in a Julia session with the following `versioninfo`:
+For large files or when you need fine-grained control, `XML.XMLTokenizer` provides a streaming tokenizer that yields tokens without building a DOM. Token kinds live in the `XML.XMLTokenizer.TokenKinds` baremodule (e.g. `TokenKinds.OPEN_TAG`, `TokenKinds.TEXT`).
-```
-julia> versioninfo()
-Julia Version 1.9.4
-Commit 8e5136fa297 (2023-11-14 08:46 UTC)
-Build Info:
- Official https://julialang.org/ release
-Platform Info:
- OS: macOS (arm64-apple-darwin22.4.0)
- CPU: 10 × Apple M1 Pro
- WORD_SIZE: 64
- LIBM: libopenlibm
- LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1)
- Threads: 8 on 8 virtual cores
+```julia
+using XML.XMLTokenizer: tokenize
+
+for token in tokenize("text ")
+ println(token.kind, " => ", repr(String(token.raw)))
+end
+# OPEN_TAG => " ">"
+# OPEN_TAG => " "attr"
+# ATTR_VALUE => "\"val\""
+# TAG_CLOSE => ">"
+# TEXT => "text"
+# CLOSE_TAG => " ">"
+# CLOSE_TAG => " ">"
```
+
-### Reading an XML File
+# `LazyNode`
-```
- XML.LazyNode 0.009583
- XML.Node ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1071.32
- EzXML.readxml ■■■■■■■■■ 284.346
- XMLDict.xml_dict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1231.47
+For read-only access without building a full DOM tree, use `LazyNode`. It stores only a reference to the source string and re-tokenizes on demand, using significantly less memory:
+
+```julia
+doc = parse(xml_string, LazyNode)
+doc = read("file.xml", LazyNode)
```
-### Writing an XML File
+`LazyNode` supports the same read-only interface as `Node`: `nodetype`, `tag`, `attributes`, `value`, `children`, `is_simple`, `simple_value`, plus integer and string indexing.
-```
- Write: XML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 289.638
- Write: EzXML ■■■■■■■■■■■■■ 93.4631
-```
+For streaming and high-throughput workloads, several extra accessors avoid materializing intermediate collections:
-### Lazily Iterating over Each Node
-```
- LazyNode ■■■■■■■■■ 51.752
- EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 226.271
+```julia
+sourcetext(n) # zero-copy SubString view of the node's raw source bytes
+eachchildnode(n) # lazy iterator over children — no Vector allocation
+children!(buf, n) # collect children into a reusable buffer
+eachattribute(n) # lazy iterator over attribute name=>value pairs
+is_simple_value(n) # combined is_simple + simple_value (one tokenizer pass)
+get(n, key, default) # single-attribute read without building Attributes
+XML.write(n) # zero-copy: returns node's original source text
+XML.write(n; normalize=true) # re-parse + pretty-print, collapses source whitespace
```
-### Collecting All Names/Tags in an XML File
-```
- XML.LazyNode ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 210.482
- EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 276.238
- EzXML.readxml ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 263.269
+### Memory-mapped files
+
+For very large files, combine `LazyNode` with memory mapping to avoid reading the entire file into heap memory:
+
+```julia
+using XML, Mmap, StringViews
+
+doc = open("very_large.xml") do io
+ sv = StringView(Mmap.mmap(io))
+ parse(sv, LazyNode)
+end
```
+
+# AbstractTrees Integration
+
+Loading [`AbstractTrees`](https://github.com/JuliaCollections/AbstractTrees.jl) alongside XML enables tree-walking utilities (`print_tree`, `PreOrderDFS`, `Leaves`, etc.) on both `Node` and `LazyNode`:
+
+```julia
+using XML, AbstractTrees
+
+doc = parse(" ", Node)
+print_tree(doc)
+# Document
+# └─
+# ├─
+# └─
+# └─
+
+for n in PreOrderDFS(doc)
+ nodetype(n) == Element && println(tag(n))
+end
+```
+
-# Possible Gotchas
+# Benchmarks
+
+Benchmark source: [benchmarks.jl](benchmarks/benchmarks.jl). Test data: `books.xml` (small, ~4 KB) and a generated XMark auction XML (medium, ~14 MB).
+
+
+
+```
+ Parse (small) — median time (ms)
+
+ XML.jl ■■■■■■■ 0.0374
+ XML.jl (SS) ■■■■■■■ 0.0339
+ EzXML ■■■■ 0.0218
+ LightXML ■■■■ 0.0218
+ XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.200
+
+
+ Parse (medium) — median time (ms)
+
+ XML.jl ■■■■■■■■■■■■■■ 185.0
+ XML.jl (SS) ■■■■■■■■■■■■■ 168.0
+ EzXML ■■■■■■ 81.5
+ LightXML ■■■■■■■■ 107.0
+ XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 520.0
+
+
+ Write (small) — median time (ms)
+
+ XML.jl ■■■■ 0.00929
+ EzXML ■■■■ 0.0103
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.101
+
+
+ Write (medium) — median time (ms)
+
+ XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 48.0
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 52.6
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 56.1
+
-- XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, and `'` ) for you. However, we provide utility functions for doing the conversions back and forth:
- - `XML.escape(::String)` and `XML.unescape(::String)`
- - `XML.escape!(::Node)` and `XML.unescape!(::Node)`.
+ Read file — median time (ms)
+
+ XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 193.0
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■ 121.0
+ LightXML ■■■■■■■■■■■■■■■■■■■■ 95.6
+
+
+ Collect tags (small) — median time (ms)
+
+ XML.jl ■■■■■■ 0.000586
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■ 0.00205
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.00368
+
+
+ Collect tags (medium) — median time (ms)
+
+ XML.jl ■■■■■■■■■■■■■■■■■■ 13.1
+ EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 29.4
+ LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 23.2
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.6
+# Commit 15346901f00 (2026-04-09 19:20 UTC)
+# Build Info:
+# Official https://julialang.org release
+# Platform Info:
+# OS: macOS (arm64-apple-darwin24.0.0)
+# CPU: 10 × Apple M1 Pro
+# WORD_SIZE: 64
+# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+# GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+# JULIA_NUM_THREADS = auto
+```
diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml
index ed90996..043988c 100644
--- a/benchmarks/Project.toml
+++ b/benchmarks/Project.toml
@@ -2,7 +2,8 @@
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
-OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
XML = "72c71f33-b9b6-44de-8c94-c961784809e2"
XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"
diff --git a/benchmarks/XMarkGenerator.jl b/benchmarks/XMarkGenerator.jl
new file mode 100644
index 0000000..7f780a0
--- /dev/null
+++ b/benchmarks/XMarkGenerator.jl
@@ -0,0 +1,377 @@
+"""
+ XMarkGenerator
+
+XMark-inspired XML benchmark data generator. Produces well-formed XML documents modeling an
+internet auction site, following the XMark benchmark DTD structure.
+
+    include("XMarkGenerator.jl")
+ using .XMarkGenerator
+
+ xml = generate_xmark(1.0) # return String (~14 MB)
+ generate_xmark("out.xml", 5.0) # write to file (~68 MB)
+ generate_xmark(stdout, 0.1; seed=123) # write to IO (~1.4 MB)
+"""
+module XMarkGenerator
+
+using Random
+
+export generate_xmark
+
+#-----------------------------------------------------------------# Word lists
+const WORDS = [
+ "about", "above", "across", "after", "again", "against", "along", "already", "also",
+ "always", "among", "another", "answer", "around", "asked", "away", "back", "because",
+ "become", "been", "before", "began", "behind", "being", "below", "between", "body",
+ "book", "both", "brought", "build", "built", "business", "came", "cannot", "carry",
+ "cause", "certain", "change", "children", "city", "close", "come", "complete", "could",
+ "country", "course", "cover", "current", "dark", "days", "deep", "development",
+ "different", "direction", "does", "done", "door", "down", "draw", "during", "each",
+ "early", "earth", "east", "education", "effort", "eight", "either", "else", "end",
+ "enough", "even", "every", "example", "experience", "face", "fact", "family", "feel",
+ "field", "find", "first", "five", "follow", "food", "force", "form", "found", "four",
+ "from", "full", "gave", "general", "give", "going", "gone", "good", "government",
+ "great", "green", "ground", "group", "grow", "half", "hand", "happen", "hard", "have",
+ "head", "help", "here", "high", "himself", "hold", "home", "hope", "house", "however",
+ "hundred", "idea", "important", "inch", "include", "increase", "island", "just", "keep",
+ "kind", "knew", "know", "land", "large", "last", "later", "learn", "left", "less",
+ "letter", "life", "light", "like", "line", "list", "little", "live", "long", "look",
+ "lost", "made", "main", "make", "many", "mark", "matter", "mean", "might", "mind",
+ "miss", "money", "morning", "most", "mother", "move", "much", "music", "must", "name",
+ "near", "need", "never", "next", "night", "nothing", "notice", "number", "often",
+ "once", "only", "open", "order", "other", "over", "page", "paper", "part", "past",
+ "pattern", "people", "perhaps", "period", "person", "picture", "place", "plan", "plant",
+ "play", "point", "position", "possible", "power", "present", "problem", "produce",
+ "product", "program", "public", "pull", "purpose", "question", "quite", "reach", "read",
+ "real", "receive", "record", "remember", "rest", "result", "right", "river", "room",
+ "round", "rule", "same", "school", "second", "seem", "sentence", "service", "seven",
+ "several", "shall", "short", "should", "show", "side", "since", "sing", "size", "small",
+ "social", "some", "song", "soon", "south", "space", "stand", "start", "state", "still",
+ "stood", "story", "strong", "study", "such", "sure", "system", "table", "take", "tell",
+ "test", "their", "them", "then", "there", "these", "thing", "think", "those", "thought",
+ "three", "through", "time", "together", "took", "toward", "travel", "tree", "true",
+ "turn", "under", "unit", "until", "upon", "usually", "value", "very", "voice", "walk",
+ "want", "watch", "water", "well", "went", "were", "west", "what", "where", "which",
+ "while", "white", "whole", "will", "with", "without", "woman", "word", "work", "world",
+ "would", "write", "year", "young",
+]
+const FIRST_NAMES = ["James", "John", "Robert", "Michael", "William", "David", "Richard",
+ "Joseph", "Thomas", "Charles", "Mary", "Patricia", "Jennifer", "Linda", "Barbara",
+ "Elizabeth", "Susan", "Jessica", "Sarah", "Karen"]
+const LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller",
+ "Davis", "Rodriguez", "Martinez", "Wilson", "Anderson", "Taylor", "Thomas", "Hernandez",
+ "Moore", "Martin", "Jackson", "Thompson", "White"]
+const COUNTRIES = ["United States", "Germany", "France", "Japan", "Australia", "Brazil",
+ "Canada", "India", "China", "Mexico", "Argentina", "Spain", "Italy", "United Kingdom",
+ "Netherlands", "Sweden", "Norway", "Finland", "Denmark", "Belgium"]
+const CITIES = ["New York", "London", "Paris", "Tokyo", "Sydney", "Berlin", "Rome",
+ "Madrid", "Amsterdam", "Toronto", "Moscow", "Beijing", "Seoul", "Mumbai", "Cairo",
+ "Dublin", "Prague", "Vienna", "Warsaw", "Budapest"]
+const STREETS = ["Main", "Oak", "Elm", "Maple", "Pine", "Cedar", "Birch", "Walnut",
+ "Cherry", "Ash", "Spruce", "Willow", "Poplar", "Laurel", "Juniper"]
+const EDUCATIONS = ["High School", "College", "Graduate", "Associate", "Master", "Doctorate"]
+const GENDERS = ["male", "female"]
+const PAYMENTS = ["Creditcard", "Money order", "Personal check", "Cash"]
+const SHIPPING = ["Will ship only within country", "Will ship internationally",
+ "Buyer pays fixed shipping costs", "Free shipping", "See description for shipping"]
+const REGIONS = ["africa", "asia", "australia", "europe", "namerica", "samerica"]
+
+#-----------------------------------------------------------------# Random data helpers
+rand_word(rng) = rand(rng, WORDS)
+rand_date(rng) = string(rand(rng, 1999:2025), "/", lpad(rand(rng, 1:12), 2, '0'), "/", lpad(rand(rng, 1:28), 2, '0'))
+rand_time(rng) = string(lpad(rand(rng, 0:23), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'))
+rand_price(rng) = string(rand(rng, 1:9999), ".", lpad(rand(rng, 0:99), 2, '0'))
+rand_phone(rng) = string("+", rand(rng, 1:99), " (", rand(rng, 100:999), ") ", rand(rng, 1000000:9999999))
+rand_zip(rng) = string(lpad(rand(rng, 0:99999), 5, '0'))
+rand_cc(rng) = join(rand(rng, 1000:9999, 4), " ")
+rand_email(rng) = string(lowercase(rand(rng, FIRST_NAMES)), rand(rng, 1:999), "@", lowercase(rand(rng, LAST_NAMES)), ".com")
+
+#-----------------------------------------------------------------# XML writing helpers
+function xml_escape_char(io::IO, c::Char)
+    if c == '&'; print(io, "&amp;")
+    elseif c == '<'; print(io, "&lt;")
+    elseif c == '>'; print(io, "&gt;")
+    elseif c == '"'; print(io, "&quot;")
+    else; print(io, c)
+    end
+end
+
+function write_escaped(io::IO, s::AbstractString)
+ for c in s
+ xml_escape_char(io, c)
+ end
+end
+
+function write_text_content(rng, io; min_words=10, max_words=50)
+ n = rand(rng, min_words:max_words)
+ for i in 1:n
+ i > 1 && print(io, ' ')
+ w = rand_word(rng)
+ r = rand(rng)
+        if r < 0.03
+            print(io, "<bold>", w, "</bold> ")
+        elseif r < 0.06
+            print(io, "<keyword>", w, "</keyword> ")
+        elseif r < 0.08
+            print(io, "<emph>", w, "</emph> ")
+        else
+            print(io, w)
+        end
+ end
+end
+
+function write_description(rng, io, indent)
+ println(io, indent, "")
+ if rand(rng) < 0.7
+ print(io, indent, " ")
+ write_text_content(rng, io; min_words=15, max_words=80)
+ println(io, " ")
+ else
+ println(io, indent, " ")
+ for _ in 1:rand(rng, 2:6)
+ print(io, indent, " ")
+ write_text_content(rng, io; min_words=8, max_words=40)
+ println(io, " ")
+ end
+ println(io, indent, " ")
+ end
+ println(io, indent, " ")
+end
+
+function write_annotation(rng, io, indent, n_people)
+ println(io, indent, "")
+ println(io, indent, " ")
+ write_description(rng, io, string(indent, " "))
+ println(io, indent, " ", rand(rng, 1:10), " ")
+ println(io, indent, " ")
+end
+
+#-----------------------------------------------------------------# Section writers
+function write_item(rng, io, id, n_categories)
+ featured = rand(rng) < 0.1 ? " featured=\"yes\"" : ""
+ println(io, " - ")
+ println(io, "
", rand(rng, CITIES), " ")
+ println(io, " ", rand(rng, 1:50), " ")
+ println(io, " ", rand_word(rng), " ", rand_word(rng), " ", rand_word(rng), " ")
+ println(io, " ", rand(rng, PAYMENTS), " ")
+ write_description(rng, io, " ")
+ println(io, " ", rand(rng, SHIPPING), " ")
+ for _ in 1:rand(rng, 1:3)
+ println(io, " ")
+ end
+ println(io, " ")
+ for _ in 1:rand(rng, 0:5)
+ println(io, " ")
+ println(io, " ", rand_email(rng), " ")
+ println(io, " ", rand_email(rng), " ")
+ println(io, " ", rand_date(rng), " ")
+ print(io, " ")
+ write_text_content(rng, io; min_words=10, max_words=60)
+ println(io, " ")
+ println(io, " ")
+ end
+ println(io, " ")
+ println(io, " ")
+end
+
+function write_categories(rng, io, n)
+ println(io, " ")
+ for i in 1:n
+ println(io, " ")
+ println(io, " ", rand_word(rng), " ", rand_word(rng), " ")
+ write_description(rng, io, " ")
+ println(io, " ")
+ end
+ println(io, " ")
+end
+
+function write_catgraph(rng, io, n_edges, n_categories)
+ println(io, " ")
+ for _ in 1:n_edges
+ from = string("category",rand(rng, 1:n_categories))
+ to = string("category",rand(rng, 1:n_categories))
+ println(io, " ")
+ end
+ println(io, " ")
+end
+
+function write_people(rng, io, n, n_categories, n_open)
+ println(io, " ")
+ for i in 1:n
+ println(io, " ")
+ println(io, " ", rand(rng, FIRST_NAMES), " ", rand(rng, LAST_NAMES), " ")
+ println(io, " ", rand_email(rng), " ")
+ if rand(rng) < 0.8
+ println(io, " ", rand_phone(rng), " ")
+ end
+ if rand(rng) < 0.7
+ println(io, " ")
+ println(io, " ", rand(rng, 1:9999), " ", rand(rng, STREETS), " St ")
+ println(io, " ", rand(rng, CITIES), " ")
+ println(io, " ", rand(rng, COUNTRIES), " ")
+ if rand(rng) < 0.5
+ println(io, " ", rand_word(rng), " ")
+ end
+ println(io, " ", rand_zip(rng), " ")
+ println(io, " ")
+ end
+ if rand(rng) < 0.5
+ println(io, " http://www.", lowercase(rand(rng, LAST_NAMES)), ".com/~",
+ lowercase(rand(rng, FIRST_NAMES)), " ")
+ end
+ if rand(rng) < 0.6
+ println(io, " ", rand_cc(rng), " ")
+ end
+ if rand(rng) < 0.7
+ income = rand(rng) < 0.8 ? string(" income=\"", rand(rng, 10000.0:0.01:250000.0), "\"") : ""
+ println(io, " ")
+ for _ in 1:rand(rng, 0:4)
+ println(io, " ")
+ end
+ if rand(rng) < 0.8
+ println(io, " ", rand(rng, EDUCATIONS), " ")
+ end
+ if rand(rng) < 0.7
+ println(io, " ", rand(rng, GENDERS), " ")
+ end
+ println(io, " ", rand_word(rng), " ")
+ if rand(rng) < 0.8
+ println(io, " ", rand(rng, 18:85), " ")
+ end
+ println(io, " ")
+ end
+ if n_open > 0 && rand(rng) < 0.3
+ println(io, " ")
+ for _ in 1:rand(rng, 1:5)
+ println(io, " ")
+ end
+ println(io, " ")
+ end
+ println(io, " ")
+ end
+ println(io, " ")
+end
+
+function write_open_auctions(rng, io, n, n_items, n_people)
+ println(io, " ")
+ for i in 1:n
+ println(io, " ")
+ println(io, " ", rand_price(rng), " ")
+ if rand(rng) < 0.5
+ println(io, " ", rand_price(rng), " ")
+ end
+ for _ in 1:rand(rng, 0:12)
+ println(io, " ")
+ println(io, " ", rand_date(rng), " ")
+ println(io, " ", rand_time(rng), " ")
+ println(io, " ")
+ println(io, " ", rand_price(rng), " ")
+ println(io, " ")
+ end
+ println(io, " ", rand_price(rng), " ")
+ if rand(rng) < 0.3
+ println(io, " ", rand(rng, ["Yes", "No"]), " ")
+ end
+ println(io, " ")
+ println(io, " ")
+ write_annotation(rng, io, " ", n_people)
+ println(io, " ", rand(rng, 1:10), " ")
+ println(io, " ", rand(rng, ["Regular", "Featured"]), " ")
+ println(io, " ")
+ println(io, " ", rand_date(rng), " ")
+ println(io, " ", rand_date(rng), " ")
+ println(io, " ")
+ println(io, " ")
+ end
+ println(io, " ")
+end
+
+function write_closed_auctions(rng, io, n, n_open, n_items, n_people)
+ println(io, " ")
+ for i in 1:n
+ println(io, " ")
+ println(io, " ")
+ println(io, " ")
+ # Use item IDs that don't overlap with open auctions
+ item_id = n_open + i
+ item_id = item_id <= n_items ? item_id : rand(rng, 1:n_items)
+ println(io, " ")
+ println(io, " ", rand_price(rng), " ")
+ println(io, " ", rand_date(rng), " ")
+ println(io, " ", rand(rng, 1:10), " ")
+ println(io, " ", rand(rng, ["Regular", "Featured"]), " ")
+ if rand(rng) < 0.7
+ write_annotation(rng, io, " ", n_people)
+ end
+ println(io, " ")
+ end
+ println(io, " ")
+end
+
+#-----------------------------------------------------------------# Main entry points
+"""
+ generate_xmark([io_or_filename], factor; seed=42)
+
+Generate an XMark-style auction XML document. `factor` scales all entity counts linearly.
+
+Approximate output sizes (may vary slightly):
+- `factor=0.1` → ~1.4 MB
+- `factor=1.0` → ~14 MB
+- `factor=2.0` → ~27 MB
+- `factor=5.0` → ~68 MB
+"""
+function generate_xmark(io::IO, factor::Real; seed::Int=42)
+ factor > 0 || throw(ArgumentError("factor must be positive, got $factor"))
+ rng = Xoshiro(seed)
+
+ n_per_region = max(1, round(Int, 500 * factor))
+ n_people = max(1, round(Int, 5000 * factor))
+ n_categories = max(1, round(Int, 200 * factor))
+ n_open = max(1, round(Int, 2000 * factor))
+ n_closed = max(1, round(Int, 1500 * factor))
+ n_edges = max(1, round(Int, 1000 * factor))
+ n_items = n_per_region * 6
+
+ # Clamp auctions to available items
+ n_open = min(n_open, n_items)
+ n_closed = min(n_closed, max(1, n_items - n_open))
+
+    println(io, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
+    println(io, "<site>")
+
+ # Regions with items
+ println(io, " ")
+ item_id = 0
+ for region in REGIONS
+ println(io, " <", region, ">")
+ for _ in 1:n_per_region
+ item_id += 1
+ write_item(rng, io, item_id, n_categories)
+ end
+        println(io, "  </", region, ">")
+ end
+ println(io, " ")
+
+ write_categories(rng, io, n_categories)
+ write_catgraph(rng, io, n_edges, n_categories)
+ write_people(rng, io, n_people, n_categories, n_open)
+ write_open_auctions(rng, io, n_open, n_items, n_people)
+ write_closed_auctions(rng, io, n_closed, n_open, n_items, n_people)
+
+    println(io, "</site>")
+ nothing
+end
+
+function generate_xmark(filename::AbstractString, factor::Real; seed::Int=42)
+ open(filename, "w") do io
+ generate_xmark(io, factor; seed)
+ end
+ filename
+end
+
+function generate_xmark(factor::Real; seed::Int=42)
+ io = IOBuffer()
+ generate_xmark(io, factor; seed)
+ String(take!(io))
+end
+
+end # module
diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
new file mode 100644
index 0000000..7bd2cb1
--- /dev/null
+++ b/benchmarks/benchmarks.jl
@@ -0,0 +1,527 @@
+using XML
+using XML: Element, nodetype, tag, children
+using EzXML: EzXML
+using XMLDict: XMLDict
+using LightXML: LightXML
+using BenchmarkTools
+using DataFrames
+using InteractiveUtils
+
+include("XMarkGenerator.jl")
+using .XMarkGenerator
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
+
+#-----------------------------------------------------------------------------# Test data
+# Small file (~120 lines)
+small_file = joinpath(@__DIR__, "..", "test", "data", "books.xml")
+small_xml = read(small_file, String)
+
+# Medium file (generated XMark auction XML, ~14 MB)
+medium_file = joinpath(@__DIR__, "data", "xmark.xml")
+if !isfile(medium_file)
+ mkpath(dirname(medium_file))
+ @info "Generating XMark benchmark XML..."
+ generate_xmark(medium_file, 1.0)
+end
+medium_xml = read(medium_file, String)
+
+df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[])
+
+# Run `@benchmark` on `expr...` (trailing `setup=`/`teardown=` arguments pass
+# through untouched) and push the resulting Trial onto the global results
+# DataFrame `df`, labelled by `kind` and `name`. Logs each run via `@info`.
+macro add_benchmark(kind, name, expr...)
+    esc(:(let
+        @info string($kind, " - ", $name)
+        bench = @benchmark $(expr...)
+        push!(df, (; kind=$kind, name=$name, bench))
+    end))
+end
+
+const SSNode = Node{SubString{String}}
+
+#-----------------------------------------------------------------------------# Parse (small)
+@add_benchmark "Parse (small)" "XML.jl" parse($small_xml, Node)
+@add_benchmark "Parse (small)" "XML.jl (SS)" parse($small_xml, SSNode)
+@add_benchmark "Parse (small)" "EzXML" EzXML.parsexml($small_xml)
+@add_benchmark "Parse (small)" "LightXML" LightXML.parse_string($small_xml)
+@add_benchmark "Parse (small)" "XMLDict" XMLDict.xml_dict($small_xml)
+
+#-----------------------------------------------------------------------------# Parse (medium)
+@add_benchmark "Parse (medium)" "XML.jl" parse($medium_xml, Node)
+@add_benchmark "Parse (medium)" "XML.jl (SS)" parse($medium_xml, SSNode)
+@add_benchmark "Parse (medium)" "EzXML" EzXML.parsexml($medium_xml)
+@add_benchmark "Parse (medium)" "LightXML" LightXML.parse_string($medium_xml)
+@add_benchmark "Parse (medium)" "XMLDict" XMLDict.xml_dict($medium_xml)
+
+#-----------------------------------------------------------------------------# Write (small)
+@add_benchmark "Write (small)" "XML.jl" XML.write(o) setup=(o = parse(small_xml, Node))
+@add_benchmark "Write (small)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(small_xml))
+@add_benchmark "Write (small)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(small_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true))
+
+#-----------------------------------------------------------------------------# Write (medium)
+@add_benchmark "Write (medium)" "XML.jl" XML.write(o) setup=(o = parse(medium_xml, Node))
+@add_benchmark "Write (medium)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(medium_xml))
+@add_benchmark "Write (medium)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(medium_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true))
+
+#-----------------------------------------------------------------------------# Read from file
+@add_benchmark "Read file" "XML.jl" read($medium_file, Node)
+@add_benchmark "Read file" "EzXML" EzXML.readxml($medium_file)
+@add_benchmark "Read file" "LightXML" LightXML.parse_file($medium_file)
+
+#-----------------------------------------------------------------------------# Collect element tags
+# Depth-first collection of every element tag name below `node` (root excluded).
+function xml_collect_tags(node)
+    acc = String[]
+    _xml_collect_tags!(acc, node)
+    return acc
+end
+function _xml_collect_tags!(acc, node)
+    for child in children(node)
+        nodetype(child) === Element || continue
+        push!(acc, tag(child))
+        _xml_collect_tags!(acc, child)
+    end
+end
+
+# Depth-first tag collection via EzXML's element iterator (same workload as
+# `xml_collect_tags`, for the cross-library comparison).
+function ezxml_collect_tags(node::EzXML.Node)
+    acc = String[]
+    _ezxml_collect_tags!(acc, node)
+    return acc
+end
+function _ezxml_collect_tags!(acc, node::EzXML.Node)
+    for el in EzXML.eachelement(node)
+        push!(acc, el.name)
+        _ezxml_collect_tags!(acc, el)
+    end
+end
+
+# Depth-first tag collection via LightXML's child-element iterator (same
+# workload as `xml_collect_tags`, for the cross-library comparison).
+function lightxml_collect_tags(root::LightXML.XMLElement)
+    acc = String[]
+    _lightxml_collect_tags!(acc, root)
+    return acc
+end
+function _lightxml_collect_tags!(acc, el::LightXML.XMLElement)
+    for child in LightXML.child_elements(el)
+        push!(acc, LightXML.name(child))
+        _lightxml_collect_tags!(acc, child)
+    end
+end
+
+@add_benchmark "Collect tags (small)" "XML.jl" xml_collect_tags(o) setup=(o = parse(small_xml, Node))
+@add_benchmark "Collect tags (small)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(small_xml))
+@add_benchmark "Collect tags (small)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(small_xml)) teardown=(LightXML.free(o))
+
+@add_benchmark "Collect tags (medium)" "XML.jl" xml_collect_tags(o) setup=(o = parse(medium_xml, Node))
+@add_benchmark "Collect tags (medium)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(medium_xml))
+@add_benchmark "Collect tags (medium)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(medium_xml)) teardown=(LightXML.free(o))
+
+#-----------------------------------------------------------------------------# XLSX-pattern fixtures
+# These fixtures mirror the shapes that XLSX.jl exercises:
+# - `sst_xml` matches `xl/sharedStrings.xml` (lots of small `<si><t>…</t></si>` entries
+#   separated by whitespace — the layout that exposes the LazyNode write/normalize choice)
+# - `ws_xml` matches `xl/sheetN.xml` (a `<sheetData>` with many `<row>`s of `<c><v>…</v></c>`)
+
+@info "Generating XLSX-pattern fixtures..."
+
+# NOTE(review): tag literals reconstructed — the original string contents were
+# lost in extraction. Shape follows xl/sharedStrings.xml: an <sst> root holding
+# many whitespace-separated <si><t>…</t></si> entries.
+sst_xml = let buf = IOBuffer()
+    print(buf, "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n")
+    print(buf, "<sst>\n")
+    for i in 1:50000
+        print(buf, "  <si><t>shared string value number ", i, "</t></si>\n")
+    end
+    print(buf, "</sst>")
+    String(take!(buf))
+end
+
+# NOTE(review): tag literals reconstructed — the original string contents were
+# lost in extraction (the computed `col` was clearly used inside a cell ref).
+# Shape follows xl/sheetN.xml: <sheetData> of <row>s of numeric <c><v>…</v></c>.
+ws_xml = let buf = IOBuffer()
+    print(buf, "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n")
+    print(buf, "<worksheet>\n")
+    print(buf, "<sheetData>\n")
+    for r in 1:3000
+        print(buf, " <row r=\"", r, "\">")
+        for c in 1:15
+            col = Char(UInt32('A') + c - 1)
+            print(buf, "<c r=\"", col, r, "\"><v>", r * c, "</v></c>")
+        end
+        print(buf, "</row>\n")
+    end
+    print(buf, "</sheetData></worksheet>")
+    String(take!(buf))
+end
+
+# String-heavy worksheet: cells reference the shared string table (`t="s"`, `<v>` = SST
+# index). This is the most common real-world shape and the one where the `has_entities`
+# short-circuit and zero-copy accessors matter most for XLSX.jl `readtable`.
+# NOTE(review): tag literals reconstructed — the original string contents were
+# lost in extraction. Cells carry t="s" and an SST index in <v>.
+ws_str_xml = let buf = IOBuffer()
+    print(buf, "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n")
+    print(buf, "<worksheet>\n")
+    print(buf, "<sheetData>\n")
+    for r in 1:5000
+        print(buf, " <row r=\"", r, "\">")
+        for c in 1:8
+            col = Char(UInt32('A') + c - 1)
+            print(buf, "<c r=\"", col, r, "\" t=\"s\"><v>", (r * c) % 50000, "</v></c>")
+        end
+        print(buf, "</row>\n")
+    end
+    print(buf, "</sheetData></worksheet>")
+    String(take!(buf))
+end
+
+# Entity-heavy SST: every `<t>` needs decoding, exercising the `has_entities` slow path.
+# NOTE(review): tag/entity literals reconstructed — the original string contents
+# were lost in extraction. Each entry contains &amp;/&lt;/&gt; so decoding runs.
+sst_entity_xml = let buf = IOBuffer()
+    print(buf, "<sst>")
+    for i in 1:50000
+        print(buf, "<si><t>A &amp; B &lt;tag&gt; #", i, "</t></si>")
+    end
+    print(buf, "</sst>")
+    String(take!(buf))
+end
+
+@info " sst_xml: $(round(length(sst_xml) / 1024 / 1024, digits=2)) MB ($(50000) )"
+@info " ws_xml: $(round(length(ws_xml) / 1024 / 1024, digits=2)) MB ($(3000) × $(15) )"
+@info " ws_str_xml: $(round(length(ws_str_xml) / 1024 / 1024, digits=2)) MB ($(5000) × $(8) string )"
+@info " sst_entity_xml: $(round(length(sst_entity_xml) / 1024 / 1024, digits=2)) MB (entity-heavy)"
+
+# Helper: walk a Node-based subtree and concatenate its text content.
+# Mirrors XLSX.jl's `unformatted_text`: `rPh` (phonetic) subtrees are skipped,
+# and each `t` element contributes its simple value, falling back to `value`.
+function _node_unformatted(io::IO, el::Node{String})
+    XML.tag(el) == "rPh" && return
+    if XML.tag(el) == "t"
+        if XML.is_simple(el)
+            write(io, XML.simple_value(el))
+        else
+            # not a simple text element — use the node's own value, if any
+            v = XML.value(el)
+            isnothing(v) || write(io, v)
+        end
+        return
+    end
+    for c in XML.children(el)
+        _node_unformatted(io, c)
+    end
+end
+# One-argument form: return the concatenated text as a String.
+_node_unformatted(el::Node{String}) = sprint(_node_unformatted, el)
+
+#-----------------------------------------------------------------------------# Parse: XLSX shapes
+@add_benchmark "Parse SST (LazyNode)" "XML.jl" parse($sst_xml, LazyNode)
+@add_benchmark "Parse SST (LazyNode)" "Node (for ref)" parse($sst_xml, Node)
+@add_benchmark "Parse worksheet (LazyNode)" "XML.jl" parse($ws_xml, LazyNode)
+@add_benchmark "Parse worksheet (LazyNode)" "Node (for ref)" parse($ws_xml, Node)
+
+#-----------------------------------------------------------------------------# SST loading (XLSX.jl sst.jl pattern)
+# Mirrors `sst_load!`: stream `<si>` children, capture raw XML + unformatted text per entry.
+
+@add_benchmark "SST: write each " "LazyNode + write (zero-copy)" begin
+ out = String[]
+ sst_el = doc[end]
+ for si in XML.eachchildnode(sst_el)
+ XML.nodetype(si) === XML.Element || continue
+ push!(out, XML.write(si))
+ end
+ out
+end setup=(doc = parse(sst_xml, LazyNode))
+
+@add_benchmark "SST: write each " "LazyNode + write (normalize)" begin
+ out = String[]
+ sst_el = doc[end]
+ for si in XML.eachchildnode(sst_el)
+ XML.nodetype(si) === XML.Element || continue
+ push!(out, XML.write(si; normalize=true))
+ end
+ out
+end setup=(doc = parse(sst_xml, LazyNode))
+
+@add_benchmark "SST: write each " "Node (for ref)" begin
+ out = String[]
+ sst_el = doc[end]
+ for si in XML.children(sst_el)
+ XML.tag(si) == "si" || continue
+ push!(out, XML.write(si))
+ end
+ out
+end setup=(doc = parse(sst_xml, Node))
+
+@add_benchmark "SST: unformatted text" "LazyNode + is_simple_value" begin
+ out = Vector{Union{Nothing,SubString{String},String}}()
+ sst_el = doc[end]
+ for si in XML.eachchildnode(sst_el)
+ XML.nodetype(si) === XML.Element || continue
+ for t in XML.eachchildnode(si)
+ XML.nodetype(t) === XML.Element || continue
+ XML.tag(t) == "t" || continue
+ push!(out, XML.is_simple_value(t))
+ end
+ end
+ out
+end setup=(doc = parse(sst_xml, LazyNode))
+
+@add_benchmark "SST: unformatted text" "Node (for ref)" begin
+ out = String[]
+ sst_el = doc[end]
+ for si in XML.children(sst_el)
+ XML.tag(si) == "si" || continue
+ push!(out, _node_unformatted(si))
+ end
+ out
+end setup=(doc = parse(sst_xml, Node))
+
+#-----------------------------------------------------------------------------# Worksheet: nested row/cell loops (XLSX.jl cell.jl pattern)
+# Mirrors `Cell(c::LazyNode, ws)` and `get_rowcells!`: iterate `<row>`, then `<c>`, then attrs + `<v>`.
+
+@add_benchmark "Worksheet: collect rows" "children() (fresh Vector each call)" begin
+ sd = doc[end][1] #
+ XML.children(sd)
+end setup=(doc = parse(ws_xml, LazyNode))
+
+@add_benchmark "Worksheet: collect rows" "children!(buf, n) (reused buffer)" begin
+ sd = doc[end][1]
+ XML.children!(buf, sd)
+end setup=(doc = parse(ws_xml, LazyNode); buf = XML.LazyNode{String}[])
+
+@add_benchmark "Worksheet: attribute scan" "eachattribute" begin
+ n = 0
+ sd = doc[end][1]
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ for (k, v) in XML.eachattribute(c)
+ n += sizeof(v)
+ end
+ end
+ end
+ n
+end setup=(doc = parse(ws_xml, LazyNode))
+
+@add_benchmark "Worksheet: attribute scan" "attributes() (materialize dict)" begin
+ n = 0
+ sd = doc[end][1]
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ a = XML.attributes(c)
+ isnothing(a) && continue
+ for (_, v) in a
+ n += sizeof(v)
+ end
+ end
+ end
+ n
+end setup=(doc = parse(ws_xml, LazyNode))
+
+@add_benchmark "Worksheet: single attr fetch" "get(c, \"r\", \"\")" begin
+ n = 0
+ sd = doc[end][1]
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ n += sizeof(get(c, "r", ""))
+ end
+ end
+ n
+end setup=(doc = parse(ws_xml, LazyNode))
+
+@add_benchmark "Worksheet: single attr fetch" "attributes(c)[\"r\"]" begin
+ n = 0
+ sd = doc[end][1]
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ a = XML.attributes(c)
+ isnothing(a) && continue
+ n += sizeof(a["r"])
+ end
+ end
+ n
+end setup=(doc = parse(ws_xml, LazyNode))
+
+@add_benchmark "Worksheet: value" "is_simple_value" begin
+ n = 0
+ sd = doc[end][1]
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ for v in XML.eachchildnode(c)
+ XML.nodetype(v) === XML.Element || continue
+ val = XML.is_simple_value(v)
+ isnothing(val) || (n += sizeof(val))
+ end
+ end
+ end
+ n
+end setup=(doc = parse(ws_xml, LazyNode))
+
+@add_benchmark "Worksheet: value" "is_simple + simple_value" begin
+ n = 0
+ sd = doc[end][1]
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ for v in XML.eachchildnode(c)
+ XML.nodetype(v) === XML.Element || continue
+ if XML.is_simple(v)
+ n += sizeof(XML.simple_value(v))
+ end
+ end
+ end
+ end
+ n
+end setup=(doc = parse(ws_xml, LazyNode))
+
+#-----------------------------------------------------------------------------# End-to-end XLSX.jl hot loops
+# The micro-benchmarks above isolate single operations. These mirror the *combined* work
+# XLSX.jl actually does per entry, so a regression in any sub-operation (parse, accessor,
+# entity short-circuit, iterator allocation) shows up where it matters for spreadsheet read
+# performance.
+
+# Mirrors XLSX.jl `sst.jl` `unformatted_text` / `gather_strings!`: recursively walk an
+# `<si>` subtree concatenating text content. `rPh` (phonetic) subtrees are skipped;
+# each `t` element contributes its simple value when one is available.
+function _xlsx_unformatted(io::IO, e::XML.LazyNode)
+    t = XML.tag(e)
+    t == "rPh" && return nothing
+    if t == "t"
+        v = XML.is_simple_value(e)
+        isnothing(v) || write(io, v)
+    else
+        # non-<t> container: recurse into element children only
+        for ch in XML.eachchildnode(e)
+            XML.nodetype(ch) === XML.Element && _xlsx_unformatted(io, ch)
+        end
+    end
+    nothing
+end
+
+# Mirrors XLSX.jl `sst.jl` `sst_load!`: stream `<si>` entries, capture raw XML + unformatted text.
+@add_benchmark "XLSX sst_load! (end-to-end)" "LazyNode" begin
+ sst_el = doc[end]
+ shared = String[]
+ unformatted = String[]
+ for si in XML.eachchildnode(sst_el)
+ XML.nodetype(si) === XML.Element || continue
+ XML.tag(si) == "si" || continue
+ push!(shared, XML.write(si))
+ io = IOBuffer()
+ _xlsx_unformatted(io, si)
+ push!(unformatted, String(take!(io)))
+ end
+ (length(shared), length(unformatted))
+end setup=(doc = parse(sst_xml, LazyNode))
+
+# Mirrors XLSX.jl `cell.jl` `Cell(c, ws)` + `get_rowcells!`: per cell, read the r/s/t
+# attributes and the `<v>` value, exactly as the reader does. Numeric worksheet.
+@add_benchmark "XLSX cell read (end-to-end)" "numeric ws" begin
+ sd = doc[end][1]
+ ncells = 0
+ acc = 0
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ ref = get(c, "r", "")
+ t = get(c, "t", "")
+ s = get(c, "s", "")
+ acc += sizeof(ref) + sizeof(t) + sizeof(s)
+ for child in XML.eachchildnode(c)
+ XML.nodetype(child) === XML.Element || continue
+ if XML.tag(child) == "v"
+ v = XML.is_simple_value(child)
+ isnothing(v) || (acc += sizeof(v))
+ end
+ end
+ ncells += 1
+ end
+ end
+ (ncells, acc)
+end setup=(doc = parse(ws_xml, LazyNode))
+
+# Same loop on the string-heavy worksheet (t="s", SST-indexed) — the common real shape
+# and the one most sensitive to the entity short-circuit / zero-copy accessors.
+@add_benchmark "XLSX cell read (end-to-end)" "string ws" begin
+ sd = doc[end][1]
+ ncells = 0
+ acc = 0
+ for row in XML.eachchildnode(sd)
+ XML.nodetype(row) === XML.Element || continue
+ for c in XML.eachchildnode(row)
+ XML.nodetype(c) === XML.Element || continue
+ ref = get(c, "r", "")
+ t = get(c, "t", "")
+ s = get(c, "s", "")
+ acc += sizeof(ref) + sizeof(t) + sizeof(s)
+ for child in XML.eachchildnode(c)
+ XML.nodetype(child) === XML.Element || continue
+ if XML.tag(child) == "v"
+ v = XML.is_simple_value(child)
+ isnothing(v) || (acc += sizeof(v))
+ end
+ end
+ ncells += 1
+ end
+ end
+ (ncells, acc)
+end setup=(doc = parse(ws_str_xml, LazyNode))
+
+# Realistic-string SST: entries containing characters that DO need entity decoding, so the
+# `has_entities` slow path is exercised (catches regressions in the decode branch).
+@add_benchmark "XLSX sst_load! (end-to-end)" "LazyNode (entity-heavy)" begin
+ sst_el = doc[end]
+ n = 0
+ for si in XML.eachchildnode(sst_el)
+ XML.nodetype(si) === XML.Element || continue
+ XML.tag(si) == "si" || continue
+ for t in XML.eachchildnode(si)
+ XML.nodetype(t) === XML.Element || continue
+ v = XML.is_simple_value(t)
+ isnothing(v) || (n += sizeof(v))
+ end
+ end
+ n
+end setup=(doc = parse(sst_entity_xml, LazyNode))
+
+#-----------------------------------------------------------------------------# Write benchmarks_results.md
+# Format a millisecond value with three significant digits, e.g. "0.123 ms".
+function _fmt_ms(t)
+    return "$(round(t, sigdigits=3)) ms"
+end
+
+# Label how XML.jl's median time compares to another library's; differences
+# within ±5% are reported as noise ("~same").
+function _compare_indicator(xml_ms, other_ms)
+    r = xml_ms / other_ms
+    pct = abs(round((r - 1) * 100, digits=1))
+    if r > 1.05
+        return "(XML.jl $(pct)% slower)"
+    elseif r < 0.95
+        return "(XML.jl $(pct)% faster)"
+    else
+        return "(~same)"
+    end
+end
+
+outfile = joinpath(@__DIR__, "benchmarks_results.md")
+open(outfile, "w") do io
+ println(io, "# XML.jl Benchmarks\n")
+ println(io, "```")
+ for kind in unique(df.kind)
+ g = groupby(df, :kind)
+ haskey(g, (;kind)) || continue
+ sub = g[(;kind)]
+ println(io, kind)
+ # Find XML.jl baseline (first row starting with "XML.jl")
+ xml_row = findfirst(r -> startswith(r.name, "XML.jl") && !contains(r.name, "(SS)"), eachrow(sub))
+ xml_ms = isnothing(xml_row) ? nothing : median(sub[xml_row, :bench]).time / 1e6
+ for row in eachrow(sub)
+ ms = median(row.bench).time / 1e6
+ indicator = ""
+ if !isnothing(xml_ms) && !startswith(row.name, "XML.jl")
+ indicator = " " * _compare_indicator(xml_ms, ms)
+ end
+ println(io, "\t", rpad(row.name, 16), lpad(_fmt_ms(ms), 12), indicator)
+ end
+ println(io)
+ end
+ println(io, "```")
+
+ println(io, "\n```julia")
+ println(io, "versioninfo()")
+ buf = IOBuffer()
+ InteractiveUtils.versioninfo(buf)
+ for line in eachline(IOBuffer(take!(buf)))
+ println(io, "# ", line)
+ end
+ println(io, "```")
+end
+
+println("Results written to $outfile")
diff --git a/benchmarks/benchmarks_results.md b/benchmarks/benchmarks_results.md
new file mode 100644
index 0000000..60c6ae0
--- /dev/null
+++ b/benchmarks/benchmarks_results.md
@@ -0,0 +1,101 @@
+# XML.jl Benchmarks
+
+```
+Parse (small)
+ XML.jl 0.0378 ms
+ XML.jl (SS) 0.0349 ms
+ EzXML 0.0224 ms (XML.jl 68.8% slower)
+ LightXML 0.022 ms (XML.jl 72.3% slower)
+ XMLDict 0.209 ms (XML.jl 81.9% faster)
+
+Parse (medium)
+ XML.jl 201.0 ms
+ XML.jl (SS) 190.0 ms
+ EzXML 80.3 ms (XML.jl 150.7% slower)
+ LightXML 114.0 ms (XML.jl 76.1% slower)
+ XMLDict 608.0 ms (XML.jl 66.9% faster)
+
+Write (small)
+ XML.jl 0.00957 ms
+ EzXML 0.0108 ms (XML.jl 11.7% faster)
+ LightXML 0.105 ms (XML.jl 90.9% faster)
+
+Write (medium)
+ XML.jl 48.3 ms
+ EzXML 36.9 ms (XML.jl 30.9% slower)
+ LightXML 56.2 ms (XML.jl 14.1% faster)
+
+Read file
+ XML.jl 191.0 ms
+ EzXML 115.0 ms (XML.jl 67.2% slower)
+ LightXML 97.4 ms (XML.jl 96.6% slower)
+
+Collect tags (small)
+ XML.jl 0.000602 ms
+ EzXML 0.0021 ms (XML.jl 71.4% faster)
+ LightXML 0.00381 ms (XML.jl 84.2% faster)
+
+Collect tags (medium)
+ XML.jl 12.7 ms
+ EzXML 16.3 ms (XML.jl 21.8% faster)
+ LightXML 23.5 ms (XML.jl 45.9% faster)
+
+Parse SST (LazyNode)
+ XML.jl 5.29e-6 ms
+ Node (for ref) 45.8 ms (XML.jl 100.0% faster)
+
+Parse worksheet (LazyNode)
+ XML.jl 5.21e-6 ms
+ Node (for ref) 69.6 ms (XML.jl 100.0% faster)
+
+SST: write each
+ LazyNode + write (zero-copy) 93.0 ms
+ LazyNode + write (normalize) 157.0 ms
+ Node (for ref) 9.83 ms
+
+SST: unformatted text
+ LazyNode + is_simple_value 102.0 ms
+ Node (for ref) 5.31 ms
+
+Worksheet: collect rows
+ children() (fresh Vector each call) 87.9 ms
+ children!(buf, n) (reused buffer) 87.9 ms
+
+Worksheet: attribute scan
+ eachattribute 87.8 ms
+ attributes() (materialize dict) 87.2 ms
+
+Worksheet: single attr fetch
+ get(c, "r", "") 87.6 ms
+ attributes(c)["r"] 88.0 ms
+
+Worksheet: value
+ is_simple_value 87.1 ms
+ is_simple + simple_value 87.8 ms
+
+XLSX sst_load! (end-to-end)
+ LazyNode 149.0 ms
+ LazyNode (entity-heavy) 113.0 ms
+
+XLSX cell read (end-to-end)
+ numeric ws 87.9 ms
+ string ws 80.2 ms
+
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.6
+# Commit 15346901f00 (2026-04-09 19:20 UTC)
+# Build Info:
+# Official https://julialang.org release
+# Platform Info:
+# OS: macOS (arm64-apple-darwin24.0.0)
+# CPU: 10 × Apple M1 Pro
+# WORD_SIZE: 64
+# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+# GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+# JULIA_NUM_THREADS = auto
+```
diff --git a/benchmarks/compare.jl b/benchmarks/compare.jl
new file mode 100644
index 0000000..4bdc22a
--- /dev/null
+++ b/benchmarks/compare.jl
@@ -0,0 +1,290 @@
+#= Compare current dev XML.jl against the last released version.
+
+Usage:
+ julia benchmarks/compare.jl [tag]
+
+`tag` defaults to the latest git tag (e.g. v0.3.8).
+
+This script:
+1. Runs benchmarks using the current (dev) code
+2. Checks out the release tag into a temp worktree
+3. Runs the same benchmarks against that version
+4. Prints a side-by-side comparison
+=#
+
+using BenchmarkTools, Serialization, InteractiveUtils
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000
+
+const ROOT = dirname(@__DIR__)
+
+const RELEASE_TAG = if length(ARGS) >= 1
+ ARGS[1]
+else
+ tags = readlines(`git -C $ROOT tag --sort=version:refname`)
+ filter!(t -> startswith(t, "v"), tags)
+ last(tags)
+end
+
+const SMALL_FILE = joinpath(ROOT, "test", "data", "books.xml")
+const SMALL_XML = read(SMALL_FILE, String)
+
+# Generate medium file if needed
+include(joinpath(ROOT, "benchmarks", "XMarkGenerator.jl"))
+using .XMarkGenerator
+const MEDIUM_FILE = joinpath(ROOT, "benchmarks", "data", "xmark.xml")
+if !isfile(MEDIUM_FILE)
+ mkpath(dirname(MEDIUM_FILE))
+ @info "Generating XMark benchmark XML..."
+ generate_xmark(MEDIUM_FILE, 1.0)
+end
+const MEDIUM_XML = read(MEDIUM_FILE, String)
+
+#-----------------------------------------------------------------------------# Helpers
+# Depth-first tag collection used by the "Collect tags" comparison benchmarks.
+function _collect_tags!(acc, node)
+    for child in XML.children(node)
+        XML.nodetype(child) === XML.Element || continue
+        push!(acc, XML.tag(child))
+        _collect_tags!(acc, child)
+    end
+end
+
+# Entry point: return every element tag name below `node` in document order.
+function bench_collect_tags(node)
+    acc = String[]
+    _collect_tags!(acc, node)
+    return acc
+end
+
+#-----------------------------------------------------------------------------# Run dev benchmarks
+println("="^60)
+println(" XML.jl Benchmark Comparison")
+println(" Current (dev) vs $RELEASE_TAG")
+println("="^60)
+println()
+
+print("Running dev benchmarks...")
+flush(stdout)
+
+using XML
+
+dev_results = Dict{String, BenchmarkTools.Trial}()
+
+const SSNode = Node{SubString{String}}
+
+dev_small = parse(SMALL_XML, Node)
+dev_small_ss = parse(SMALL_XML, SSNode)
+dev_medium = parse(MEDIUM_XML, Node)
+dev_medium_ss = parse(MEDIUM_XML, SSNode)
+
+dev_results["Parse (small), String"] = @benchmark parse($SMALL_XML, Node)
+dev_results["Parse (small), SubString"] = @benchmark parse($SMALL_XML, SSNode)
+dev_results["Parse (medium), String"] = @benchmark parse($MEDIUM_XML, Node)
+dev_results["Parse (medium), SubString"] = @benchmark parse($MEDIUM_XML, SSNode)
+dev_results["Write (small)"] = @benchmark XML.write($dev_small)
+dev_results["Write (medium)"] = @benchmark XML.write($dev_medium)
+dev_results["Read file (medium), String"] = @benchmark read($MEDIUM_FILE, Node)
+dev_results["Read file (medium), SubString"] = @benchmark parse(read($MEDIUM_FILE, String), SSNode)
+dev_results["Collect tags (small), String"] = @benchmark bench_collect_tags($dev_small)
+dev_results["Collect tags (small), SubString"] = @benchmark bench_collect_tags($dev_small_ss)
+dev_results["Collect tags (medium), String"] = @benchmark bench_collect_tags($dev_medium)
+dev_results["Collect tags (medium), SubString"] = @benchmark bench_collect_tags($dev_medium_ss)
+
+# LazyNode benchmarks
+dev_lazy_small = parse(SMALL_XML, LazyNode)
+dev_lazy_medium = parse(MEDIUM_XML, LazyNode)
+
+dev_results["Parse (small), LazyNode"] = @benchmark parse($SMALL_XML, LazyNode)
+dev_results["Parse (medium), LazyNode"] = @benchmark parse($MEDIUM_XML, LazyNode)
+dev_results["Write (small), LazyNode"] = @benchmark XML.write($(dev_lazy_small[1]))
+dev_results["Write (medium), LazyNode"] = @benchmark XML.write($(dev_lazy_medium[1]))
+dev_results["sourcetext, small"] = @benchmark sourcetext($(dev_lazy_small[1]))
+dev_results["sourcetext, medium"] = @benchmark sourcetext($(dev_lazy_medium[1]))
+dev_lazy_medium_root = let ch = children(dev_lazy_medium)
+ i = findfirst(c -> nodetype(c) === Element, ch)
+ ch[i]
+end
+dev_results["children vs eachchildnode, children"] = @benchmark children($dev_lazy_medium_root)
+dev_results["children vs eachchildnode, eachchildnode"] = @benchmark collect(eachchildnode($dev_lazy_medium_root))
+
+# SST-like benchmark: many <si> children, write each one
+const SST_N = 10_000
+# NOTE(review): tag literals reconstructed — the original string contents were
+# lost in extraction; `only(children(...))` implies a single root element.
+const SST_XML = "<sst>" * join("""<si><t>string_$i</t></si>""" for i in 1:SST_N) * "</sst>"
+dev_sst_node = parse(SST_XML, Node)
+dev_sst_lazy = parse(SST_XML, LazyNode)
+dev_sst_root_node = only(children(dev_sst_node))
+dev_sst_root_lazy = only(children(dev_sst_lazy))
+
+# Parse `xml` eagerly, then re-serialize every element child of the root —
+# the sharedStrings access pattern measured end to end with Node.
+function bench_sst_node(xml)
+    root = only(children(parse(xml, Node)))
+    out = String[]
+    for c in XML.children(root)
+        XML.nodetype(c) === XML.Element && push!(out, XML.write(c))
+    end
+    out
+end
+# Same workload with LazyNode, materializing children via `children`.
+function bench_sst_lazy_children(xml)
+    root = only(children(parse(xml, LazyNode)))
+    out = String[]
+    for c in XML.children(root)
+        XML.nodetype(c) === XML.Element && push!(out, XML.write(c))
+    end
+    out
+end
+# Same workload with LazyNode, iterating children via `eachchildnode` instead
+# of materializing them with `children`.
+function bench_sst_lazy_eachchildnode(xml)
+    root = only(children(parse(xml, LazyNode)))
+    out = String[]
+    for c in XML.eachchildnode(root)
+        XML.nodetype(c) === XML.Element && push!(out, XML.write(c))
+    end
+    out
+end
+
+dev_results["SST (parse+iterate+write), Node"] = @benchmark bench_sst_node($SST_XML)
+dev_results["SST (parse+iterate+write), LazyNode+children"] = @benchmark bench_sst_lazy_children($SST_XML)
+dev_results["SST (parse+iterate+write), LazyNode+eachchildnode"] = @benchmark bench_sst_lazy_eachchildnode($SST_XML)
+
+println(" done")
+
+#-----------------------------------------------------------------------------# Run release benchmarks via temp worktree + separate process
+print("Setting up $RELEASE_TAG worktree...")
+flush(stdout)
+
+worktree_dir = mktempdir()
+run(pipeline(`git -C $ROOT worktree add $worktree_dir $RELEASE_TAG`, stdout=devnull, stderr=devnull))
+println(" done")
+
+release_results_file = joinpath(worktree_dir, "_results.jls")
+
+release_script = joinpath(worktree_dir, "_bench.jl")
+write(release_script, """
+using Pkg
+Pkg.activate(; temp=true)
+Pkg.develop(path=$(repr(worktree_dir)))
+Pkg.add("BenchmarkTools")
+Pkg.add("Serialization")
+
+using BenchmarkTools, Serialization, XML
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000
+
+small_xml = read($(repr(SMALL_FILE)), String)
+medium_xml = read($(repr(MEDIUM_FILE)), String)
+results = Dict{String, BenchmarkTools.Trial}()
+
+results["Parse (small)"] = @benchmark parse(\$small_xml, Node)
+
+try
+ SSNode = Node{SubString{String}}
+ results["Parse (small, SS)"] = @benchmark parse(\$small_xml, SSNode)
+ results["Parse (medium, SS)"] = @benchmark parse(\$medium_xml, SSNode)
+catch
+end
+
+results["Parse (medium)"] = @benchmark parse(\$medium_xml, Node)
+
+small_node = parse(small_xml, Node)
+medium_node = parse(medium_xml, Node)
+results["Write (small)"] = @benchmark XML.write(\$small_node)
+results["Write (medium)"] = @benchmark XML.write(\$medium_node)
+results["Read file (medium)"] = @benchmark read($(repr(MEDIUM_FILE)), Node)
+
+function _collect_tags!(out, node)
+ for c in XML.children(node)
+ if XML.nodetype(c) === XML.Element
+ push!(out, XML.tag(c))
+ _collect_tags!(out, c)
+ end
+ end
+end
+function bench_collect_tags(node)
+ out = String[]
+ _collect_tags!(out, node)
+ out
+end
+results["Collect tags (small)"] = @benchmark bench_collect_tags(\$small_node)
+results["Collect tags (medium)"] = @benchmark bench_collect_tags(\$medium_node)
+
+try
+ lazy_small = parse(small_xml, LazyNode)
+ lazy_medium = parse(medium_xml, LazyNode)
+ results["Parse (small), LazyNode"] = @benchmark parse(\$small_xml, LazyNode)
+ results["Parse (medium), LazyNode"] = @benchmark parse(\$medium_xml, LazyNode)
+catch
+end
+
+serialize($(repr(release_results_file)), results)
+""")
+
+print("Running $RELEASE_TAG benchmarks...")
+flush(stdout)
+run(pipeline(`julia $release_script`, stdout=devnull, stderr=devnull))
+release_results = deserialize(release_results_file)
+println(" done")
+
+# Cleanup worktree
+run(pipeline(`git -C $ROOT worktree remove --force $worktree_dir`, stdout=devnull, stderr=devnull))
+
+#-----------------------------------------------------------------------------# Write compare_results.md
+# Format milliseconds with three significant digits, e.g. "0.123 ms".
+_fmt_ms(t) = "$(round(t, sigdigits=3)) ms"
+
+# Label how the dev build compares to the release baseline; changes within
+# ±5% are reported as noise ("~same").
+function _compare_indicator(dev_ms, rel_ms)
+    delta = 100 * (dev_ms / rel_ms - 1)
+    magnitude = abs(round(delta, digits=1))
+    if delta < -5
+        return "($(magnitude)% faster)"
+    elseif delta > 5
+        return "($(magnitude)% slower)"
+    else
+        return "(~same)"
+    end
+end
+
+groups = [
+ ("Parse (small)", "Parse (small)", ["Parse (small), String", "Parse (small), SubString", "Parse (small), LazyNode"]),
+ ("Parse (medium)", "Parse (medium)", ["Parse (medium), String", "Parse (medium), SubString", "Parse (medium), LazyNode"]),
+ ("Write (small)", "Write (small)", ["Write (small)", "Write (small), LazyNode"]),
+ ("Write (medium)", "Write (medium)", ["Write (medium)", "Write (medium), LazyNode"]),
+ ("Read file (medium)", "Read file (medium)", ["Read file (medium), String", "Read file (medium), SubString"]),
+ ("Collect tags (small)", "Collect tags (small)", ["Collect tags (small), String", "Collect tags (small), SubString"]),
+ ("Collect tags (medium)","Collect tags (medium)", ["Collect tags (medium), String", "Collect tags (medium), SubString"]),
+ ("sourcetext", nothing, ["sourcetext, small", "sourcetext, medium"]),
+ ("children vs eachchildnode (medium)", nothing, ["children vs eachchildnode, children", "children vs eachchildnode, eachchildnode"]),
+ ("SST-like: parse+iterate+write (10k)", nothing, ["SST (parse+iterate+write), Node", "SST (parse+iterate+write), LazyNode+children", "SST (parse+iterate+write), LazyNode+eachchildnode"]),
+]
+
+outfile = joinpath(@__DIR__, "compare_results.md")
+open(outfile, "w") do io
+ println(io, "# XML.jl Benchmark Comparison: dev vs $RELEASE_TAG\n")
+ println(io, "```")
+ for (title, rel_key, dev_keys) in groups
+ rel_ms = (!isnothing(rel_key) && haskey(release_results, rel_key)) ? median(release_results[rel_key]).time / 1e6 : nothing
+ any(k -> haskey(dev_results, k), dev_keys) || (isnothing(rel_ms) && continue)
+
+ println(io, title)
+ if !isnothing(rel_ms)
+ println(io, "\t", rpad(RELEASE_TAG, 16), lpad(_fmt_ms(rel_ms), 12))
+ end
+ for dk in dev_keys
+ haskey(dev_results, dk) || continue
+ dev_ms = median(dev_results[dk]).time / 1e6
+ label = occursin(", ", dk) ? split(dk, ", "; limit=2)[2] : "dev"
+ ms_str = lpad(_fmt_ms(dev_ms), 12)
+ padlen = max(16, length(label) + 2)
+ if isnothing(rel_ms)
+ println(io, "\t", rpad(label, padlen), ms_str)
+ else
+ println(io, "\t", rpad(label, padlen), ms_str, " ", _compare_indicator(dev_ms, rel_ms))
+ end
+ end
+ println(io)
+ end
+ println(io, "```")
+
+ println(io, "\n```julia")
+ println(io, "versioninfo()")
+ buf = IOBuffer()
+ InteractiveUtils.versioninfo(buf)
+ for line in eachline(IOBuffer(take!(buf)))
+ println(io, "# ", line)
+ end
+ println(io, "```")
+end
+
+println("Results written to $outfile")
diff --git a/benchmarks/compare_results.md b/benchmarks/compare_results.md
new file mode 100644
index 0000000..dffbcae
--- /dev/null
+++ b/benchmarks/compare_results.md
@@ -0,0 +1,71 @@
+# XML.jl Benchmark Comparison: dev vs v0.3.8
+
+```
+Parse (small)
+ v0.3.8 0.139 ms
+ String 0.0409 ms (70.6% faster)
+ SubString 0.033 ms (76.3% faster)
+ LazyNode 6.33e-6 ms (100.0% faster)
+
+Parse (medium)
+ v0.3.8 829.0 ms
+ String 200.0 ms (75.8% faster)
+ SubString 163.0 ms (80.4% faster)
+ LazyNode 6.33e-6 ms (100.0% faster)
+
+Write (small)
+ v0.3.8 0.032 ms
+ dev 0.0215 ms (32.6% faster)
+ LazyNode 0.000217 ms (99.3% faster)
+
+Write (medium)
+ v0.3.8 156.0 ms
+ dev 99.2 ms (36.3% faster)
+ LazyNode 0.000273 ms (100.0% faster)
+
+Read file (medium)
+ v0.3.8 755.0 ms
+ String 193.0 ms (74.4% faster)
+ SubString 179.0 ms (76.3% faster)
+
+Collect tags (small)
+ v0.3.8 0.00064 ms
+ String 0.000714 ms (11.7% slower)
+ SubString 0.00211 ms (230.3% slower)
+
+Collect tags (medium)
+ v0.3.8 21.6 ms
+ String 13.3 ms (38.7% faster)
+ SubString 20.3 ms (6.2% faster)
+
+sourcetext
+ small 0.000191 ms
+ medium 0.000248 ms
+
+children vs eachchildnode (medium)
+ children 76.8 ms
+ eachchildnode 80.4 ms
+
+SST-like: parse+iterate+write (10k)
+ Node 9.01 ms
+ LazyNode+children 9.78 ms
+ LazyNode+eachchildnode 10.4 ms
+
+```
+
+```julia
+versioninfo()
+# Julia Version 1.12.6
+# Commit 15346901f00 (2026-04-09 19:20 UTC)
+# Build Info:
+# Official https://julialang.org release
+# Platform Info:
+# OS: macOS (arm64-apple-darwin24.0.0)
+# CPU: 10 × Apple M1 Pro
+# WORD_SIZE: 64
+# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1)
+# GC: Built with stock GC
+# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores)
+# Environment:
+# JULIA_NUM_THREADS = auto
+```
diff --git a/benchmarks/dict_benchmarks.jl b/benchmarks/dict_benchmarks.jl
new file mode 100644
index 0000000..7dd90a3
--- /dev/null
+++ b/benchmarks/dict_benchmarks.jl
@@ -0,0 +1,71 @@
+using XML
+using BenchmarkTools
+
+BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
+
+#-----------------------------------------------------------------------------# Setup
+sizes = [2, 5, 10, 20]
+
+function make_xml(n::Int)
+ attrs = join((" attr$i=\"value$i\"" for i in 1:n))
+    "<root$attrs/>"
+end
+
+function make_pairs(n::Int)
+ Pair{String,String}["attr$i" => "value$i" for i in 1:n]
+end
+
+pt(t) = BenchmarkTools.prettytime(t)
+
+function printrow(n, op, t_dict, t_attr)
+ pct = round(100 * (t_dict - t_attr) / t_dict, digits=1)
+ label = pct > 0 ? "$(pct)% faster" : "$(-pct)% slower"
+ println(rpad("$n attrs", 10), " | ", rpad(op, 22), " | ",
+ rpad("Dict $(pt(t_dict))", 22), " | ",
+ rpad("Attributes $(pt(t_attr))", 26), " | ", label)
+end
+
+#-----------------------------------------------------------------------------# Benchmarks
+println("=" ^ 110)
+println(" Attributes vs Dict Benchmarks")
+println("=" ^ 110)
+println(rpad("Size", 10), " | ", rpad("Operation", 22), " | ",
+ rpad("Dict", 22), " | ", rpad("Attributes", 26), " | Change")
+println("-" ^ 110)
+
+for n in sizes
+ pairs = make_pairs(n)
+ d = Dict(pairs)
+ a = XML.Attributes(pairs)
+ key_mid = "attr$(n ÷ 2 + 1)"
+ key_last = "attr$n"
+
+ tests = [
+ ("construct", () -> @benchmark(Dict($pairs)), () -> @benchmark(XML.Attributes($pairs))),
+ ("getindex [mid]", () -> @benchmark($d[$key_mid]), () -> @benchmark($a[$key_mid])),
+ ("getindex [last]", () -> @benchmark($d[$key_last]), () -> @benchmark($a[$key_last])),
+ ("get [miss]", () -> @benchmark(get($d, "nope", nothing)), () -> @benchmark(get($a, "nope", nothing))),
+ ("haskey [hit]", () -> @benchmark(haskey($d, $key_mid)), () -> @benchmark(haskey($a, $key_mid))),
+ ("keys", () -> @benchmark(collect(keys($d))), () -> @benchmark(keys($a))),
+ ("iterate", () -> @benchmark(sum(length(v) for (_,v) in $d)), () -> @benchmark(sum(length(v) for (_,v) in $a))),
+ ]
+
+ for (op, bench_dict, bench_attr) in tests
+ t_dict = median(bench_dict()).time
+ t_attr = median(bench_attr()).time
+ printrow(n, op, t_dict, t_attr)
+ end
+ println("-" ^ 110)
+end
+
+#-----------------------------------------------------------------------------# End-to-end: attributes() call on parsed Node
+println()
+println(rpad("Size", 10), " | ", rpad("Operation", 22), " | Time")
+println("-" ^ 50)
+for n in sizes
+ doc = parse(make_xml(n), Node)
+ el = doc[1]
+ t = median(@benchmark(attributes($el))).time
+ println(rpad("$n attrs", 10), " | ", rpad("attributes(node)", 22), " | ", pt(t))
+end
+println()
diff --git a/benchmarks/suite.jl b/benchmarks/suite.jl
deleted file mode 100644
index e06dc61..0000000
--- a/benchmarks/suite.jl
+++ /dev/null
@@ -1,74 +0,0 @@
-using Pkg
-Pkg.activate(@__DIR__)
-
-using XML
-using EzXML: EzXML
-using XMLDict: XMLDict
-using BenchmarkTools
-using DataFrames
-using UnicodePlots
-using OrderedCollections: OrderedDict
-
-
-BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10
-BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000
-
-
-# nasa.xml was downloaded from:
-# http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/www/repository.html#nasa
-file = joinpath(@__DIR__, "nasa.xml")
-
-df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[])
-
-macro add_benchmark(kind, name, expr...)
- esc(:(let
- @info string($kind, " - ", $name)
- bench = @benchmark $(expr...)
- push!(df, (; kind=$kind, name=$name, bench))
- end))
-end
-
-#-----------------------------------------------------------------------------# Write
-@add_benchmark "Write" "XML.write" XML.write($(tempname()), o) setup = (o = read(file, Node))
-@add_benchmark "Write" "EzXML.writexml" EzXML.write($(tempname()), o) setup = (o = EzXML.readxml(file))
-
-#-----------------------------------------------------------------------------# Read
-@add_benchmark "Read" "XML.LazyNode" read($file, LazyNode)
-@add_benchmark "Read" "XML.Node" read($file, Node)
-@add_benchmark "Read" "EzXML.readxml" EzXML.readxml($file)
-@add_benchmark "Read" "XMLDict.xml_dict" XMLDict.xml_dict(read($file, String))
-
-#-----------------------------------------------------------------------------# Lazy Iteration
-@add_benchmark "Lazy Iteration" "LazyNode" for x in read($file, LazyNode); end
-@add_benchmark "Lazy Iteration" "EzXML.StreamReader" (reader = open(EzXML.StreamReader, $file); for x in reader; end; close(reader))
-
-#-----------------------------------------------------------------------------# Lazy Iteration: Collect Tags
-@add_benchmark "Collect Tags" "LazyNode" [tag(x) for x in o] setup = (o = read(file, LazyNode))
-@add_benchmark "Collect Tags" "EzXML.StreamReader" [r.name for x in r if x == EzXML.READER_ELEMENT] setup=(r=open(EzXML.StreamReader, file)) teardown=(close(r))
-
-function get_tags(o::EzXML.Node)
- out = String[]
- for node in EzXML.eachelement(o)
- push!(out, node.name)
- for tag in get_tags(node)
- push!(out, tag)
- end
- end
- out
-end
-@add_benchmark "Collect Tags" "EzXML.readxml" get_tags(o.root) setup=(o = EzXML.readxml(file))
-
-
-#-----------------------------------------------------------------------------# Plots
-function plot(df, kind)
- g = groupby(df, :kind)
- sub = g[(;kind)]
- x = map(row -> "$(row.name)", eachrow(sub))
- y = map(x -> median(x).time / 1000^2, sub.bench)
- display(barplot(x, y, title = "$kind Time (ms)", border=:none, width=50))
-end
-
-plot(df, "Read")
-plot(df, "Write")
-plot(df, "Lazy Iteration")
-plot(df, "Collect Tags")
diff --git a/ext/XMLAbstractTreesExt.jl b/ext/XMLAbstractTreesExt.jl
new file mode 100644
index 0000000..60add31
--- /dev/null
+++ b/ext/XMLAbstractTreesExt.jl
@@ -0,0 +1,71 @@
+module XMLAbstractTreesExt
+
+using XML: XML, Node, LazyNode, NodeType, Element, Text, CData, Comment,
+ Declaration, DTD, Document, ProcessingInstruction,
+ nodetype, tag, value, attributes
+import AbstractTrees
+
+#-----------------------------------------------------------------------------# children
+AbstractTrees.children(n::Node) = XML.children(n)
+AbstractTrees.children(n::LazyNode) = XML.children(n)
+
+#-----------------------------------------------------------------------------# nodevalue
+AbstractTrees.nodevalue(n::Node) = n
+AbstractTrees.nodevalue(n::LazyNode) = n
+
+#-----------------------------------------------------------------------------# printnode
+# Single-line label for `print_tree`; mirrors the REPL `show` for each NodeType but
+# without trailing child-count annotations (AbstractTrees draws the structure).
+_printnode(io::IO, n::Union{Node, LazyNode}) = _printnode(io, n, nodetype(n))
+
+function _printnode(io::IO, n, ::Val{Element})
+ print(io, '<', tag(n))
+ attrs = attributes(n)
+ if !isnothing(attrs)
+ for (k, v) in attrs
+ print(io, ' ', k, '=', '"', v, '"')
+ end
+ end
+ print(io, '>')
+end
+
+_printnode(io::IO, n, ::Val{Text}) = show(io, value(n))
+_printnode(io::IO, n, ::Val{Comment}) = print(io, "<!--", value(n), "-->")
+_printnode(io::IO, n, ::Val{CData}) = print(io, "<![CDATA[", value(n), "]]>")
+_printnode(io::IO, n, ::Val{DTD}) = print(io, "<!DOCTYPE ", value(n), '>')
+
+function _printnode(io::IO, n, ::Val{Declaration})
+    print(io, "<?xml")
+    attrs = attributes(n)
+    if !isnothing(attrs)
+        for (k, v) in attrs
+            print(io, ' ', k, '=', '"', v, '"')
+        end
+    end
+    print(io, "?>")
+end
+
+function _printnode(io::IO, n, ::Val{ProcessingInstruction})
+    print(io, "<?", tag(n))
+    v = value(n)
+    !isnothing(v) && print(io, ' ', v)
+    print(io, "?>")
+end
+
+_printnode(io::IO, n, ::Val{Document}) = print(io, "Document")
+
+# Dispatch helper: avoid an Enum branch chain by tag-dispatching on Val{NodeType}.
+_printnode(io::IO, n, nt::NodeType) = _printnode(io, n, Val(nt))
+
+AbstractTrees.printnode(io::IO, n::Node) = _printnode(io, n)
+AbstractTrees.printnode(io::IO, n::LazyNode) = _printnode(io, n)
+
+#-----------------------------------------------------------------------------# traits
+AbstractTrees.NodeType(::Type{<:Node}) = AbstractTrees.HasNodeType()
+AbstractTrees.NodeType(::Type{<:LazyNode}) = AbstractTrees.HasNodeType()
+AbstractTrees.nodetype(::Type{N}) where {N <: Node} = N
+AbstractTrees.nodetype(::Type{L}) where {L <: LazyNode} = L
+
+AbstractTrees.ChildIndexing(::Type{<:Node}) = AbstractTrees.IndexedChildren()
+
+end # module
diff --git a/src/XML.jl b/src/XML.jl
index 273bfda..a431541 100644
--- a/src/XML.jl
+++ b/src/XML.jl
@@ -1,31 +1,66 @@
module XML
-using Mmap
-using OrderedCollections: OrderedDict
-
export
- # Core Types:
- Node, LazyNode,
- # Interface:
- children, nodetype, tag, attributes, value, is_simple, simplevalue, simple_value,
- # Extended Interface for LazyNode:
- parent, depth, next, prev
+ Node, LazyNode, NodeType, Attributes,
+ CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text,
+ nodetype, tag, attributes, value, children, children!, eachchildnode, eachattribute,
+ is_simple, simple_value, is_simple_value, sourcetext,
+ depth, siblings,
+ xpath,
+ h
+
+include("XMLTokenizer.jl")
+using .XMLTokenizer:
+ XMLTokenizer, tokenize, tag_name, attr_value, pi_target,
+ TokenKinds, Token, Tokenizer, TokenizerState
#-----------------------------------------------------------------------------# escape/unescape
-const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", "'" => "&apos;", '"' => "&quot;")
+const ESCAPE_CHARS = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;")
+
+"""
+ escape(x::AbstractString) -> String
+
+Escape the five XML predefined entities: `&` `<` `>` `'` `"`.
+
+!!! note "Changed in v0.4"
+ `escape` is no longer idempotent. In previous versions, already-escaped sequences like
+    `&amp;` were left untouched. Now every `&` is escaped, so `escape("&amp;")` produces
+    `"&amp;amp;"`. Call `escape` only on raw, unescaped text.
+"""
+escape(x::AbstractString) = replace(x, ESCAPE_CHARS...)
+
+# Replace a numeric character reference with its Unicode character.
+# Numeric character references encode characters by code point: decimal (&#233; → é) or hex (&#xE9; → é).
+function _unescape_charref(ref::AbstractString)
+ is_hex = length(ref) > 3 && ref[3] in ('x', 'X')
+ digits = SubString(ref, is_hex ? 4 : 3, length(ref) - 1)
+ cp = tryparse(UInt32, digits; base = is_hex ? 16 : 10)
+ !isnothing(cp) && isvalid(Char, cp) ? string(Char(cp)) : ref
+end
+
+"""
+ unescape(x::AbstractString) -> String
+ unescape(x::SubString{String}) -> Union{SubString{String}, String}
+
+Unescape XML entities in `x`: the five predefined entities (`&` `<` `>` `'`
+`"`) and numeric character references (`{`, `«`). Each reference is processed
+exactly once (no double-unescaping).
+
+When `x` is a `SubString{String}` containing no `&`, the input is returned unchanged with
+no allocation — the common case for typical XML attribute and text content.
+"""
function unescape(x::AbstractString)
- result = x
- for (pat, r) in reverse.(escape_chars)
- result = replace(result, pat => r)
- end
- return result
+ s = string(x)
+ occursin('&', s) || return s
+    occursin("&#", s) && (s = replace(s, r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref))
+    replace(s, "&lt;" => "<", "&gt;" => ">", "&apos;" => "'", "&quot;" => "\"", "&amp;" => "&")
end
-function escape(x::String)
-    result = replace(x, r"&(?!amp;|quot;|apos;|gt;|lt;)" => "&amp;")
- for (pat, r) in escape_chars[2:end]
- result = replace(result, pat => r)
- end
- return result
+
+function unescape(x::SubString{String})
+ occursin('&', x) || return x
+ s = String(x)
+    occursin("&#", s) && (s = replace(s, r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref))
+    replace(s, "&lt;" => "<", "&gt;" => ">", "&apos;" => "'", "&quot;" => "\"", "&amp;" => "&")
end
#-----------------------------------------------------------------------------# NodeType
@@ -34,9 +69,9 @@ end
- Document # prolog & root Element
- DTD # <!DOCTYPE ...>
- Declaration # <?xml attributes...?>
- - ProcessingInstruction # <?NAME attributes...?>
+ - ProcessingInstruction # <?NAME content?>
- Comment # <!-- ... -->
- - CData # <![CDATA[...]]>
+ - CData # <![CDATA[...]]>
- Element # <NAME attributes...> children...
- Text # text
@@ -45,381 +80,1131 @@ NodeTypes can be used to construct XML.Nodes:
Document(children...)
DTD(value)
Declaration(; attributes)
- ProcessingInstruction(tag, attributes)
+ ProcessingInstruction(tag, content)
Comment(text)
CData(text)
Element(tag, children...; attributes)
Text(text)
"""
-@enum(NodeType, CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text)
+@enum NodeType::UInt8 CData Comment Declaration Document DTD Element ProcessingInstruction Text
+#-----------------------------------------------------------------------------# Attributes
+"""
+ Attributes{S} <: AbstractDict{S, S}
+
+An ordered dictionary of XML attributes backed by a `Vector{Pair{S, S}}`.
+Returned by [`attributes`](@ref). Preserves insertion order and supports the
+full `AbstractDict` interface (`get`, `haskey`, `keys`, `values`, iteration, etc.).
+"""
+struct Attributes{S} <: AbstractDict{S, S}
+ entries::Vector{Pair{S, S}}
+end
+
+Base.length(a::Attributes) = length(a.entries)
+Base.iterate(a::Attributes, state...) = iterate(a.entries, state...)
+
+function Base.getindex(a::Attributes, key::AbstractString)
+ for (k, v) in a.entries
+ k == key && return v
+ end
+ throw(KeyError(key))
+end
-#-----------------------------------------------------------------------------# includes
-include("raw.jl")
-include("dtd.jl")
+function Base.get(a::Attributes, key::AbstractString, default)
+ for (k, v) in a.entries
+ k == key && return v
+ end
+ default
+end
+
+function Base.haskey(a::Attributes, key::AbstractString)
+ any(p -> first(p) == key, a.entries)
+end
-abstract type AbstractXMLNode end
+Base.keys(a::Attributes) = first.(a.entries)
+Base.values(a::Attributes) = last.(a.entries)
-#-----------------------------------------------------------------------------# LazyNode
+#-----------------------------------------------------------------------------# Node
"""
- LazyNode(file::AbstractString)
- LazyNode(data::XML.Raw)
+ Node{S}
+
+In-memory DOM node parameterized on the string storage type `S` (typically `String`, or
+`SubString{String}` for zero-copy parsing). Every kind of XML node — `Element`, `Text`,
+`Comment`, `CData`, `ProcessingInstruction`, `Declaration`, `DTD`, `Document` — is
+represented by a single `Node{S}` whose [`NodeType`](@ref) determines which fields are
+populated.
-A Lazy representation of an XML node.
+ parse(xml, Node) # parse a string into a Node{String}
+ parse(xml, Node{SubString{String}}) # zero-copy variant
+ read(filename, Node) # read & parse a file
+
+Use the accessor functions ([`nodetype`](@ref), [`tag`](@ref), [`attributes`](@ref),
+[`value`](@ref), [`children`](@ref)) rather than the raw fields when navigating a tree.
+Integer indexing returns children (`node[1]`); string indexing returns attribute values
+(`node["class"]`).
"""
-mutable struct LazyNode <: AbstractXMLNode
- raw::Raw
- tag::Union{Nothing, String}
- attributes::Union{Nothing, OrderedDict{String, String}}
- value::Union{Nothing, String}
-end
-LazyNode(raw::Raw) = LazyNode(raw, nothing, nothing, nothing)
+struct Node{S}
+ nodetype::NodeType
+ tag::Union{Nothing, S}
+ attributes::Union{Nothing, Vector{Pair{S, S}}}
+ value::Union{Nothing, S}
+ children::Union{Nothing, Vector{Node{S}}}
-function Base.getproperty(o::LazyNode, x::Symbol)
- x === :raw && return getfield(o, :raw)
- x === :nodetype && return nodetype(o.raw)
- x === :tag && return isnothing(getfield(o, x)) ? setfield!(o, x, tag(o.raw)) : getfield(o, x)
- x === :attributes && return isnothing(getfield(o, x)) ? setfield!(o, x, attributes(o.raw)) : getfield(o, x)
- x === :value && return isnothing(getfield(o, x)) ? setfield!(o, x, value(o.raw)) : getfield(o, x)
- x === :depth && return depth(o.raw)
- x === :children && return LazyNode.(children(o.raw))
- error("type LazyNode has no field $(x)")
+ function Node{S}(nodetype::NodeType, tag, attributes, value, children) where {S}
+ if nodetype in (Text, Comment, CData, DTD)
+ isnothing(tag) && isnothing(attributes) && !isnothing(value) && isnothing(children) ||
+ error("$nodetype nodes only accept a value.")
+ elseif nodetype === Element
+ !isnothing(tag) && isnothing(value) ||
+ error("Element nodes require a tag and no value.")
+ elseif nodetype === Declaration
+ isnothing(tag) && isnothing(value) && isnothing(children) ||
+ error("Declaration nodes only accept attributes.")
+ elseif nodetype === ProcessingInstruction
+ !isnothing(tag) && isnothing(attributes) && isnothing(children) ||
+ error("ProcessingInstruction nodes require a tag and only accept a value.")
+ elseif nodetype === Document
+ isnothing(tag) && isnothing(attributes) && isnothing(value) ||
+ error("Document nodes only accept children.")
+ end
+ new{S}(nodetype, tag, attributes, value, children)
+ end
end
-Base.propertynames(o::LazyNode) = (:raw, :nodetype, :tag, :attributes, :value, :depth, :children)
-Base.show(io::IO, o::LazyNode) = _show_node(io, o)
+#-----------------------------------------------------------------------------# interface
+"""
+ nodetype(node) -> NodeType
+
+Return the [`NodeType`](@ref) of `node` (`Element`, `Text`, `Comment`, `CData`,
+`ProcessingInstruction`, `Declaration`, `DTD`, or `Document`).
+"""
+nodetype(o::Node) = o.nodetype
+
+"""
+ tag(node) -> Union{String, SubString{String}, Nothing}
+
+Return the tag name of `node`. Defined for `Element` (element name) and
+`ProcessingInstruction` (target name); returns `nothing` for other node types.
+"""
+tag(o::Node) = o.tag
+
+"""
+ attributes(node::Node) -> Union{Nothing, Attributes{String}}
+
+Return the attributes of an `Element` or `Declaration` node as an [`Attributes`](@ref) dict,
+or `nothing` if the node has no attributes.
+
+!!! note "Changed in v0.4"
+ In previous versions, `attributes` returned an `OrderedDict` from OrderedCollections.jl.
+ It now returns an [`Attributes`](@ref), an ordered `AbstractDict` backed by a
+ `Vector{Pair}`.
+"""
+attributes(o::Node) = isnothing(o.attributes) ? nothing : Attributes(o.attributes)
+
+"""
+ value(node) -> Union{String, SubString{String}, Nothing}
+
+Return the textual content of `node`. Defined for `Text`, `Comment`, `CData`, `DTD`, and
+`ProcessingInstruction`; returns `nothing` for `Element`, `Declaration`, and `Document`
+(use [`children`](@ref) for those).
+"""
+value(o::Node) = o.value
+
+"""
+ children(node) -> Vector{Node} or ()
+
+Return the child nodes of `node` in document order. Returns an empty tuple `()` for nodes
+that cannot have children (e.g. `Text`, `Comment`, `CData`).
+"""
+children(o::Node) = something(o.children, ())
+
+"""
+ is_simple(node) -> Bool
+
+Return `true` if `node` is an `Element` with no attributes and exactly one `Text` or
+`CData` child — i.e. the `<tag>content</tag>` pattern with no nested markup. See also
+[`simple_value`](@ref).
+"""
+is_simple(o::Node) = o.nodetype === Element &&
+ (isnothing(o.attributes) || isempty(o.attributes)) &&
+ !isnothing(o.children) && length(o.children) == 1 &&
+ o.children[1].nodetype in (Text, CData)
+
+"""
+ simple_value(node) -> String
+
+Return the textual content of a simple element (see [`is_simple`](@ref)). Errors if
+`node` is not simple.
+"""
+simple_value(o::Node) = is_simple(o) ? o.children[1].value :
+ error("`simple_value` is only defined for simple nodes.")
+
+"""
+ is_simple_value(node) -> Union{Nothing, String, SubString{String}}
+
+Combined predicate-and-accessor: return the simple text/CData value of `node` if it is a
+simple element (see [`is_simple`](@ref)), or `nothing` otherwise. Avoids the redundant
+tokenization that `is_simple(n) ? simple_value(n) : ...` does on `LazyNode`.
+"""
+is_simple_value(o::Node) = is_simple(o) ? o.children[1].value : nothing
-Base.read(io::IO, ::Type{LazyNode}) = LazyNode(read(io, Raw))
-Base.read(filename::AbstractString, ::Type{LazyNode}) = LazyNode(read(filename, Raw))
-Base.parse(x::AbstractString, ::Type{LazyNode}) = LazyNode(parse(x, Raw))
+#-----------------------------------------------------------------------------# tree navigation
-children(o::LazyNode) = LazyNode.(children(o.raw))
-parent(o::LazyNode) = LazyNode(parent(o.raw))
-depth(o::LazyNode) = depth(o.raw)
+"""
+ parent(child::Node, root::Node) -> Node
-Base.IteratorSize(::Type{LazyNode}) = Base.SizeUnknown()
-Base.eltype(::Type{LazyNode}) = LazyNode
+Return the parent of `child` within the tree rooted at `root`.
-function Base.iterate(o::LazyNode, state=o)
- n = next(state)
- return isnothing(n) ? nothing : (n, n)
+Since `Node` does not store parent pointers, this performs a tree search from `root`.
+Throws an error if `child` is not found or if `child === root`.
+"""
+function Base.parent(child::Node, root::Node)
+ child === root && error("Root node has no parent.")
+ result = _find_parent(child, root)
+ isnothing(result) && error("Node not found in tree.")
+ result
end
-function next(o::LazyNode)
- n = next(o.raw)
- isnothing(n) && return nothing
- n.type === RawElementClose ? next(LazyNode(n)) : LazyNode(n)
+# Depth-first search for `child` within `current`; returns the containing node or nothing.
+function _find_parent(child::Node, current::Node)
+ for c in children(current)
+ c === child && return current
+ result = _find_parent(child, c)
+ isnothing(result) || return result
+ end
+ nothing
end
-function prev(o::LazyNode)
- n = prev(o.raw)
- isnothing(n) && return nothing
- n.type === RawElementClose ? prev(LazyNode(n)) : LazyNode(n)
+
+"""
+ depth(child::Node, root::Node) -> Int
+
+Return the depth of `child` within the tree rooted at `root` (root has depth 0).
+
+Since `Node` does not store parent pointers, this performs a tree search from `root`.
+Throws an error if `child` is not found in the tree.
+"""
+function depth(child::Node, root::Node)
+ child === root && return 0
+ result = _find_depth(child, root, 0)
+ isnothing(result) && error("Node not found in tree.")
+ result
+end
+
+# Depth-first search returning the depth of `child` relative to `current` (where children
+# of `current` are at depth `d + 1`), or nothing if not found.
+function _find_depth(child::Node, current::Node, d::Int)
+ for c in children(current)
+ c === child && return d + 1
+ result = _find_depth(child, c, d + 1)
+ isnothing(result) || return result
+ end
+ nothing
end
-#-----------------------------------------------------------------------------# Node
"""
- Node(nodetype, tag, attributes, value, children)
- Node(node::Node; kw...) # copy node with keyword overrides
- Node(node::LazyNode) # un-lazy the LazyNode
+ siblings(child::Node, root::Node) -> Vector{Node}
+
+Return the siblings of `child` (other children of the same parent) within the tree rooted
+at `root`. The returned vector does not include `child` itself.
-A representation of an XML DOM node. For simpler construction, use `(::NodeType)(args...)`
+Throws an error if `child` is the root or is not found in the tree.
"""
-struct Node <: AbstractXMLNode
- nodetype::NodeType
- tag::Union{Nothing, String}
- attributes::Union{Nothing, OrderedDict{String, String}}
- value::Union{Nothing, String}
- children::Union{Nothing, Vector{Node}}
-
- function Node(nodetype::NodeType, tag=nothing, attributes=nothing, value=nothing, children=nothing)
- new(nodetype,
- isnothing(tag) ? nothing : string(tag),
- isnothing(attributes) ? nothing : OrderedDict(string(k) => string(v) for (k, v) in pairs(attributes)),
- isnothing(value) ? nothing : string(value),
- isnothing(children) ? nothing :
- children isa Node ? [children] :
- children isa Vector{Node} ? children :
- children isa Vector ? map(Node, children) :
- children isa Tuple ? map(Node, collect(children)) :
- [Node(children)]
- )
+function siblings(child::Node, root::Node)
+ p = parent(child, root)
+ [c for c in children(p) if c !== child]
+end
+
+include("xpath.jl")
+include("lazynode.jl")
+
+
+#-----------------------------------------------------------------------------# _to_node
+# Coerce a positional argument to a Node{String}: identity for nodes, wrap non-nodes as
+# Text. The middle method rejects non-String parameterizations to keep mixed-storage trees
+# from being silently constructed.
+_to_node(n::Node{String}) = n
+_to_node(n::Node) = throw(ArgumentError("Expected Node{String}, got $(typeof(n))"))
+_to_node(x) = Node{String}(Text, nothing, nothing, string(x), nothing)
+
+#-----------------------------------------------------------------------------# NodeType constructors
+# Make each NodeType variant callable as a constructor: `Element("div", ...)`,
+# `Text("hi")`, etc. Dispatches on `T` to validate args/kwargs and build the right Node.
+function (T::NodeType)(args...; attrs...)
+ S = String
+ if T in (Text, Comment, CData, DTD)
+ length(args) == 1 || error("$T nodes require exactly one value argument.")
+ !isempty(attrs) && error("$T nodes do not accept attributes.")
+ Node{S}(T, nothing, nothing, string(only(args)), nothing)
+ elseif T === Element
+ isempty(args) && error("Element nodes require at least a tag.")
+ t = string(first(args))
+ a = Pair{S,S}[String(k) => String(v) for (k, v) in pairs(attrs)]
+ c = Node{S}[_to_node(x) for x in args[2:end]]
+ Node{S}(T, t, a, nothing, c)
+ elseif T === Declaration
+ !isempty(args) && error("Declaration nodes only accept keyword attributes.")
+ a = isempty(attrs) ? nothing : [String(k) => String(v) for (k, v) in pairs(attrs)]
+ Node{S}(T, nothing, a, nothing, nothing)
+ elseif T === ProcessingInstruction
+ length(args) >= 1 || error("ProcessingInstruction nodes require a target.")
+ length(args) <= 2 || error("ProcessingInstruction nodes accept a target and optional content.")
+ !isempty(attrs) && error("ProcessingInstruction nodes do not accept attributes.")
+ t = string(args[1])
+ v = length(args) == 2 ? string(args[2]) : nothing
+ Node{S}(T, t, nothing, v, nothing)
+ elseif T === Document
+ !isempty(attrs) && error("Document nodes do not accept attributes.")
+ c = Node{S}[_to_node(x) for x in args]
+ Node{S}(T, nothing, nothing, nothing, c)
end
end
-function Node(o::Node, x...; kw...)
- attrs = !isnothing(kw) ?
- merge(
- OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)),
- isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes
- ) :
- o.attributes
- children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x))
- Node(o.nodetype, o.tag, attrs, o.value, children)
+#-----------------------------------------------------------------------------# equality
+# Treat `nothing` and an empty collection as equivalent so that an absent attribute /
+# children field compares equal to an explicitly empty one.
+_eq(::Nothing, ::Nothing) = true
+_eq(::Nothing, b) = isempty(b)
+_eq(a, ::Nothing) = isempty(a)
+_eq(a, b) = a == b
+
+# Attribute equality is order-insensitive per XML spec.
+function _attrs_eq(a, b)
+ a_empty = isnothing(a) || isempty(a)
+ b_empty = isnothing(b) || isempty(b)
+ a_empty && b_empty && return true
+ (a_empty != b_empty) && return false
+ length(a) != length(b) && return false
+ for p in a
+ p in b || return false
+ end
+ true
end
-function Node(node::LazyNode)
- nodetype = node.nodetype
- tag = node.tag
- attributes = node.attributes
- value = node.value
- c = XML.children(node)
- Node(nodetype, tag, attributes, value, isempty(c) ? nothing : map(Node, c))
+function Base.:(==)(a::Node, b::Node)
+ a.nodetype == b.nodetype &&
+ a.tag == b.tag &&
+ _attrs_eq(a.attributes, b.attributes) &&
+ a.value == b.value &&
+ _eq(a.children, b.children)
end
-Node(data::Raw) = Node(LazyNode(data))
+#-----------------------------------------------------------------------------# indexing
+Base.getindex(o::Node, i::Integer) = children(o)[i]
+Base.getindex(o::Node, ::Colon) = children(o)
+Base.lastindex(o::Node) = lastindex(children(o))
+Base.only(o::Node) = only(children(o))
+Base.length(o::Node) = length(children(o))
+
+function Base.get(o::Node, key::AbstractString, default)
+ isnothing(o.attributes) && return default
+ for (k, v) in o.attributes
+ k == key && return v
+ end
+ default
+end
-# Anything that's not Vector{UInt8} or a (Lazy)Node is converted to a Text Node
-Node(x) = Node(Text, nothing, nothing, string(x), nothing)
+const _MISSING_ATTR = gensym(:missing_attr)
-h(tag::Union{Symbol, String}, children...; kw...) = Node(Element, tag, kw, nothing, children)
-Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
-(o::Node)(children...; kw...) = Node(o, Node.(children)...; kw...)
+function Base.getindex(o::Node, key::AbstractString)
+ val = get(o, key, _MISSING_ATTR)
+ val === _MISSING_ATTR && throw(KeyError(key))
+ val
+end
-# NOT in-place for Text Nodes
-function escape!(o::Node, warn::Bool=true)
- if o.nodetype == Text
- warn && @warn "escape!() called on a Text Node creates a new node."
- return Text(escape(o.value))
+function Base.haskey(o::Node, key::AbstractString)
+ get(o, key, _MISSING_ATTR) !== _MISSING_ATTR
+end
+
+Base.keys(o::Node) = isnothing(o.attributes) ? () : first.(o.attributes)
+
+#-----------------------------------------------------------------------------# mutation
+function Base.setindex!(o::Node, val, i::Integer)
+ isnothing(o.children) && error("Node has no children.")
+ o.children[i] = _to_node(val)
+end
+
+function Base.setindex!(o::Node, val, key::AbstractString)
+ isnothing(o.attributes) && error("Node has no attributes.")
+ v = string(val)
+ for i in eachindex(o.attributes)
+ if first(o.attributes[i]) == key
+ o.attributes[i] = key => v
+ return v
+ end
end
- isnothing(o.children) && return o
- map!(x -> escape!(x, false), o.children, o.children)
- o
+ push!(o.attributes, key => v)
+ v
end
-function unescape!(o::Node, warn::Bool=true)
- if o.nodetype == Text
- warn && @warn "unescape!() called on a Text Node creates a new node."
- return Text(unescape(o.value))
+
+function Base.push!(a::Node, b)
+ isnothing(a.children) && error("Node does not accept children.")
+ push!(a.children, _to_node(b))
+ a
+end
+
+function Base.pushfirst!(a::Node, b)
+ isnothing(a.children) && error("Node does not accept children.")
+ pushfirst!(a.children, _to_node(b))
+ a
+end
+
+#-----------------------------------------------------------------------------# show (REPL)
+function Base.show(io::IO, o::Node)
+ nt = o.nodetype
+ print(io, nt)
+ if nt === Text
+ print(io, ' ', repr(o.value))
+ elseif nt === Element
+ print(io, " <", o.tag)
+ if !isnothing(o.attributes)
+ for (k, v) in o.attributes
+ print(io, ' ', k, '=', '"', v, '"')
+ end
+ end
+ print(io, '>')
+ n = length(children(o))
+ n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)")
+ elseif nt === DTD
+ print(io, " ')
+ elseif nt === Declaration
+ print(io, " ")
+ elseif nt === ProcessingInstruction
+ print(io, " ", o.tag)
+ !isnothing(o.value) && print(io, ' ', o.value)
+ print(io, "?>")
+ elseif nt === Comment
+ print(io, " <!--", o.value, "-->")
+ elseif nt === CData
+ print(io, " <![CDATA[", o.value, "]]>")
+ elseif nt === Document
+ n = length(children(o))
+ n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)")
end
- isnothing(o.children) && return o
- map!(x -> unescape!(x, false), o.children, o.children)
- o
end
+#-----------------------------------------------------------------------------# show (text/xml)
-Base.read(filename::AbstractString, ::Type{Node}) = Node(read(filename, Raw))
-Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw))
-Base.parse(x::AbstractString, ::Type{Node}) = Node(parse(x, Raw))
+# Write XML-escaped content directly to IO (single pass, no intermediate string)
+function _write_escaped(io::IO, s::String)
+ start = 1
+ i = 1
+ n = ncodeunits(s)
+ @inbounds while i <= n
+ b = codeunit(s, i)
+ esc = if b == UInt8('&'); "&amp;"
+ elseif b == UInt8('<'); "&lt;"
+ elseif b == UInt8('>'); "&gt;"
+ elseif b == UInt8('"'); "&quot;"
+ elseif b == UInt8('\''); "&apos;"
+ else
+ i += 1
+ continue
+ end
+ i > start && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (i - start) % UInt)
+ print(io, esc)
+ i += 1
+ start = i
+ end
+ start <= n && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (n - start + 1) % UInt)
+ nothing
+end
-Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val)
-Base.push!(a::Node, b::Node) = push!(a.children, b)
-Base.pushfirst!(a::Node, b::Node) = pushfirst!(a.children, b)
+# Cached indentation strings to avoid repeated allocation
+const _MAX_CACHED_INDENT = 64
+const _INDENT_STRINGS = [" " ^ n for n in 0:_MAX_CACHED_INDENT]
+@inline function _indent_str(n::Int)
+ 0 <= n <= _MAX_CACHED_INDENT && return @inbounds _INDENT_STRINGS[n + 1]
+ " " ^ n
+end
-Base.setindex!(o::Node, val, key::AbstractString) = (o.attributes[key] = string(val))
-Base.getindex(o::Node, val::AbstractString) = o.attributes[val]
-Base.haskey(o::Node, key::AbstractString) = isnothing(o.attributes) ? false : haskey(o.attributes, key)
-Base.keys(o::Node) = isnothing(o.attributes) ? () : keys(o.attributes)
+# Serialize `key="escaped-value"` pairs for an attributes vector (no leading space outside).
+# Uses byte-level `Base.write` instead of `print` to avoid the varargs-print dispatch
+# overhead that shows up under profile when an element has many attributes.
+function _print_attrs(io::IO, attributes)
+ isnothing(attributes) && return
+ for (k, v) in attributes
+ Base.write(io, UInt8(' '))
+ Base.write(io, k)
+ Base.write(io, UInt8('='))
+ Base.write(io, UInt8('"'))
+ _write_escaped(io, v)
+ Base.write(io, UInt8('"'))
+ end
+end
-Base.show(io::IO, o::Node) = _show_node(io, o)
+# Whitespace-only Text — emitted by the parser to round-trip source whitespace; pretty
+# printing regenerates indentation from the tree shape and drops these.
+@inline function _is_ignorable_text(node::Node)
+ node.nodetype === Text && !isnothing(node.value) && all(isspace, node.value)
+end
-#-----------------------------------------------------------------------------# Node Constructors
-function (T::NodeType)(args...; attr...)
- if T === Document
- !isempty(attr) && error("Document nodes do not have attributes.")
- Node(T, nothing, nothing, nothing, args)
- elseif T === DTD
- !isempty(attr) && error("DTD nodes only accept a value.")
- length(args) > 1 && error("DTD nodes only accept a value.")
- Node(T, nothing, nothing, only(args))
- elseif T === Declaration
- !isempty(args) && error("Declaration nodes only accept attributes")
- Node(T, nothing, attr)
- elseif T === ProcessingInstruction
- length(args) == 1 || error("ProcessingInstruction nodes require a tag and attributes.")
- Node(T, only(args), attr)
- elseif T === Comment
- !isempty(attr) && error("Comment nodes do not have attributes.")
- length(args) > 1 && error("Comment nodes only accept a single input.")
- Node(T, nothing, nothing, only(args))
- elseif T === CData
- !isempty(attr) && error("CData nodes do not have attributes.")
- length(args) > 1 && error("CData nodes only accept a single input.")
- Node(T, nothing, nothing, only(args))
- elseif T === Text
- !isempty(attr) && error("Text nodes do not have attributes.")
- length(args) > 1 && error("Text nodes only accept a single input.")
- Node(T, nothing, nothing, only(args))
- elseif T === Element
- tag = first(args)
- Node(T, tag, attr, nothing, args[2:end])
- else
- error("Unreachable reached while trying to create a Node via (::NodeType)(args...; kw...).")
+# Mixed content = at least one Text/CData child carrying actual (non-whitespace) data.
+# In that case the original whitespace is significant and we must not reformat.
+function _has_significant_text(children)
+ for c in children
+ nt = c.nodetype
+ if nt === Text
+ (!isnothing(c.value) && !all(isspace, c.value)) && return true
+ elseif nt === CData
+ return true
+ end
end
+ false
end
-#-----------------------------------------------------------------------------# !!! common !!!
-# Everything below here is common to all data structures
+# Main XML serializer. `depth` controls indentation; `preserve` propagates `xml:space=
+# "preserve"` semantics down the subtree so we don't reformat whitespace-sensitive content.
+function _write_xml(io::IO, node::Node, depth::Int=0, indent::Int=2, preserve::Bool=false)
+ pad = preserve ? "" : _indent_str(indent * depth)
+ nt = node.nodetype
+ if nt === Text
+ _write_escaped(io, node.value)
+ elseif nt === Element
+ # Check xml:space on this element
+ child_preserve = preserve
+ if !isnothing(node.attributes)
+ for (k, v) in node.attributes
+ k == "xml:space" && (child_preserve = v == "preserve")
+ end
+ end
+ Base.write(io, pad)
+ Base.write(io, UInt8('<'))
+ Base.write(io, node.tag)
+ _print_attrs(io, node.attributes)
+ ch = node.children
+ if isnothing(ch) || isempty(ch)
+ Base.write(io, UInt8('/'))
+ Base.write(io, UInt8('>'))
+ elseif length(ch) == 1 && only(ch).nodetype === Text
+ Base.write(io, UInt8('>'))
+ _write_xml(io, only(ch), 0, 0, child_preserve)
+ Base.write(io, UInt8('<'))
+ Base.write(io, UInt8('/'))
+ Base.write(io, node.tag)
+ Base.write(io, UInt8('>'))
+ else
+ # If real Text or any CData lives among the children, treat as mixed
+ # content and preserve the original layout. Otherwise pretty-print
+ # and skip whitespace-only Text children — those were emitted by the
+ # parser purely to round-trip source whitespace, and the writer
+ # regenerates indentation from the tree shape.
+ effective_preserve = child_preserve || _has_significant_text(ch)
+ if effective_preserve
+ Base.write(io, UInt8('>'))
+ else
+ Base.write(io, UInt8('>'))
+ Base.write(io, UInt8('\n'))
+ end
+ for child in ch
+ if !effective_preserve && _is_ignorable_text(child)
+ continue
+ end
+ _write_xml(io, child, depth + 1, indent, effective_preserve)
+ effective_preserve || Base.write(io, UInt8('\n'))
+ end
+ effective_preserve || Base.write(io, pad)
+ Base.write(io, UInt8('<'))
+ Base.write(io, UInt8('/'))
+ Base.write(io, node.tag)
+ Base.write(io, UInt8('>'))
+ end
+ elseif nt === Declaration
+ Base.write(io, pad)
+ Base.write(io, "<?xml"); _print_attrs(io, node.attributes); Base.write(io, "?>")
+ elseif nt === ProcessingInstruction
+ Base.write(io, pad)
+ Base.write(io, "<?")
+ Base.write(io, node.tag)
+ if !isnothing(node.value)
+ Base.write(io, UInt8(' '))
+ Base.write(io, node.value)
+ end
+ Base.write(io, "?>")
+ elseif nt === Comment
+ Base.write(io, pad)
+ Base.write(io, "<!--", node.value, "-->")
+ elseif nt === CData
+ Base.write(io, pad)
+ Base.write(io, "<![CDATA[", node.value, "]]>")
+ elseif nt === DTD
+ Base.write(io, pad)
+ Base.write(io, "<!DOCTYPE ", node.value, UInt8('>'))
+ elseif nt === Document
+ ch = node.children
+ if !isnothing(ch)
+ # Drop whitespace-only Text between top-level nodes when pretty
+ # printing (XML grammar disallows text at document level, so any
+ # such Text comes from inter-node whitespace in the source).
+ visible = preserve ? ch : filter(!_is_ignorable_text, ch)
+ n_visible = length(visible)
+ for (i, child) in enumerate(visible)
+ _write_xml(io, child, 0, indent, preserve)
+ i < n_visible && Base.write(io, UInt8('\n'))
+ end
+ end
+ end
+end
+Base.show(io::IO, ::MIME"text/xml", node::Node) = _write_xml(io, node)
-#-----------------------------------------------------------------------------# interface fallbacks
-nodetype(o) = o.nodetype
-tag(o) = o.tag
-attributes(o) = o.attributes
-value(o) = o.value
-children(o::T) where {T} = isnothing(o.children) ? () : o.children
+#-----------------------------------------------------------------------------# write / read
+write(node::Node; indentsize::Int=2) = (io = IOBuffer(); _write_xml(io, node, 0, indentsize); String(take!(io)))
+write(filename::AbstractString, node::Node; kw...) = open(io -> write(io, node; kw...), filename, "w")
+write(io::IO, node::Node; indentsize::Int=2) = _write_xml(io, node, 0, indentsize)
-depth(o) = missing
-parent(o) = missing
-next(o) = missing
-prev(o) = missing
+Base.read(filename::AbstractString, ::Type{Node}) = parse(read(filename, String), Node)
+Base.read(io::IO, ::Type{Node}) = parse(read(io, String), Node)
-is_simple(o) = nodetype(o) == Element && (isnothing(attributes(o)) || isempty(attributes(o))) &&
- length(children(o)) == 1 && nodetype(only(o)) in (Text, CData)
+#-----------------------------------------------------------------------------# parse
+Base.parse(::Type{Node}, xml::AbstractString) = parse(xml, Node)
-simple_value(o) = is_simple(o) ? value(only(o)) : error("`XML.simple_value` is only defined for simple nodes.")
+function Base.parse(xml::AbstractString, ::Type{Node})
+ _parse(String(xml), String, unescape)
+end
-Base.@deprecate_binding simplevalue simple_value
+function Base.parse(xml::AbstractString, ::Type{Node{SubString{String}}})
+ _parse(String(xml), SubString{String}, identity)
+end
-#-----------------------------------------------------------------------------# nodes_equal
-function nodes_equal(a, b)
- out = XML.tag(a) == XML.tag(b)
- out &= XML.nodetype(a) == XML.nodetype(b)
- out &= XML.attributes(a) == XML.attributes(b)
- out &= XML.value(a) == XML.value(b)
- out &= length(XML.children(a)) == length(XML.children(b))
- out &= all(nodes_equal(ai, bi) for (ai,bi) in zip(XML.children(a), XML.children(b)))
- return out
+# Convert a parser substring to the requested storage type — copy to a fresh String, or
+# keep the zero-copy SubString view.
+_to(::Type{String}, s::AbstractString) = String(s)
+_to(::Type{SubString{String}}, s::SubString{String}) = s
+
+# Collapse an empty Vector to `nothing` so Node fields store "absent" canonically.
+_nothingify(v::Vector) = isempty(v) ? nothing : v
+
+# Decode the raw bytes of a TEXT/ATTR_VALUE token into the parser's storage type. When the
+# tokenizer guarantees no `&` was seen (`has_entities=false`), we skip the entity-decode
+# pass entirely. The `convert_text=identity` specialization (SubString parse) skips the
+# runtime branch as well — both arms would return the same value.
+@inline _text_value(::Type{S}, raw, _, ::typeof(identity)) where {S} = _to(S, raw)
+@inline _text_value(::Type{S}, raw, has_entities, convert_text::F) where {S, F} =
+ has_entities ? convert_text(raw) : _to(S, raw)
+
+# Token-stream → Node{S} builder. `convert_text` is `unescape` for parsed content (with
+# entity decoding) and `identity` for zero-copy SubString parsing where the caller opts
+# to keep raw escapes.
+function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F}
+ tags = S[]
+ attrs_stack = Vector{Pair{S,S}}[]
+ children_stack = Vector{Vector{Node{S}}}()
+ push!(children_stack, Node{S}[])
+
+ pending_attr_name = SubString(xml, 1, 0)
+ decl_attrs = nothing
+ pending_pi_tag = SubString(xml, 1, 0)
+ pending_pi_value = nothing
+ in_close_tag = false
+
+ for token in tokenize(xml)
+ k = token.kind
+
+ if k === TokenKinds.TEXT
+ v = _text_value(S, token.raw, token.has_entities, convert_text)
+ push!(last(children_stack), Node{S}(Text, nothing, nothing, v, nothing))
+
+ elseif k === TokenKinds.OPEN_TAG
+ push!(tags, _to(S, tag_name(token)))
+ push!(attrs_stack, Pair{S,S}[])
+ push!(children_stack, Node{S}[])
+
+ elseif k === TokenKinds.SELF_CLOSE
+ t = pop!(tags)
+ a = pop!(attrs_stack)
+ pop!(children_stack)
+ push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, nothing))
+
+ elseif k === TokenKinds.TAG_CLOSE
+ in_close_tag && (in_close_tag = false)
+
+ elseif k === TokenKinds.CLOSE_TAG
+ close_name = tag_name(token)
+ isempty(tags) && error("Closing tag </$close_name> with no matching open tag.")
+ t = pop!(tags)
+ t == close_name || error("Mismatched tags: expected </$t>, got </$close_name>.")
+ a = pop!(attrs_stack)
+ c = pop!(children_stack)
+ push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, isempty(c) ? nothing : c))
+ in_close_tag = true
+
+ elseif k === TokenKinds.ATTR_NAME
+ pending_attr_name = token.raw
+
+ elseif k === TokenKinds.ATTR_VALUE
+ val = _text_value(S, attr_value(token), token.has_entities, convert_text)
+ name = _to(S, pending_attr_name)
+ if decl_attrs !== nothing
+ any(p -> first(p) == name, decl_attrs) && error("Duplicate attribute: $name")
+ push!(decl_attrs, name => val)
+ elseif !isempty(attrs_stack)
+ any(p -> first(p) == name, last(attrs_stack)) && error("Duplicate attribute: $name")
+ push!(last(attrs_stack), name => val)
+ end
+
+ elseif k === TokenKinds.XML_DECL_OPEN
+ decl_attrs = Pair{S,S}[]
+
+ elseif k === TokenKinds.XML_DECL_CLOSE
+ a = isempty(decl_attrs) ? nothing : decl_attrs
+ push!(last(children_stack), Node{S}(Declaration, nothing, a, nothing, nothing))
+ decl_attrs = nothing
+
+ elseif k === TokenKinds.COMMENT_CONTENT
+ push!(last(children_stack), Node{S}(Comment, nothing, nothing, _to(S, token.raw), nothing))
+
+ elseif k === TokenKinds.CDATA_CONTENT
+ push!(last(children_stack), Node{S}(CData, nothing, nothing, _to(S, token.raw), nothing))
+
+ elseif k === TokenKinds.DOCTYPE_CONTENT
+ push!(last(children_stack), Node{S}(DTD, nothing, nothing, _to(S, lstrip(token.raw)), nothing))
+
+ elseif k === TokenKinds.PI_OPEN
+ pending_pi_tag = pi_target(token)
+ pending_pi_value = nothing
+
+ elseif k === TokenKinds.PI_CONTENT
+ content = strip(token.raw)
+ pending_pi_value = isempty(content) ? nothing : _to(S, content)
+
+ elseif k === TokenKinds.PI_CLOSE
+ push!(last(children_stack), Node{S}(ProcessingInstruction, _to(S, pending_pi_tag), nothing, pending_pi_value, nothing))
+ end
+ end
+
+ !isempty(tags) && error("Unclosed tags: $(join(tags, ", "))")
+ doc_children = only(children_stack)
+ Node{S}(Document, nothing, nothing, nothing, isempty(doc_children) ? nothing : doc_children)
end
-Base.:(==)(a::AbstractXMLNode, b::AbstractXMLNode) = nodes_equal(a, b)
+#-----------------------------------------------------------------------------# h (HTML/XML element builder)
+"""
+ h(tag, children...; attrs...)
+ h.tag(children...; attrs...)
-#-----------------------------------------------------------------------------# parse
-Base.parse(::Type{T}, str::AbstractString) where {T <: AbstractXMLNode} = parse(str, T)
+Convenience constructor for `Element` nodes.
-#-----------------------------------------------------------------------------# indexing
-Base.getindex(o::Union{Raw, AbstractXMLNode}) = o
-Base.getindex(o::Union{Raw, AbstractXMLNode}, i::Integer) = children(o)[i]
-Base.getindex(o::Union{Raw, AbstractXMLNode}, ::Colon) = children(o)
-Base.lastindex(o::Union{Raw, AbstractXMLNode}) = lastindex(children(o))
-
-Base.only(o::Union{Raw, AbstractXMLNode}) = only(children(o))
-
-Base.length(o::AbstractXMLNode) = length(children(o))
-
-#-----------------------------------------------------------------------------# printing
-function _show_node(io::IO, o)
- printstyled(io, typeof(o), ' '; color=:light_black)
- !ismissing(depth(o)) && printstyled(io, "(depth=", depth(o), ") ", color=:light_black)
- printstyled(io, nodetype(o); color=:light_green)
- if o.nodetype === Text
- printstyled(io, ' ', repr(value(o)))
- elseif o.nodetype === Element
- printstyled(io, " <", tag(o), color=:light_cyan)
- _print_attrs(io, o; color=:light_yellow)
- printstyled(io, '>', color=:light_cyan)
- _print_n_children(io, o)
- elseif o.nodetype === DTD
- printstyled(io, " <!DOCTYPE ", value(o), '>', color=:light_cyan)
- elseif o.nodetype === Declaration
- printstyled(io, " <?xml?>", color=:light_cyan)
- elseif o.nodetype === ProcessingInstruction
- printstyled(io, " <?", tag(o), color=:light_cyan)
- _print_attrs(io, o; color=:light_yellow)
- printstyled(io, "?>", color=:light_cyan)
- elseif o.nodetype === Comment
- printstyled(io, " <!--", value(o), "-->", color=:light_cyan)
- elseif o.nodetype === CData
- printstyled(io, " <![CDATA[", value(o), "]]>", color=:light_cyan)
- elseif o.nodetype === Document
- _print_n_children(io, o)
- elseif o.nodetype === UNKNOWN
- printstyled(io, "Unknown", color=:light_cyan)
- _print_n_children(io, o)
- else
- error("Unreachable reached")
+ h("div", "hello"; class="main") # <div class="main">hello</div>
+ h.div("hello"; class="main") # same thing
+"""
+function h(tag::Union{Symbol, AbstractString}, children...; attrs...)
+ t = String(tag)
+ a = Pair{String,String}[String(k) => String(v) for (k, v) in pairs(attrs)]
+ c = Node{String}[_to_node(x) for x in children]
+ Node{String}(Element, t, a, nothing, c)
+end
+
+Base.getproperty(::typeof(h), tag::Symbol) = h(tag)
+
+function (o::Node)(args...; attrs...)
+ o.nodetype === Element || error("Only Element nodes are callable.")
+ old_children = something(o.children, ())
+ old_attrs = isnothing(o.attributes) ? () : (Symbol(k) => v for (k, v) in o.attributes)
+ h(o.tag, old_children..., args...; old_attrs..., attrs...)
+end
+
+#-----------------------------------------------------------------------------# DTD parsing
+struct ElementDecl
+ name::String
+ content::String # "EMPTY", "ANY", or content model like "(#PCDATA)" or "(a,b,c)*"
+end
+
+struct AttDecl
+ element::String
+ name::String
+ type::String # "CDATA", "ID", "(val1|val2)", "NOTATION (a|b)", etc.
+ default::String # "#REQUIRED", "#IMPLIED", "#FIXED \"val\"", or "\"val\""
+end
+
+struct EntityDecl
+ name::String
+ value::Union{Nothing, String} # replacement text (internal entities)
+ external_id::Union{Nothing, String} # "SYSTEM \"uri\"" or "PUBLIC \"pubid\" \"uri\""
+ parameter::Bool
+end
+
+struct NotationDecl
+ name::String
+ external_id::String
+end
+
+struct ParsedDTD
+ root::String
+ system_id::Union{Nothing, String}
+ public_id::Union{Nothing, String}
+ elements::Vector{ElementDecl}
+ attributes::Vector{AttDecl}
+ entities::Vector{EntityDecl}
+ notations::Vector{NotationDecl}
+end
+
+# DTD parsing helpers — each returns (parsed_piece, new_pos) so calls compose.
+
+# A byte that can appear in an XML Name (letters, digits, `_`, `-`, `.`, `:`).
+@inline _dtd_is_name_char(c::Char) =
+ ('a' <= c <= 'z') || ('A' <= c <= 'Z') || ('0' <= c <= '9') ||
+ c == '_' || c == '-' || c == '.' || c == ':'
+
+# Advance past any whitespace.
+function _dtd_skip_ws(s, pos)
+ while pos <= ncodeunits(s) && isspace(s[pos])
+ pos += 1
end
+ pos
end
-function _print_attrs(io::IO, o; color=:normal)
- attr = attributes(o)
- isnothing(attr) && return nothing
- for (k,v) in attr
- # printstyled(io, ' ', k, '=', '"', v, '"'; color)
- print(io, ' ', k, '=', '"', v, '"')
+# Read an XML Name token; errors if no name characters are present.
+function _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ start = pos
+ while pos <= ncodeunits(s) && _dtd_is_name_char(s[pos])
+ pos += 1
end
+ start == pos && error("Expected name at position $pos in DTD")
+ SubString(s, start, pos - 1), pos
end
-function _print_n_children(io::IO, o::Node)
- n = length(children(o))
- text = n == 0 ? "" : n == 1 ? " (1 child)" : " ($n children)"
- printstyled(io, text, color=:light_black)
-end
-_print_n_children(io::IO, o) = nothing
-
-#-----------------------------------------------------------------------------# write_xml
-write(x; kw...) = (io = IOBuffer(); write(io, x; kw...); String(take!(io)))
-
-write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w")
-
-function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1)
- indent = ' ' ^ indentsize
- nodetype = XML.nodetype(x)
- tag = XML.tag(x)
- value = XML.value(x)
- children = XML.children(x)
-
- padding = indent ^ max(0, depth - 1)
- !ctx[end] && print(io, padding)
-
- if nodetype === Text
- print(io, value)
-
- elseif nodetype === Element
- push!(ctx, ctx[end])
- update_ctx!(ctx, x)
- print(io, '<', tag)
- _print_attrs(io, x)
- print(io, isempty(children) ? '/' : "", '>')
- if !isempty(children)
- if length(children) == 1 && XML.nodetype(only(children)) === Text
- write(io, only(children), ctx; indentsize=0)
- print(io, "</", tag, '>')
- else
- !ctx[end] && println(io)
- foreach(children) do child
- write(io, child, ctx; indentsize, depth=depth + 1)
- !ctx[end] && println(io)
- end
- print(io, !ctx[end] ? padding : "", "</", tag, '>')
+
+# Read a `"..."` or `'...'` string and return the contents without the surrounding quotes.
+function _dtd_read_quoted(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ q = s[pos]
+ (q == '"' || q == '\'') || error("Expected quoted string at position $pos in DTD")
+ pos += 1
+ start = pos
+ while pos <= ncodeunits(s) && s[pos] != q
+ pos += 1
+ end
+ val = SubString(s, start, pos - 1)
+ pos += 1
+ val, pos
+end
+
+# Read a balanced parenthesized expression (e.g. `(a|b|(c,d))`), returning the full
+# substring including the outer `(` and `)`. Skips over quoted strings inside.
+function _dtd_read_parens(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ s[pos] == '(' || error("Expected '(' at position $pos in DTD")
+ depth = 1
+ start = pos
+ pos += 1
+ while pos <= ncodeunits(s) && depth > 0
+ c = s[pos]
+ if c == '('
+ depth += 1
+ elseif c == ')'
+ depth -= 1
+ elseif c == '"' || c == '\''
+ pos += 1
+ while pos <= ncodeunits(s) && s[pos] != c
+ pos += 1
end
end
- pop!(ctx)
+ pos += 1
+ end
+ SubString(s, start, pos - 1), pos
+end
- elseif nodetype === DTD
- print(io, "<!DOCTYPE ", value, '>')
+# Advance past the next `>` that terminates a markup declaration, ignoring `>` inside
+# quoted strings.
+function _dtd_skip_to_close(s, pos)
+ while pos <= ncodeunits(s) && s[pos] != '>'
+ c = s[pos]
+ if c == '"' || c == '\''
+ pos += 1
+ while pos <= ncodeunits(s) && s[pos] != c
+ pos += 1
+ end
+ end
+ pos += 1
+ end
+ pos <= ncodeunits(s) ? pos + 1 : pos
+end
- elseif nodetype === Declaration
- print(io, "<?xml"); _print_attrs(io, x); print(io, "?>")
+# Parse `<!ELEMENT name content>` — content is either a name (EMPTY/ANY) or a parens
+# group with an optional `*`/`+`/`?` quantifier appended.
+function _dtd_parse_element(s, pos)
+ name, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ if s[pos] == '('
+ content, pos = _dtd_read_parens(s, pos)
+ if pos <= ncodeunits(s) && s[pos] in ('*', '+', '?')
+ content = string(content, s[pos])
+ pos += 1
+ end
+ else
+ content, pos = _dtd_read_name(s, pos)
+ end
+ pos = _dtd_skip_to_close(s, pos)
+ ElementDecl(String(name), String(content)), pos
+end
- elseif nodetype === ProcessingInstruction
- print(io, "<?", tag)
- _print_attrs(io, x)
- print(io, "?>")
+# Parse `<!ATTLIST element (name type default)+>` — emits one AttDecl per attribute.
+function _dtd_parse_attlist(s, pos)
+ element, pos = _dtd_read_name(s, pos)
+ atts = AttDecl[]
+ while true
+ pos = _dtd_skip_ws(s, pos)
+ (pos > ncodeunits(s) || s[pos] == '>') && break
- elseif nodetype === Comment
- print(io, "<!--", value, "-->")
+ name, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
- elseif nodetype === CData
- print(io, "<![CDATA[", value, "]]>")
+ # Attribute type
+ if s[pos] == '('
+ atype, pos = _dtd_read_parens(s, pos)
+ else
+ atype, pos = _dtd_read_name(s, pos)
+ if atype == "NOTATION"
+ pos = _dtd_skip_ws(s, pos)
+ parens, pos = _dtd_read_parens(s, pos)
+ atype = string("NOTATION ", parens)
+ end
+ end
+ pos = _dtd_skip_ws(s, pos)
- elseif nodetype === Document
- foreach(children) do child
- write(io, child, ctx; indentsize)
- !ctx[end] && println(io)
+ # Default declaration
+ if s[pos] == '#'
+ pos += 1
+ keyword, pos = _dtd_read_name(s, pos)
+ if keyword == "FIXED"
+ pos = _dtd_skip_ws(s, pos)
+ val, pos = _dtd_read_quoted(s, pos)
+ default = string("#FIXED \"", val, "\"")
+ else
+ default = string("#", keyword)
+ end
+ elseif s[pos] == '"' || s[pos] == '\''
+ val, pos = _dtd_read_quoted(s, pos)
+ default = string("\"", val, "\"")
+ else
+ error("Expected default declaration at position $pos in DTD")
end
+ push!(atts, AttDecl(String(element), String(name), String(atype), default))
+ end
+ pos <= ncodeunits(s) && s[pos] == '>' && (pos += 1)
+ atts, pos
+end
+
+# Parse `<!ENTITY name "value">` or `<!ENTITY % name SYSTEM "uri">`. `%` marks a
+# parameter entity (referenced as `%name;` in DTDs only).
+function _dtd_parse_entity(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ parameter = false
+ if pos <= ncodeunits(s) && s[pos] == '%'
+ parameter = true
+ pos += 1
+ end
+ name, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+
+ value = nothing
+ external_id = nothing
+ if s[pos] == '"' || s[pos] == '\''
+ v, pos = _dtd_read_quoted(s, pos)
+ value = String(v)
+ else
+ keyword, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ if keyword == "SYSTEM"
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("SYSTEM \"", uri, "\"")
+ elseif keyword == "PUBLIC"
+ pubid, pos = _dtd_read_quoted(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+ else
+ error("Expected SYSTEM, PUBLIC, or quoted value in ENTITY declaration")
+ end
+ end
+ pos = _dtd_skip_to_close(s, pos)
+ EntityDecl(String(name), value, external_id, parameter), pos
+end
+# Parse `<!NOTATION name SYSTEM "uri">` / `<!NOTATION name PUBLIC "pubid" "uri">`.
+function _dtd_parse_notation(s, pos)
+ name, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ keyword, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ if keyword == "SYSTEM"
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("SYSTEM \"", uri, "\"")
+ elseif keyword == "PUBLIC"
+ pubid, pos = _dtd_read_quoted(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+ if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')
+ uri, pos = _dtd_read_quoted(s, pos)
+ external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"")
+ else
+ external_id = string("PUBLIC \"", pubid, "\"")
+ end
else
- error("Unreachable case reached during XML.write")
+ error("Expected SYSTEM or PUBLIC in NOTATION declaration")
end
+ pos = _dtd_skip_to_close(s, pos)
+ NotationDecl(String(name), external_id), pos
+end
+
+"""
+ parse_dtd(value::AbstractString) -> ParsedDTD
+ parse_dtd(node::Node) -> ParsedDTD
+Parse a DTD value string (from a `DTD` node) into structured declarations.
+"""
+function parse_dtd(value::AbstractString)
+ s = String(value)
+ pos = 1
+
+ root, pos = _dtd_read_name(s, pos)
+ pos = _dtd_skip_ws(s, pos)
+
+ # External ID
+ system_id = nothing
+ public_id = nothing
+ if pos <= ncodeunits(s) && _dtd_is_name_char(s[pos])
+ keyword, kpos = _dtd_read_name(s, pos)
+ if keyword == "SYSTEM"
+ pos = kpos
+ uri, pos = _dtd_read_quoted(s, pos)
+ system_id = String(uri)
+ elseif keyword == "PUBLIC"
+ pos = kpos
+ pubid, pos = _dtd_read_quoted(s, pos)
+ public_id = String(pubid)
+ pos = _dtd_skip_ws(s, pos)
+ if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'')
+ uri, pos = _dtd_read_quoted(s, pos)
+ system_id = String(uri)
+ end
+ end
+ end
+
+ elements = ElementDecl[]
+ attributes = AttDecl[]
+ entities = EntityDecl[]
+ notations = NotationDecl[]
+
+ # Internal subset
+ pos = _dtd_skip_ws(s, pos)
+ if pos <= ncodeunits(s) && s[pos] == '['
+ pos += 1
+ while pos <= ncodeunits(s)
+ pos = _dtd_skip_ws(s, pos)
+ pos > ncodeunits(s) && break
+ s[pos] == ']' && break
+
+ rest = SubString(s, pos)
+ if startswith(rest, "<!--")
+ i = findnext("-->", s, pos + 4)
+ isnothing(i) && error("Unterminated comment in DTD")
+ pos = last(i) + 1
+ elseif startswith(rest, "<?")
+ i = findnext("?>", s, pos + 2)
+ isnothing(i) && error("Unterminated PI in DTD")
+ pos = last(i) + 1
+ elseif startswith(rest, "
+ SELF_CLOSE # />
+ ATTR_NAME # attribute name
+ ATTR_VALUE # "value" or 'value' (with quotes in raw)
+
+ # CDATA sections
+ CDATA_OPEN # <![CDATA[
+
+ # Comments
+ COMMENT_OPEN # <!--
+
+ # Processing instructions
+ PI_OPEN # <?
+
+ # XML declaration ()
+ XML_DECL_OPEN # <?xml
+ # (reuses ATTR_NAME / ATTR_VALUE for pseudo-attributes)
+
+ # DOCTYPE
+ DOCTYPE_OPEN # <!DOCTYPE
+ end
+end
+
+#-----------------------------------------------------------------------# Token
+# `has_entities` records whether the raw bytes contain a `&`. It is set by the readers for
+# `TEXT` and `ATTR_VALUE` (where entity references can appear) and stays `false` for every
+# other token kind. The downstream parser uses it to skip `unescape`'s redundant byte scan
+# when no entities are present.
+#
+# Field order matters: `has_entities` lives in the alignment padding that would otherwise
+# sit between the 1-byte `kind` and the 24-byte `raw`. This keeps `sizeof(Token{String})`
+# at 32 bytes instead of 40, which matters because tokens are allocated by the million
+# during parse.
+struct Token{S <: AbstractString}
+ kind::TokenKinds.Kind
+ has_entities::Bool
+ raw::SubString{S}
+end
+
+# Backwards-compatible constructor for the many internal call sites that emit non-entity
+# tokens (markup, names, close tokens, etc.).
+@inline Token(kind::TokenKinds.Kind, raw::SubString{S}) where {S} = Token{S}(kind, false, raw)
+
+function Base.show(io::IO, t::Token)
+ print(io, t.kind, ": ", repr(String(t.raw)))
+end
+
+#-----------------------------------------------------------------------# Tokenizer mode
+@enum Mode::UInt8 begin
+ M_DEFAULT # normal content mode
+ M_TAG # inside open tag, reading attributes
+ M_TAG_VALUE # expecting quoted attribute value
+ M_CLOSE_TAG # inside close tag, expecting >
+ M_XML_DECL # inside Tokenizer
+
+Return a lazy iterator of `Token`s over the XML string `xml`.
+"""
+struct Tokenizer{S <: AbstractString}
+ data::S
+ start::Int
+end
+
+tokenize(xml::AbstractString) = Tokenizer(xml, 1)
+tokenize(xml::AbstractString, pos::Int) = StatefulTokenizer(Tokenizer(xml, pos))
+
+# Lightweight mutable holder that drives the immutable `Tokenizer`'s iterate protocol with
+# a single state field — avoids the `Union{VS,Nothing}` field and per-iteration tuple
+# storage that `Iterators.Stateful` carries.
+mutable struct StatefulTokenizer{S <: AbstractString}
+ const t::Tokenizer{S}
+ state::TokenizerState{S}
+ done::Bool
+end
+
+StatefulTokenizer(t::Tokenizer{S}) where {S <: AbstractString} =
+ StatefulTokenizer{S}(t, TokenizerState(t.start, M_DEFAULT, no_token(t.data)), false)
+
+Base.IteratorSize(::Type{<:StatefulTokenizer}) = Base.SizeUnknown()
+Base.eltype(::Type{StatefulTokenizer{S}}) where {S} = Token{S}
+
+@inline function Base.iterate(st::StatefulTokenizer, _ = nothing)
+ st.done && return nothing
+ r = iterate(st.t, st.state)
+ if r === nothing
+ st.done = true
+ return nothing
+ end
+ st.state = r[2]
+ (r[1], nothing)
+end
+
+function Base.show(io::IO, t::Tokenizer)
+ n = ncodeunits(t.data)
+ print(io, "Tokenizer(")
+ t.start > 1 && print(io, t.start, "/")
+ print(io, Base.format_bytes(n), ")")
+end
+
+Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown()
+Base.eltype(::Type{Tokenizer{S}}) where {S} = Token{S}
+
+function Base.iterate(t::Tokenizer, st::TokenizerState=TokenizerState(t.start, M_DEFAULT, no_token(t.data)))
+ (; data) = t
+ (; pending, pos, mode) = st
+
+ if has_pending(st)
+ return (pending, TokenizerState(pos, mode, no_token(data)))
+ end
+ iseof(data, pos) && return nothing
+
+ if mode == M_DEFAULT
+ peek(data, pos) == UInt8('<') ? read_markup(data, pos) : read_text(data, pos)
+ elseif mode == M_TAG || mode == M_XML_DECL
+ read_in_tag(data, pos, mode)
+ elseif mode == M_TAG_VALUE || mode == M_XML_DECL_VALUE
+ read_attr_value(data, pos, mode)
+ elseif mode == M_CLOSE_TAG
+ read_close_tag_end(data, pos)
+ elseif mode == M_COMMENT
+ read_comment_body(data, pos)
+ elseif mode == M_CDATA
+ read_cdata_body(data, pos)
+ elseif mode == M_PI
+ read_pi_body(data, pos)
+ else # M_DOCTYPE
+ read_doctype_body(data, pos)
+ end
+end
+
+#-----------------------------------------------------------------------# Internal helpers
+# Check if pos is past the end of data (pos is a 1-based codeunit index)
+@inline iseof(data::AbstractString, pos::Int)::Bool = pos > ncodeunits(data)
+# Read the byte at pos without bounds checking (caller must ensure `!iseof(data, pos)`)
+@inline peek(data::AbstractString, pos::Int)::UInt8 = @inbounds codeunit(data, pos)
+# Check if pos + offset is within bounds, i.e. whether `peek(data, pos + offset)` is safe
+@inline canpeek(data::AbstractString, pos::Int, offset::Int)::Bool = pos + offset <= ncodeunits(data)
+
+# Branch-free classifier for bytes that may appear in an XML name (letters, digits,
+# '_', '-', '.', ':'). Built once as a 256-entry tuple indexed by byte value + 1.
+const NAME_BYTE_TABLE = let flags = falses(256)
+    allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-.:"
+    for c in allowed
+        flags[UInt8(c) + 1] = true
+    end
+    NTuple{256,Bool}(flags)
+end
+@inline is_name_byte(b::UInt8)::Bool = @inbounds NAME_BYTE_TABLE[b + 1]
+
+# Check if byte is one of the four XML whitespace characters
+# (space, tab, newline, carriage return).
+@inline function is_whitespace(b::UInt8)::Bool
+    b in (UInt8(' '), UInt8('\t'), UInt8('\n'), UInt8('\r'))
+end
+
+# Return the index of the first non-whitespace byte at or after `pos`
+# (or `ncodeunits(data) + 1` when only whitespace remains).
+@inline function skip_whitespace(data::AbstractString, pos::Int)::Int
+    n = ncodeunits(data)
+    @inbounds while pos <= n && is_whitespace(codeunit(data, pos))
+        pos += 1
+    end
+    pos
+end
+
+# Advance pos past a quoted string (single or double quotes); `pos` must point at the
+# opening quote. Returns the index just past the closing quote.
+function skip_quoted(data::AbstractString, pos::Int)::Int
+    start = pos
+    q = @inbounds peek(data, pos)  # remember which quote character opened the string
+    pos += 1
+    @inbounds while !iseof(data, pos)
+        peek(data, pos) == q && return pos + 1
+        pos += 1
+    end
+    # Consistency fix: route through `err` (ArgumentError with position context) like
+    # every other tokenizer failure, instead of a bare `error(...)` with no location.
+    err("unterminated quoted string", start)
+end
+
+# Throw a tokenizer error with position context (noinline to keep error paths out of hot code).
+# `pos` is a codeunit index into the original data.
+@noinline err(msg::AbstractString, pos::Int) = throw(ArgumentError("XML tokenizer error at position $pos: $msg"))
+
+#-----------------------------------------------------------------------# Text and markup
+# Read text content up to the next '<'. Uses `findnext` (memchr-backed for `String`) to
+# find the end-of-text delimiter, then scans for `&` only within the text region — a full
+# document `findnext('&', ...)` would be O(doc_size) per text token and degrade to
+# O(doc_size²) on entity-free documents.
+function read_text(data::AbstractString, pos::Int)
+    start = pos
+    n = ncodeunits(data)
+    lt_idx = findnext('<', data, pos)
+    # No '<' means the text runs to the end of the document.
+    end_pos = isnothing(lt_idx) ? n + 1 : lt_idx
+    raw = @inbounds SubString(data, start, prevind(data, end_pos))
+    # Flag entity references so downstream decoding can be skipped for plain text.
+    has_amp = occursin('&', raw)
+    tok = Token{typeof(data)}(TokenKinds.TEXT, has_amp, raw)
+    (tok, TokenizerState(end_pos, M_DEFAULT, no_token(data)))
+end
+
+# Dispatch on the character after '<' to the appropriate reader. `pos` points at '<';
+# each reader receives the index after its sigil plus `start` for error reporting.
+function read_markup(data::AbstractString, pos::Int)
+    start = pos
+    next = pos + 1 # index just after '<'
+    iseof(data, next) && err("unexpected end of input after '<'", start)
+    b = peek(data, next)
+    b == UInt8('!') && return read_bang(data, next + 1, start)
+    b == UInt8('?') && return read_pi_start(data, next + 1, start)
+    b == UInt8('/') && return read_close_tag_start(data, next + 1, start)
+    read_open_tag_start(data, next, start)
+end
+
+#-----------------------------------------------------------------------# The '>' that closes a '</name' tag
+# After the close-tag name, only whitespace may precede the final '>'.
+function read_close_tag_end(data::AbstractString, pos::Int)
+    i = skip_whitespace(data, pos)
+    iseof(data, i) && err("unterminated close tag", i)
+    peek(data, i) == UInt8('>') || err("expected '>'", i)
+    tok = Token(TokenKinds.TAG_CLOSE, @inbounds SubString(data, i, i))
+    (tok, TokenizerState(i + 1, M_DEFAULT, no_token(data)))
+end
+
+#-----------------------------------------------------------------------# Attributes (shared by M_TAG and M_XML_DECL)
+# Read the next attribute name or tag-close delimiter (>, />, ?>)
+function read_in_tag(data::AbstractString, pos::Int, mode::Mode)
+    pos = skip_whitespace(data, pos)
+    iseof(data, pos) && err("unterminated tag", pos)
+
+    b = peek(data, pos)
+    is_decl = (mode == M_XML_DECL)
+
+    # Check for end delimiters
+    if is_decl
+        # XML declarations close with '?>' only.
+        if b == UInt8('?') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('>')
+            tok = Token(TokenKinds.XML_DECL_CLOSE, @inbounds SubString(data, pos, pos + 1))
+            return (tok, TokenizerState(pos + 2, M_DEFAULT, no_token(data)))
+        end
+    else
+        # Ordinary tags close with '>' or self-close with '/>'.
+        if b == UInt8('>')
+            tok = Token(TokenKinds.TAG_CLOSE, @inbounds SubString(data, pos, pos))
+            return (tok, TokenizerState(pos + 1, M_DEFAULT, no_token(data)))
+        end
+        if b == UInt8('/') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('>')
+            tok = Token(TokenKinds.SELF_CLOSE, @inbounds SubString(data, pos, pos + 1))
+            return (tok, TokenizerState(pos + 2, M_DEFAULT, no_token(data)))
+        end
+    end
+
+    # Attribute name
+    name_start = pos
+    @inbounds while !iseof(data, pos) && is_name_byte(peek(data, pos))
+        pos += 1
+    end
+    name_end = pos - 1
+    # Empty name means the byte at `pos` was neither a delimiter nor a name byte.
+    name_start > name_end && err("expected attribute name or tag close", pos)
+
+    # Consume '=' and surrounding whitespace (not part of any token)
+    pos = skip_whitespace(data, pos)
+    (!iseof(data, pos) && peek(data, pos) == UInt8('=')) || err("expected '=' after attribute name", pos)
+    pos += 1
+    pos = skip_whitespace(data, pos)
+
+    # The value reader runs next, in the value mode matching the enclosing construct.
+    next_state = is_decl ? M_XML_DECL_VALUE : M_TAG_VALUE
+    tok = Token(TokenKinds.ATTR_NAME, @inbounds SubString(data, name_start, name_end))
+    (tok, TokenizerState(pos, next_state, no_token(data)))
+end
+
+# Read a quoted attribute value (including the quotes). Same shape as `read_text`: use
+# `findnext` for the closing quote (memchr-backed for `String`), then a bounded `occursin`
+# over the value range for entity detection so we never scan past the quote.
+function read_attr_value(data::AbstractString, pos::Int, mode::Mode)
+    iseof(data, pos) && err("expected attribute value", pos)
+
+    q = peek(data, pos)
+    (q == UInt8('"') || q == UInt8('\'')) || err("expected quoted attribute value", pos)
+
+    start = pos
+    pos += 1 # skip opening quote
+    quote_char = Char(q)
+    # Closing quote must match the opening one; no escaping exists inside XML values.
+    close_idx = findnext(quote_char, data, pos)
+    isnothing(close_idx) && err("unterminated attribute value", start)
+    # Value range is [pos, close_idx - 1]; entity check is bounded to this view.
+    inner = @inbounds SubString(data, pos, prevind(data, close_idx))
+    has_amp = occursin('&', inner)
+    pos = close_idx + 1 # one past the closing quote (always ASCII)
+
+    # Return to attribute-name mode for the enclosing construct.
+    next_state = (mode == M_XML_DECL_VALUE) ? M_XML_DECL : M_TAG
+    raw = @inbounds SubString(data, start, pos - 1)
+    tok = Token{typeof(data)}(TokenKinds.ATTR_VALUE, has_amp, raw)
+    (tok, TokenizerState(pos, next_state, no_token(data)))
+end
+
+#-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE)
+# Scan for '-->' and emit comment content + close tokens.
+function read_comment_body(data::AbstractString, pos::Int)
+    start = pos
+    # Substring search replaces the byte-at-a-time loop for consistency with
+    # read_text/read_attr_value (fast search, memchr-backed for `String`).
+    r = findnext("-->", data, pos)
+    isnothing(r) && err("unterminated comment", start)
+    close_start = first(r)
+    content_end = prevind(data, close_start)      # last byte of the comment content
+    next_pos = last(r) + 1                        # '-->' is ASCII, so +1 is valid
+    # CONTENT is emitted now; CLOSE is stashed as the pending token for the next step.
+    pending = Token(TokenKinds.COMMENT_CLOSE, @inbounds SubString(data, close_start, last(r)))
+    tok = Token(TokenKinds.COMMENT_CONTENT, @inbounds SubString(data, start, content_end))
+    (tok, TokenizerState(next_pos, M_DEFAULT, pending))
+end
+
+# Scan for ']]>' and emit CDATA content + close tokens.
+function read_cdata_body(data::AbstractString, pos::Int)
+    start = pos
+    # Substring search replaces the byte-at-a-time loop for consistency with
+    # read_text/read_attr_value (fast search, memchr-backed for `String`).
+    r = findnext("]]>", data, pos)
+    isnothing(r) && err("unterminated CDATA section", start)
+    close_start = first(r)
+    content_end = prevind(data, close_start)      # last byte of the CDATA content
+    next_pos = last(r) + 1                        # ']]>' is ASCII, so +1 is valid
+    # CONTENT is emitted now; CLOSE is stashed as the pending token for the next step.
+    pending = Token(TokenKinds.CDATA_CLOSE, @inbounds SubString(data, close_start, last(r)))
+    tok = Token(TokenKinds.CDATA_CONTENT, @inbounds SubString(data, start, content_end))
+    (tok, TokenizerState(next_pos, M_DEFAULT, pending))
+end
+
+# Scan for '?>' and emit PI content + close tokens.
+function read_pi_body(data::AbstractString, pos::Int)
+    start = pos
+    # Substring search replaces the byte-at-a-time loop for consistency with
+    # read_text/read_attr_value (fast search, memchr-backed for `String`).
+    r = findnext("?>", data, pos)
+    isnothing(r) && err("unterminated processing instruction", start)
+    close_start = first(r)
+    content_end = prevind(data, close_start)      # last byte of the PI content
+    next_pos = last(r) + 1                        # '?>' is ASCII, so +1 is valid
+    # CONTENT is emitted now; CLOSE is stashed as the pending token for the next step.
+    pending = Token(TokenKinds.PI_CLOSE, @inbounds SubString(data, close_start, last(r)))
+    tok = Token(TokenKinds.PI_CONTENT, @inbounds SubString(data, start, content_end))
+    (tok, TokenizerState(next_pos, M_DEFAULT, pending))
+end
+
+# Scan DOCTYPE body, handling nested brackets, quotes, and comments. The body ends at the
+# first '>' that is outside both the internal subset ('[' ... ']') and any quoted string.
+function read_doctype_body(data::AbstractString, pos::Int)
+    start = pos
+    depth = 0  # '[' ... ']' nesting depth of the internal subset
+    @inbounds while !iseof(data, pos)
+        b = peek(data, pos)
+        # Detect a comment opener by looking back for the preceding "<!" — the "--" we
+        # are standing on is only a comment when it completes "<!--".
+        if b == UInt8('-') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('-') &&
+                pos >= 3 &&
+                codeunit(data, pos - 1) == UInt8('!') &&
+                codeunit(data, pos - 2) == UInt8('<')
+            # Skip over an embedded comment so its contents can't end the DOCTYPE early.
+            pos += 2 # skip "--"
+            while !iseof(data, pos)
+                if peek(data, pos) == UInt8('-') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('-') &&
+                        canpeek(data, pos, 2) && peek(data, pos + 2) == UInt8('>')
+                    pos += 3 # skip "-->"
+                    break
+                end
+                pos += 1
+            end
+        elseif b == UInt8('"') || b == UInt8('\'')
+            # Quoted literals (system/public IDs, entity values) may contain any byte.
+            pos = skip_quoted(data, pos)
+        elseif b == UInt8('[')
+            depth += 1
+            pos += 1
+        elseif b == UInt8(']')
+            depth -= 1
+            pos += 1
+        elseif b == UInt8('>') && depth == 0
+            # End of the DOCTYPE: emit CONTENT now, stash CLOSE as the pending token.
+            content_end = prevind(data, pos)
+            close_start = pos
+            pos += 1
+            pending = Token(TokenKinds.DOCTYPE_CLOSE, @inbounds SubString(data, close_start, pos - 1))
+            tok = Token(TokenKinds.DOCTYPE_CONTENT, @inbounds SubString(data, start, content_end))
+            return (tok, TokenizerState(pos, M_DEFAULT, pending))
+        else
+            pos += 1
+        end
+    end
+    err("unterminated DOCTYPE", start)
+end
+
+#-----------------------------------------------------------------------# Utility functions
+
+"""
+    tag_name(token::Token) -> SubString{String}
+
+Extract the element name from an `OPEN_TAG` or `CLOSE_TAG` token by stripping the
+leading markup characters. Throws `ArgumentError` for any other token kind.
+"""
+function tag_name(token::Token)
+    if token.kind == TokenKinds.OPEN_TAG
+        @inbounds SubString(token.raw, 2, ncodeunits(token.raw)) # skip "<"
+    elseif token.kind == TokenKinds.CLOSE_TAG
+        @inbounds SubString(token.raw, 3, ncodeunits(token.raw)) # skip "</"
+    else
+        throw(ArgumentError("tag_name requires OPEN_TAG or CLOSE_TAG, got $(token.kind)"))
+    end
+end
+
+"""
+ attr_value(token::Token) -> SubString{String}
+
+Strip the surrounding quotes from an `ATTR_VALUE` token.
+"""
+function attr_value(token::Token)
+ token.kind == TokenKinds.ATTR_VALUE ||
+ throw(ArgumentError("attr_value requires ATTR_VALUE, got $(token.kind)"))
+ @inbounds SubString(token.raw, 2, prevind(token.raw, lastindex(token.raw)))
+end
+
+"""
+ pi_target(token::Token) -> SubString{String}
+
+Extract the target name from a `PI_OPEN` or `XML_DECL_OPEN` token.
+"""
+function pi_target(token::Token)
+ (token.kind == TokenKinds.PI_OPEN || token.kind == TokenKinds.XML_DECL_OPEN) ||
+ throw(ArgumentError("pi_target requires PI_OPEN or XML_DECL_OPEN, got $(token.kind)"))
+ @inbounds SubString(token.raw, 3, ncodeunits(token.raw)) # skip ''
+end
+
+end # module XMLTokenizer
diff --git a/src/dtd.jl b/src/dtd.jl
deleted file mode 100644
index 58299f0..0000000
--- a/src/dtd.jl
+++ /dev/null
@@ -1,141 +0,0 @@
-# This is all a work in progress
-
-#-----------------------------------------------------------------------------# position_after
-function position_after(needle::Vector{UInt8}, haystack::Vector{UInt8}, i)
- x = findnext(needle, haystack, i)
- isnothing(x) ? nothing : x[end] + 1
-end
-
-position_after(needle::String, haystack::Vector{UInt8}, i) = position_after(Vector{UInt8}(needle), haystack, i)
-
-
-#-----------------------------------------------------------------------------# DeclaredElement
-struct DeclaredElement
- name::String
- content::String # "ANY", "EMPTY", or "(children...)"
- function DeclaredElement(name, content)
- content in ("ANY", "EMPTY") || (content[1] == '(' && content[end] == ')') ||
- error("DeclaredElement `content` must be 'ANY', 'EMPTY', or '(children...)'. Got $content.")
- new(name, content)
- end
-end
-Base.show(io::IO, o::DeclaredElement) = print(io, "")
-
-function get_declared_elements(data::Vector{UInt8})
- i = position_after("")
-
-
-function get_declared_attributes(data)
- i = position_after("")
-end
-
-function get_declared_entities(data)
- i = position_after(" println(io, " ", x), o.elements)
- printstyled(io, " DeclaredAttributes (", length(o.attributes), ")\n", color=:light_green)
- foreach(x -> println(io, " ", x), o.attributes)
- printstyled(io, " DeclaredEntities (", length(o.entities), ")\n", color=:light_green)
- foreach(x -> println(io, " ", x), o.entities)
-end
-
-
-function DTDBody(data::Vector{UInt8}, file = false)
- file && @goto isfile
- i = position_after(" _as_substring(_decode_attr(result[1])))
+ end
+ isempty(attrs) ? nothing : Attributes(attrs)
+end
+
+"""
+ get(n::LazyNode, key::AbstractString, default)
+
+Return the value of attribute `key` on `n`, or `default` if absent. Walks the token stream
+once — no `Attributes` allocation — so this is the recommended way to read a single
+attribute from a `LazyNode`. Use [`eachattribute`](@ref) to stream all attribute pairs
+without allocating, or [`attributes`](@ref) for the materialized dict.
+"""
+function Base.get(n::LazyNode, key::AbstractString, default)
+ n.nodetype in (Element, Declaration) || return default
+ iter = _lazy_tokenizer(n)
+ iterate(iter) # skip OPEN_TAG or XML_DECL_OPEN
+ for tok in iter
+ tok.kind === TokenKinds.ATTR_NAME || return default
+ if tok.raw == key
+ result = iterate(iter)
+ result === nothing && return default
+ return _decode_attr(result[1])
+ else
+ iterate(iter) # skip value
+ end
+ end
+ default
+end
+
+#-----------------------------------------------------------------------------# eachattribute
+# Stateful iterator over attribute `name => value` pairs. `iter` is the token stream,
+# already advanced past the opening token; `done` flags exhaustion.
+struct LazyAttrIterator{I}
+    iter::I
+    done::Base.RefValue{Bool}
+end
+
+# Attribute count is unknown up front; values may be decoded (`String`) or raw SubStrings.
+Base.IteratorSize(::Type{<:LazyAttrIterator}) = Base.SizeUnknown()
+Base.eltype(::Type{<:LazyAttrIterator}) = Pair{SubString{String}, Union{SubString{String}, String}}
+
+"""
+ eachattribute(n::LazyNode)
+
+Lazy iterator yielding `name => value` pairs for the attributes of `n` (an `Element` or
+`Declaration`). Does not allocate an [`Attributes`](@ref) dict or intermediate vector;
+suitable for hot paths that only need to scan attributes.
+
+For a single attribute by name, prefer `get(n, key, default)` — it short-circuits as soon
+as the match is found.
+"""
+function eachattribute(n::LazyNode)
+ iter = _lazy_tokenizer(n)
+ is_attrs = n.nodetype === Element || n.nodetype === Declaration
+ is_attrs && iterate(iter) # skip OPEN_TAG / XML_DECL_OPEN
+ LazyAttrIterator{typeof(iter)}(iter, Ref(!is_attrs))
+end
+
+# Pull one ATTR_NAME/ATTR_VALUE pair off the token stream; anything else ends iteration.
+function Base.iterate(it::LazyAttrIterator, _ = nothing)
+    it.done[] && return nothing
+    name_r = iterate(it.iter)
+    if name_r === nothing || name_r[1].kind !== TokenKinds.ATTR_NAME
+        it.done[] = true
+        return nothing
+    end
+    val_r = iterate(it.iter)
+    if val_r === nothing
+        it.done[] = true
+        return nothing
+    end
+    (name_r[1].raw => _decode_attr(val_r[1]), nothing)
+end
+
+function Base.getindex(n::LazyNode, key::AbstractString)
+    # The sentinel distinguishes "attribute absent" from any real attribute value.
+    v = get(n, key, _MISSING_ATTR)
+    v === _MISSING_ATTR ? throw(KeyError(key)) : v
+end
+
+# An attribute exists iff the lookup does not fall through to the sentinel default.
+Base.haskey(n::LazyNode, key::AbstractString) = get(n, key, _MISSING_ATTR) !== _MISSING_ATTR
+
+# Collect attribute names only; values are consumed from the stream but discarded.
+function Base.keys(n::LazyNode)
+    n.nodetype in (Element, Declaration) || return ()
+    toks = _lazy_tokenizer(n)
+    iterate(toks) # consume OPEN_TAG / XML_DECL_OPEN
+    names = SubString{String}[]
+    while true
+        r = iterate(toks)
+        (r === nothing || r[1].kind !== TokenKinds.ATTR_NAME) && break
+        push!(names, r[1].raw)
+        iterate(toks) # discard the matching ATTR_VALUE
+    end
+    names
+end
+
+#-----------------------------------------------------------------------------# children
+# Only Documents and Elements have children; everything else yields an empty tuple.
+function children(n::LazyNode{S}) where {S}
+    n.nodetype === Document || n.nodetype === Element || return ()
+    children!(LazyNode{S}[], n)
+end
+
+"""
+ children!(buf::Vector{LazyNode{S}}, n::LazyNode{S}) -> buf
+
+Collect children of `n` into `buf` (cleared first) and return it. Lets callers reuse a
+single buffer across many nodes — useful when streaming through siblings (e.g. XLSX row
+iteration) to avoid one `Vector` allocation per node.
+"""
+function children!(buf::Vector{LazyNode{S}}, n::LazyNode{S}) where {S}
+ empty!(buf)
+ nt = n.nodetype
+ if nt === Document
+ return _lazy_collect_children!(buf, n.data, _lazy_tokenizer(n))
+ elseif nt !== Element
+ return buf
+ end
+ iter = _lazy_tokenizer(n)
+ for tok in iter
+ tok.kind === TokenKinds.SELF_CLOSE && return buf
+ tok.kind === TokenKinds.TAG_CLOSE && break
+ end
+ _lazy_collect_children!(buf, n.data, iter)
+end
+
+# Append a LazyNode for each top-level item produced by `iter`, stopping at the enclosing
+# element's CLOSE_TAG (or end of stream). Compound constructs are skipped past so the
+# stream is left positioned after each child's closing token.
+function _lazy_collect_children!(result::Vector{LazyNode{S}}, data::S, iter) where {S <: AbstractString}
+    for tok in iter
+        k = tok.kind
+        if k === TokenKinds.TEXT
+            push!(result, LazyNode(data, tok, Text))
+        elseif k === TokenKinds.OPEN_TAG
+            push!(result, LazyNode(data, tok, Element))
+            _lazy_skip_element!(iter) # don't descend: skip the whole subtree
+        elseif k === TokenKinds.COMMENT_OPEN
+            push!(result, LazyNode(data, tok, Comment))
+            _lazy_skip_until!(iter, TokenKinds.COMMENT_CLOSE)
+        elseif k === TokenKinds.CDATA_OPEN
+            push!(result, LazyNode(data, tok, CData))
+            _lazy_skip_until!(iter, TokenKinds.CDATA_CLOSE)
+        elseif k === TokenKinds.PI_OPEN
+            push!(result, LazyNode(data, tok, ProcessingInstruction))
+            _lazy_skip_until!(iter, TokenKinds.PI_CLOSE)
+        elseif k === TokenKinds.XML_DECL_OPEN
+            push!(result, LazyNode(data, tok, Declaration))
+            _lazy_skip_until!(iter, TokenKinds.XML_DECL_CLOSE)
+        elseif k === TokenKinds.DOCTYPE_OPEN
+            push!(result, LazyNode(data, tok, DTD))
+            _lazy_skip_until!(iter, TokenKinds.DOCTYPE_CLOSE)
+        elseif k === TokenKinds.CLOSE_TAG
+            break # parent element ended
+        end
+    end
+    result
+end
+
+# Skip the remainder of an element whose OPEN_TAG was just consumed. Tracks nesting
+# depth so inner elements do not end the scan early.
+function _lazy_skip_element!(iter)
+    depth = 1
+    for tok in iter
+        k = tok.kind
+        if k === TokenKinds.OPEN_TAG
+            depth += 1
+        elseif k === TokenKinds.SELF_CLOSE
+            depth -= 1
+            depth == 0 && return
+        elseif k === TokenKinds.CLOSE_TAG
+            depth -= 1
+            if depth == 0
+                iterate(iter) # consume trailing TAG_CLOSE (the final '>')
+                return
+            end
+        end
+    end
+end
+
+# Drain tokens until `target` has been consumed (or the stream ends).
+function _lazy_skip_until!(iter, target::TokenKinds.Kind)
+    while true
+        r = iterate(iter)
+        (r === nothing || r[1].kind === target) && return
+    end
+end
+
+# Codeunit index (into the parent string) of the last byte of a token's raw SubString.
+_token_end(tok) = tok.raw.offset + ncodeunits(tok.raw)
+
+# Advance until a token of kind `close_kind` appears; return its end position.
+function _scan_to_close(iter, close_kind::TokenKinds.Kind)
+    while true
+        r = iterate(iter)
+        r === nothing && error("Could not find closing token")
+        r[1].kind === close_kind && return _token_end(r[1])
+    end
+end
+
+#-----------------------------------------------------------------------------# sourcetext
+"""
+    sourcetext(n::LazyNode) -> SubString{String}
+
+Return the original source text of the node as a `SubString`, with no parsing, escaping,
+or reformatting. This is the zero-copy counterpart of [`write`](@ref) for lazy nodes.
+"""
+function sourcetext(n::LazyNode)
+    nt = n.nodetype
+    start = _lazy_pos(n)
+    if nt === Element
+        iter = _lazy_tokenizer(n)
+        # First pass the element's own open tag: a SELF_CLOSE already ends the span.
+        for tok in iter
+            tok.kind === TokenKinds.SELF_CLOSE && return SubString(n.data, start, _token_end(tok))
+            tok.kind === TokenKinds.TAG_CLOSE && break
+        end
+        # Track nesting depth until this element's own close tag is reached.
+        depth = 1
+        for tok in iter
+            k = tok.kind
+            if k === TokenKinds.OPEN_TAG
+                depth += 1
+            elseif k === TokenKinds.SELF_CLOSE
+                depth -= 1
+            elseif k === TokenKinds.CLOSE_TAG
+                depth -= 1
+                if depth == 0
+                    # The CLOSE_TAG token is '</name'; the following TAG_CLOSE is '>'.
+                    result = iterate(iter)
+                    result === nothing && error("Could not find closing '>'")
+                    return SubString(n.data, start, _token_end(result[1]))
+                end
+            end
+        end
+        error("Could not find closing tag")
+    elseif nt === Comment
+        return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.COMMENT_CLOSE))
+    elseif nt === CData
+        return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.CDATA_CLOSE))
+    elseif nt === ProcessingInstruction
+        return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.PI_CLOSE))
+    elseif nt === Declaration
+        return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.XML_DECL_CLOSE))
+    elseif nt === DTD
+        return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.DOCTYPE_CLOSE))
+    elseif nt === Text
+        # A Text node's token already spans its full source range.
+        return n.token.raw
+    elseif nt === Document
+        # The document's source is the entire backing string.
+        return SubString(n.data)
+    end
+end
+
+#-----------------------------------------------------------------------------# write
+"""
+    write(n::LazyNode; normalize::Bool=false, indentsize::Int=2) -> String
+    write(io::IO, n::LazyNode; normalize::Bool=false, indentsize::Int=2)
+    write(filename::AbstractString, n::LazyNode; normalize::Bool=false, indentsize::Int=2)
+
+Serialize a `LazyNode`. The default (`normalize=false`) emits the node's original source
+bytes via [`sourcetext`](@ref) — fast and zero-copy, but any whitespace between tags in
+the source is preserved verbatim.
+
+Passing `normalize=true` round-trips through a `Node` tree, collapsing incidental source
+whitespace and pretty-printing with `indentsize` spaces per level.
+"""
+function write(n::LazyNode; normalize::Bool=false, indentsize::Int=2)
+    if normalize
+        return write(parse(String(sourcetext(n)), Node); indentsize)
+    end
+    String(sourcetext(n))
+end
+
+function write(io::IO, n::LazyNode; normalize::Bool=false, indentsize::Int=2)
+    if normalize
+        write(io, parse(String(sourcetext(n)), Node); indentsize)
+    else
+        Base.write(io, sourcetext(n))
+    end
+end
+
+function write(filename::AbstractString, n::LazyNode; normalize::Bool=false, indentsize::Int=2)
+    open(filename, "w") do io
+        write(io, n; normalize, indentsize)
+    end
+end
+
+#-----------------------------------------------------------------------------# eachchildnode
+# Stateful child iterator: `iter` is the token stream positioned at the start of the
+# parent's content; `done` flags exhaustion (set up front for childless nodes).
+struct LazyChildIterator{S <: AbstractString, I}
+    data::S
+    iter::I
+    done::Base.RefValue{Bool}
+end
+
+# Child count is unknown without scanning; yields LazyNodes over the same backing string.
+Base.IteratorSize(::Type{<:LazyChildIterator}) = Base.SizeUnknown()
+Base.eltype(::Type{LazyChildIterator{S,I}}) where {S,I} = LazyNode{S}
+
+"""
+ eachchildnode(n::LazyNode)
+
+Return a lazy iterator over the children of `n`, yielding one [`LazyNode`](@ref) at a time
+without collecting them all into a vector.
+
+See also [`children`](@ref), which returns a `Vector{LazyNode}`.
+"""
+function eachchildnode(n::LazyNode{S}) where {S}
+ nt = n.nodetype
+ iter = _lazy_tokenizer(n)
+ if nt === Document
+ return LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(false))
+ elseif nt === Element
+ for tok in iter
+ if tok.kind === TokenKinds.SELF_CLOSE
+ return LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(true))
+ elseif tok.kind === TokenKinds.TAG_CLOSE
+ return LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(false))
+ end
+ end
+ end
+ LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(true))
+end
+
+# Yield the next child: wrap the opening token of each construct in a LazyNode, then skip
+# the rest of that construct so the stream is positioned at the next sibling.
+function Base.iterate(ci::LazyChildIterator, _ = nothing)
+    ci.done[] && return nothing
+    for tok in ci.iter
+        k = tok.kind
+        if k === TokenKinds.TEXT
+            return (LazyNode(ci.data, tok, Text), nothing)
+        elseif k === TokenKinds.OPEN_TAG
+            node = LazyNode(ci.data, tok, Element)
+            _lazy_skip_element!(ci.iter) # skip the whole subtree, not just the tag
+            return (node, nothing)
+        elseif k === TokenKinds.COMMENT_OPEN
+            node = LazyNode(ci.data, tok, Comment)
+            _lazy_skip_until!(ci.iter, TokenKinds.COMMENT_CLOSE)
+            return (node, nothing)
+        elseif k === TokenKinds.CDATA_OPEN
+            node = LazyNode(ci.data, tok, CData)
+            _lazy_skip_until!(ci.iter, TokenKinds.CDATA_CLOSE)
+            return (node, nothing)
+        elseif k === TokenKinds.PI_OPEN
+            node = LazyNode(ci.data, tok, ProcessingInstruction)
+            _lazy_skip_until!(ci.iter, TokenKinds.PI_CLOSE)
+            return (node, nothing)
+        elseif k === TokenKinds.XML_DECL_OPEN
+            node = LazyNode(ci.data, tok, Declaration)
+            _lazy_skip_until!(ci.iter, TokenKinds.XML_DECL_CLOSE)
+            return (node, nothing)
+        elseif k === TokenKinds.DOCTYPE_OPEN
+            node = LazyNode(ci.data, tok, DTD)
+            _lazy_skip_until!(ci.iter, TokenKinds.DOCTYPE_CLOSE)
+            return (node, nothing)
+        elseif k === TokenKinds.CLOSE_TAG || k === TokenKinds.TAG_CLOSE
+            # Parent element ended: no more children.
+            ci.done[] = true
+            return nothing
+        end
+    end
+    ci.done[] = true
+    return nothing
+end
+
+#-----------------------------------------------------------------------------# is_simple / simple_value
+# A "simple" node is an attribute-free Element whose single child is Text or CData.
+function is_simple(n::LazyNode)
+    n.nodetype === Element || return false
+    a = attributes(n)
+    if a !== nothing && !isempty(a)
+        return false
+    end
+    kids = children(n)
+    length(kids) == 1 && kids[1].nodetype in (Text, CData)
+end
+
+# Return the text/CData payload of a simple node; errors (rather than returning false)
+# when `n` does not satisfy the same conditions `is_simple` checks.
+function simple_value(n::LazyNode)
+    msg = "`simple_value` is only defined for simple nodes."
+    n.nodetype === Element || error(msg)
+    a = attributes(n)
+    (a !== nothing && !isempty(a)) && error(msg)
+    kids = children(n)
+    (length(kids) == 1 && kids[1].nodetype in (Text, CData)) || error(msg)
+    value(kids[1])
+end
+
+# Single-pass combined predicate+accessor: returns the simple text/CData value, or
+# `nothing` if `n` is not a simple element. Avoids the double tokenization of
+# `is_simple(n) ? simple_value(n) : ...`.
+function is_simple_value(n::LazyNode)
+    n.nodetype === Element || return nothing
+    iter = _lazy_tokenizer(n)
+    iterate(iter) # skip OPEN_TAG
+    found_close = false
+    # Inspect the single token after OPEN_TAG: only an immediate TAG_CLOSE keeps the
+    # element simple. The loop runs at most one iteration — any other token (ATTR_NAME,
+    # SELF_CLOSE, ...) hits the unconditional `return nothing` in the body.
+    for tok in iter
+        k = tok.kind
+        k === TokenKinds.TAG_CLOSE && (found_close = true; break)
+        return nothing # attributes (ATTR_NAME), self-close, or anything else => not simple
+    end
+    found_close || return nothing
+    result = iterate(iter)
+    isnothing(result) && return nothing
+    tok = result[1]
+    k = tok.kind
+    if k === TokenKinds.TEXT
+        # Simple iff the text is immediately followed by this element's close tag.
+        nxt = iterate(iter)
+        (isnothing(nxt) || nxt[1].kind !== TokenKinds.CLOSE_TAG) && return nothing
+        return _decode(tok)
+    elseif k === TokenKinds.CDATA_OPEN
+        # Expect exactly CDATA_CONTENT, CDATA_CLOSE, then this element's close tag.
+        r = iterate(iter)
+        (isnothing(r) || r[1].kind !== TokenKinds.CDATA_CONTENT) && return nothing
+        content = r[1].raw
+        r = iterate(iter)
+        (isnothing(r) || r[1].kind !== TokenKinds.CDATA_CLOSE) && return nothing
+        r = iterate(iter)
+        (isnothing(r) || r[1].kind !== TokenKinds.CLOSE_TAG) && return nothing
+        return content
+    end
+    nothing
+end
+
+#-----------------------------------------------------------------------------# indexing
+# Indexing and length delegate to a freshly collected `children(n)` vector; each call
+# re-walks the token stream, so cache `children(n)` for repeated access to the same node.
+Base.getindex(n::LazyNode, i::Integer) = children(n)[i]
+Base.getindex(n::LazyNode, ::Colon) = children(n)
+Base.lastindex(n::LazyNode) = lastindex(children(n))
+Base.only(n::LazyNode) = only(children(n))
+Base.length(n::LazyNode) = length(children(n))
+
+#-----------------------------------------------------------------------------# parse / read
+# Both argument orders are supported; input is converted to `String` so that tokens can
+# be zero-copy SubStrings of a single backing buffer.
+Base.parse(::Type{LazyNode}, xml::AbstractString) = parse(xml, LazyNode)
+Base.parse(xml::AbstractString, ::Type{LazyNode}) = LazyNode(String(xml), Document)
+
+# Read the whole file/stream into memory, then tokenize lazily from there.
+Base.read(filename::AbstractString, ::Type{LazyNode}) = parse(read(filename, String), LazyNode)
+Base.read(io::IO, ::Type{LazyNode}) = parse(read(io, String), LazyNode)
+
+#-----------------------------------------------------------------------------# show
+# Compact one-line display: "Lazy <NodeType>" followed by an XML-flavored preview.
+# NOTE(review): several string literals in this method were garbled/unterminated in the
+# extracted source (the DTD, Declaration, Comment, and CData branches); the markup
+# literals below are reconstructed from the surviving structure — verify against the
+# repository before merging.
+function Base.show(io::IO, n::LazyNode)
+    nt = n.nodetype
+    print(io, "Lazy ", nt)
+    if nt === Text
+        print(io, ' ', repr(value(n)))
+    elseif nt === Element
+        print(io, " <", tag(n))
+        attrs = attributes(n)
+        if !isnothing(attrs)
+            for (k, v) in attrs
+                print(io, ' ', k, '=', '"', v, '"')
+            end
+        end
+        print(io, '>')
+    elseif nt === DTD
+        print(io, " <!DOCTYPE ", value(n), '>')
+    elseif nt === Declaration
+        print(io, " <?xml")
+        attrs = attributes(n)
+        if !isnothing(attrs)
+            for (k, v) in attrs
+                print(io, ' ', k, '=', '"', v, '"')
+            end
+        end
+        print(io, "?>")
+    elseif nt === ProcessingInstruction
+        print(io, " <?", tag(n))
+        v = value(n)
+        !isnothing(v) && print(io, ' ', v)
+        print(io, "?>")
+    elseif nt === Comment
+        print(io, " <!--", value(n), "-->")
+    elseif nt === CData
+        print(io, " <![CDATA[", value(n), "]]>")
+    elseif nt === Document
+        n_ch = length(children(n))
+        n_ch > 0 && print(io, n_ch == 1 ? " (1 child)" : " ($n_ch children)")
+    end
+end
diff --git a/src/raw.jl b/src/raw.jl
deleted file mode 100644
index 29d0a10..0000000
--- a/src/raw.jl
+++ /dev/null
@@ -1,568 +0,0 @@
-#-----------------------------------------------------------------------------# RawType
-"""
- RawType:
- - RawText # text
- - RawComment #
- - RawCData #
- - RawDeclaration #
- - RawProcessingInstruction #
- - RawDTD #
- - RawElementOpen #
- - RawElementClose #
- - RawElementSelfClosed #
- - RawDocument # Something to initialize with (not really used)
-"""
-@enum(RawType, RawDocument, RawText, RawComment, RawCData, RawProcessingInstruction,
- RawDeclaration, RawDTD, RawElementOpen, RawElementClose, RawElementSelfClosed)
-
-@inline nodetype(x::RawType) =
- x === RawElementOpen ? Element :
- x === RawElementClose ? Element :
- x === RawElementSelfClosed ? Element :
- x === RawText ? Text :
- x === RawComment ? Comment :
- x === RawCData ? CData :
- x === RawDeclaration ? Declaration :
- x === RawDTD ? DTD :
- x === RawProcessingInstruction ? ProcessingInstruction :
- x === RawDocument ? Document :
- nothing
-
-#-----------------------------------------------------------------------------# Raw
-"""
- Raw(filename::String)
-
-Create an iterator over raw chunks of data in an XML file. Each chunk of data represents one of:
-
- - RawDocument # Only used to initialize the iterator state.
- - RawText # text
- - RawComment #
- - RawCData #
- - RawDeclaration #
- - RawProcessingInstruction #
- - RawDTD #
- - RawElementOpen #
- - RawElementClose #
- - RawElementSelfClosed #
-
-Useful functions:
-
- - view(o::Raw) --> view of the Vector{UInt8} chunk.
- - String(o::Raw) --> String of the chunk.
- - next(o::Raw) --> Raw of the next chunk (or `nothing`).
- - prev(o::Raw) --> Raw of the previous chunk (or `nothing`).
- - tag(o::Raw) --> String of the tag name (or `nothing`).
- - attributes(o::Raw) --> OrderedDict{String, String} of the attributes (or `nothing`).
- - value(o::Raw) --> String of the value (or `nothing`).
- - children(o::Raw) --> Vector{Raw} of the children (or `nothing`).
- - parent(o::Raw) --> Raw of the parent (or `nothing`)
- - depth(o::Raw) --> Int of the depth of the node in the XML DOM.
-"""
-struct Raw
- type::RawType
- depth::Int
- pos::Int
- len::Int
- data::Vector{UInt8}
- ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context)
- has_xml_space::Bool # Whether data contains `xml:space` attribute at least once
-end
-function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false])
- needle = Vector{UInt8}("xml:space")
- has_xml_space = findfirst(needle, data) !== nothing
- return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space)
-end
-function Raw(data::Vector{UInt8}, has_xml_space::Bool, ctx::Vector{Bool}=Bool[false])
- return Raw(RawDocument, 0, 0, 0, data, ctx, has_xml_space)
-end
-
-const _RAW_INDEX = WeakKeyDict{Vector{UInt8}, Any}()
-
-struct _TokRec
- type::RawType
- depth::Int
- pos::Int
- len::Int
- ctx::Vector{Bool}
-end
-
-mutable struct _Index
- recs::Vector{_TokRec}
- last_raw::Raw
- built_end::Int
-end
-
-Base.read(filename::String, ::Type{Raw}) = isfile(filename) ?
- Raw(Mmap.mmap(filename)) :
- error("File \"$filename\" does not exist.")
-
-Base.read(io::IO, ::Type{Raw}) = Raw(read(io))
-
-Base.parse(x::AbstractString, ::Type{Raw}) = Raw(Vector{UInt8}(x))
-
-# Mostly for debugging
-Base.peek(o::Raw, n::Int) = String(view(o.data[o.pos+o.len+1:min(end, o.pos + o.len + n + 1)]))
-
-function Base.show(io::IO, o::Raw)
- print(io, o.type, ':', o.depth, " (pos=", o.pos, ", len=", o.len, ")")
- o.len > 0 && printstyled(io, ": ", String(o); color=:light_green)
-end
-function Base.:(==)(a::Raw, b::Raw)
- a.type == b.type && a.depth == b.depth && a.pos == b.pos && a.len == b.len && a.data === b.data && a.ctx == b.ctx && a.has_xml_space == b.has_xml_space
-end
-
-Base.view(o::Raw) = view(o.data, o.pos:o.pos+o.len)
-Base.String(o::Raw) = String(view(o))
-
-Base.IteratorSize(::Type{Raw}) = Base.SizeUnknown()
-Base.eltype(::Type{Raw}) = Raw
-
-function Base.iterate(o::Raw, state=o)
- n = next(state)
- return isnothing(n) ? nothing : (n, n)
-end
-
-is_node(o::Raw) = o.type !== RawElementClose
-xml_nodes(o::Raw) = Iterators.Filter(is_node, o)
-
-#-----------------------------------------------------------------------------# get_name
-is_name_start_char(x::UInt8) = x in UInt8('A'):UInt8('Z') || x in UInt8('a'):UInt8('z') || x == UInt8('_')
-is_name_char(x::UInt8) = is_name_start_char(x) || x in UInt8('0'):UInt8('9') || x == UInt8('-') || x == UInt8('.') || x == UInt8(':')
-
-name_start(data, i) = findnext(is_name_start_char, data, i)
-name_stop(data, i) = findnext(!is_name_char, data, i) - 1
-
-function get_name(data, i)
- i = name_start(data, i)
- j = name_stop(data, i)
- @views String(data[i:j]), j + 1
-end
-
-#-----------------------------------------------------------------------------# get_attributes
-# starting at position i, return attributes up until the next '>' or '?' (DTD)
-function get_attributes(data, i, j)
- i = name_start(data, i)
- (isnothing(j) || isnothing(i) || i > j) && return nothing
- out = OrderedDict{String,String}()
- while !isnothing(i) && i < j
- key, i = get_name(data, i)
- # get quotechar the value is wrapped in (either ' or ")
- i = findnext(x -> x === UInt8('"') || x === UInt8('''), data, i + 1)
- quotechar = data[i]
- i2 = findnext(==(quotechar), data, i + 1)
- @views value = String(data[i+1:i2-1])
- out[key] = value
- i = name_start(data, i2)
- end
- return out
-end
-
-# ----------------------------------------------------------------------------# Utilities supporting prev
-function _get_or_init_index(o::Raw)
- idx = get(_RAW_INDEX, o.data, nothing)
- if idx === nothing
- start = Raw(o.data) # fresh RawDocument
- _RAW_INDEX[o.data] = _Index(_TokRec[], start, 0)
- idx = _RAW_INDEX[o.data]
- end
- return idx
-end
-function _ensure_index_upto!(o::Raw, target_pos::Int)
- idx = _get_or_init_index(o)
- r = idx.last_raw
- while true
- n = next(r)
- if n === nothing
- idx.built_end = typemax(Int)
- idx.last_raw = r
- return idx
- end
- push!(idx.recs, _TokRec(n.type, n.depth, n.pos, n.len, copy(n.ctx)))
- endpos = n.pos + n.len
- idx.built_end = endpos
- idx.last_raw = n
- r = n
- if endpos >= target_pos
- return idx
- end
- end
-end
-function _find_prev_token(recs::Vector{_TokRec}, p::Int)
- lo, hi = 1, length(recs)
- ans = 0
- while lo <= hi
- mid = (lo + hi) >>> 1
- endpos = recs[mid].pos + recs[mid].len
- if endpos < p + 1
- ans = mid
- lo = mid + 1
- else
- hi = mid - 1
- end
- end
- return ans == 0 ? nothing : recs[ans]
-end
-
-#-----------------------------------------------------------------------------# update xml:space context
-# check attributes for xml:space and update ctx if necessary
-function get_ctx(o)
- att = attributes(o)
- if !isnothing(att) && haskey(att, "xml:space")
- if att["xml:space"] == "preserve"
- return true
- elseif att["xml:space"] == "default"
- return false
- else
- error("Invalid value for xml:space attribute: $(att["xml:space"]). Must be 'preserve' or 'default'.")
- end
- end
- return nothing
-end
-function update_ctx!(ctx, o)
- new_ctx = get_ctx(o)
- if new_ctx !== nothing
- ctx[end] = new_ctx
- end
- return nothing
-end
-
-#-----------------------------------------------------------------------------# interface
-"""
- nodetype(node) --> XML.NodeType
-
-Return the `XML.NodeType` of the node.
-"""
-nodetype(o::Raw) = nodetype(o.type)
-
-"""
- tag(node) --> String or Nothing
-
-Return the tag name of `Element` and `PROCESSING_INSTRUCTION` nodes.
-"""
-function tag(o::Raw)
- o.type ∉ [RawElementOpen, RawElementClose, RawElementSelfClosed, RawProcessingInstruction] && return nothing
- return get_name(o.data, o.pos + 1)[1]
-end
-
-"""
- attributes(node) --> OrderedDict{String, String} or Nothing
-
-Return the attributes of `Element`, `Declaration`, or `ProcessingInstruction` nodes.
-"""
-function attributes(o::Raw)
- if o.type === RawElementOpen || o.type === RawElementSelfClosed || o.type === RawProcessingInstruction
- i = o.pos
- i = name_start(o.data, i)
- i = name_stop(o.data, i)
- get_attributes(o.data, i + 1, o.pos + o.len)
- elseif o.type === RawDeclaration
- get_attributes(o.data, o.pos + 6, o.pos + o.len)
- else
- nothing
- end
-end
-
-"""
- value(node) --> String or Nothing
-
-Return the value of `Text`, `CData`, `Comment`, or `DTD` nodes.
-"""
-function value(o::Raw)
- if o.type === RawText
- String(o)
- elseif o.type === RawCData
- String(view(o.data, o.pos+length(" Vector{typeof(node)}
-
-Return the children the node. Will only be nonempty for `Element` and `Document` nodes.
-"""
-function children(o::Raw)
- if o.type === RawElementOpen || o.type === RawDocument
- depth = o.depth
- out = Raw[]
- for item in xml_nodes(o)
- if item.depth == depth + 1
- push!(out, item)
- end
- item.depth == depth && break
- o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root
- end
- out
- else
- Raw[]
- end
-end
-
-"""
- depth(node) --> Int
-
-Return the depth of the node. Will be `0` for `Document` nodes. Not defined for `XML.Node`.
-"""
-function depth(o::Raw)
- o.depth
-end
-
-"""
- parent(node) --> typeof(node), Nothing
-
-Return the parent of the node. Will be `nothing` for `Document` nodes. Not defined for `XML.Node`.
-"""
-function parent(o::Raw)
- depth = o.depth
- depth === 0 && return nothing
- p = prev(o)
- while p.depth >= depth
- p = prev(p)
- end
- return p
-end
-
-#-----------------------------------------------------------------------------# next Raw
-# isspace(x::UInt8) = Base.isspace(Char(x))
-
-# XML whitespace per XML 1.0/1.1 production S:
-# S ::= (#x20 | #x9 | #xD | #xA)+
-@inline xml_isspace(b::UInt8)::Bool = (b == 0x20) | (b == 0x09) | (b == 0x0A) | (b == 0x0D)
-
-"""
- next(node) --> typeof(node) or Nothing
-
-Return the next node in the document during depth-first traversal. Depth-first is the order you
-would visit nodes by reading top-down through an XML file. Not defined for `XML.Node`.
-"""
-function next(o::Raw)
- if o.has_xml_space # using xml:space context at least once in data
- return next_xml_space(o)
- else # not using xml:space context at all (same as v0.3.5)
- return next_no_xml_space(o)
- end
-end
-
-function next_xml_space(o::Raw)
- i = o.pos + o.len + 1
- depth = o.depth
- data = o.data
- type = o.type
- has_xml_space = o.has_xml_space
- ctx = copy(o.ctx)
- last_type = type
- k = findnext(!xml_isspace, data, i)
- if isnothing(k)
- return nothing
- end
- if last_type === RawElementOpen || last_type === RawDocument
- depth += 1
- push!(ctx, ctx[end]) # inherit the xml:space context from parent
- last_type === RawElementOpen && update_ctx!(ctx, o) # check attributes for xml:space and update if necessary
- end
- i = ctx[end] ? i : k
- b = i > 1 ? Char(o.data[i-1]) : Char('<')
- c = Char(o.data[i])
- d = Char(o.data[k+1])
- if c !== '<' || ctx[end] && c === '<' && b === ' ' && last_type === RawElementOpen && d === '/'
- type = RawText
- j = findnext(==(UInt8('<')), data, i) - 1
- j = ctx[end] ? j : findprev(!xml_isspace, data, j) # preserving whitespace if needed
- if last_type === RawElementClose || last_type === RawElementSelfClosed|| last_type === RawDocument
- # Maybe drop pure-whitespace inter-element text nodes?
- # (e.g. whitespace between a closing and an opening tag which would otherwise make an orphan text node)
- #if all(xml_isspace, @view data[i:j]) && depth > 1
- # return next(Raw(type, depth, j, 0, data, ctx, has_xml_space))
- #end
- end
- else
- i = k
- j = k + 1
- if c === '<'
- c2 = Char(o.data[i+1])
- if c2 === '!'
- c3 = Char(o.data[i+2])
- if c3 === '-'
- type = RawComment
- j = findnext(Vector{UInt8}("-->"), data, i)[end]
- elseif c3 === '['
- type = RawCData
- j = findnext(Vector{UInt8}("]]>"), data, i)[end]
- elseif c3 === 'D' || c3 == 'd'
- type = RawDTD
- j = findnext(==(UInt8('>')), data, i)
- while sum(==(UInt8('>')), @view data[k:j]) != sum(==(UInt8('<')), @view data[i:j])
- j = findnext(==(UInt8('>')), data, j + 1)
- end
- end
- elseif c2 === '?'
- if get_name(data, i + 2)[1] == "xml"
- type = RawDeclaration
- else
- type = RawProcessingInstruction
- end
- j = findnext(Vector{UInt8}("?>"), data, i)[end]
- elseif c2 === '/'
- type = RawElementClose
- depth -= 1
- pop!(ctx) # revert to parent xml:space context
- j = findnext(==(UInt8('>')), data, i)
- else
- j = findnext(==(UInt8('>')), data, i)
- if data[j-1] === UInt8('/')
- type = RawElementSelfClosed
- else
- type = RawElementOpen
- end
- end
- end
- end
- return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
-function next_no_xml_space(o::Raw) # same as v0.3.5
- i = o.pos + o.len + 1
- depth = o.depth
- data = o.data
- type = o.type
- has_xml_space = o.has_xml_space
- ctx = [false]
- i = findnext(!xml_isspace, data, i)
- if isnothing(i)
- return nothing
- end
- if type === RawElementOpen || type === RawDocument
- depth += 1
- end
- c = Char(o.data[i])
- d = Char(o.data[i+1])
- if c !== '<'
- type = RawText
- j = findnext(==(UInt8('<')), data, i) - 1
- j = findprev(!xml_isspace, data, j) # "rstrip"
- elseif c === '<'
- c2 = Char(o.data[i+1])
- if c2 === '!'
- c3 = Char(o.data[i+2])
- if c3 === '-'
- type = RawComment
- j = findnext(Vector{UInt8}("-->"), data, i)[end]
- elseif c3 === '['
- type = RawCData
- j = findnext(Vector{UInt8}("]]>"), data, i)[end]
- elseif c3 === 'D' || c3 == 'd'
- type = RawDTD
- j = findnext(==(UInt8('>')), data, i)
- while sum(==(UInt8('>')), @view data[i:j]) != sum(==(UInt8('<')), @view data[i:j])
- j = findnext(==(UInt8('>')), data, j + 1)
- end
- end
- elseif c2 === '?'
- if get_name(data, i + 2)[1] == "xml"
- type = RawDeclaration
- else
- type = RawProcessingInstruction
- end
- j = findnext(Vector{UInt8}("?>"), data, i)[end]
- elseif c2 === '/'
- type = RawElementClose
- depth -= 1
- j = findnext(==(UInt8('>')), data, i)
- else
- j = findnext(==(UInt8('>')), data, i)
- if data[j-1] === UInt8('/')
- type = RawElementSelfClosed
- else
- type = RawElementOpen
- end
- end
- end
- return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
-#-----------------------------------------------------------------------------# prev Raw
-"""
- prev(node) --> typeof(node), Nothing, or Missing (only for XML.Node)
-
-Return the previous node in the document during depth-first traversal. Not defined for `XML.Node`.
-"""
-function prev(o::Raw)
- if o.has_xml_space # using xml:space context at least once in data
- return prev_xml_space(o)
- else # not using xml:space context at all (same as v0.3.5)
- return prev_no_xml_space(o)
- end
-end
-
-function prev_xml_space(o::Raw)
- o.type === RawDocument && return nothing
-
- idx = _ensure_index_upto!(o, o.pos - 1)
- rec = _find_prev_token(idx.recs, o.pos - 1)
- if rec === nothing
- return Raw(o.data, o.has_xml_space, copy(o.ctx))
- end
- return Raw(rec.type, rec.depth, rec.pos, rec.len, o.data, copy(rec.ctx), o.has_xml_space)
-end
-function prev_no_xml_space(o::Raw) # same as v0.3.5
- depth = o.depth
- data = o.data
- type = o.type
- has_xml_space = o.has_xml_space
- ctx = has_xml_space ? copy(o.ctx) : [false]
- type === RawDocument && return nothing
- j = o.pos - 1
- j = findprev(!xml_isspace, data, j)
- if isnothing(j)
- return Raw(data, has_xml_space, ctx) # RawDocument
- end
- c = Char(o.data[j])
- next_type = type
- if c !== '>' # text
- type = RawText
- i = findprev(==(UInt8('>')), data, j) + 1
- i = findnext(!xml_isspace, data, i) # "lstrip"
- elseif c === '>'
- c2 = Char(o.data[j-1])
- if c2 === '-'
- type = RawComment
- i = findprev(Vector{UInt8}("<--"), data, j)[1]
- elseif c2 === ']'
- type = RawCData
- i = findprev(Vector{UInt8}(".")
- end
- end
- else
- error("Unreachable reached in XML.prev")
- end
- if type !== RawElementOpen && next_type === RawElementClose
- depth += 1
- elseif type === RawElementOpen && next_type !== RawElementClose
- depth -= 1
- end
- return Raw(type, depth, i, j - i, data, ctx, has_xml_space)
-end
-
diff --git a/src/xpath.jl b/src/xpath.jl
new file mode 100644
index 0000000..87da263
--- /dev/null
+++ b/src/xpath.jl
@@ -0,0 +1,345 @@
+#-----------------------------------------------------------------------------# XPath
+# A subset of XPath 1.0 for querying XML.Node trees.
+#
+# Supported syntax:
+# / root (absolute path)
+# tag child element by name
+# * any child element
+# // descendant-or-self (recursive)
+# . current node
+# .. parent node
+# [n] positional predicate (1-based)
+# [@attr] has-attribute predicate
+# [@attr='v'] attribute-value predicate
+# text() text node children
+# node() all node children
+# @attr attribute value (returns strings)
+
+#-----------------------------------------------------------------------------# Token types
+
+"""
+ XPathTokenKind
+
+Discriminator for the kinds of tokens produced by [`_xpath_tokenize`](@ref).
+
+| Variant | Source syntax |
+|--------------------|--------------------------|
+| `XPATH_ROOT` | `/` (path separator) |
+| `XPATH_DESCENDANT` | `//` |
+| `XPATH_NAME` | element tag name |
+| `XPATH_WILDCARD` | `*` |
+| `XPATH_DOT` | `.` (self) |
+| `XPATH_DOTDOT` | `..` (parent) |
+| `XPATH_TEXT_FN` | `text()` |
+| `XPATH_NODE_FN` | `node()` |
+| `XPATH_PREDICATE` | `[...]` body |
+| `XPATH_ATTRIBUTE` | `@attr` (result position) |
+"""
+@enum XPathTokenKind::UInt8 begin
+ XPATH_ROOT # /
+ XPATH_DESCENDANT # //
+ XPATH_NAME # tag name
+ XPATH_WILDCARD # *
+ XPATH_DOT # .
+ XPATH_DOTDOT # ..
+ XPATH_TEXT_FN # text()
+ XPATH_NODE_FN # node()
+ XPATH_PREDICATE # [...]
+ XPATH_ATTRIBUTE # @attr (in result position)
+end
+
+"""
+ XPathToken
+
+A single token from a parsed XPath expression: a [`XPathTokenKind`](@ref) tag together with
+the relevant textual payload (tag name, predicate body, attribute name, etc.). Tokens with
+no payload (`XPATH_ROOT`, `XPATH_WILDCARD`, …) carry the literal source character(s) for
+debuggability.
+"""
+struct XPathToken
+ kind::XPathTokenKind
+ value::String
+end
+
+#-----------------------------------------------------------------------------# Tokenizer
+
+# Lex an XPath expression into a flat token stream. Whitespace is discarded; unterminated
+# predicates / function calls and unrecognised characters raise an error. Tokens preserve
+# source order and are consumed left-to-right by `xpath`.
+function _xpath_tokenize(expr::AbstractString)
+ tokens = XPathToken[]
+ s = String(expr)
+ i = 1
+ n = ncodeunits(s)
+
+ while i <= n
+ c = s[i]
+
+ if c == '/'
+ if i < n && s[i+1] == '/'
+ push!(tokens, XPathToken(XPATH_DESCENDANT, "//"))
+ i += 2
+ else
+ push!(tokens, XPathToken(XPATH_ROOT, "/"))
+ i += 1
+ end
+
+ elseif c == '.'
+ if i < n && s[i+1] == '.'
+ push!(tokens, XPathToken(XPATH_DOTDOT, ".."))
+ i += 2
+ else
+ push!(tokens, XPathToken(XPATH_DOT, "."))
+ i += 1
+ end
+
+ elseif c == '*'
+ push!(tokens, XPathToken(XPATH_WILDCARD, "*"))
+ i += 1
+
+ elseif c == '['
+ j = findnext(']', s, i + 1)
+ isnothing(j) && error("Unterminated predicate in XPath: $(repr(s))")
+ push!(tokens, XPathToken(XPATH_PREDICATE, SubString(s, i + 1, j - 1)))
+ i = j + 1
+
+ elseif c == '@'
+ j = i + 1
+ while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j]))
+ j += 1
+ end
+ j == i + 1 && error("Empty attribute name after @ in XPath: $(repr(s))")
+ push!(tokens, XPathToken(XPATH_ATTRIBUTE, SubString(s, i + 1, j - 1)))
+ i = j
+
+ elseif isletter(c) || c == '_'
+ j = i + 1
+ while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j]) || s[j] == '.')
+ j += 1
+ end
+ name = SubString(s, i, j - 1)
+ # Check for function calls: text(), node()
+ if j <= n && s[j] == '('
+ j2 = findnext(')', s, j + 1)
+ isnothing(j2) && error("Unterminated function call in XPath: $(repr(s))")
+ if name == "text"
+ push!(tokens, XPathToken(XPATH_TEXT_FN, "text()"))
+ elseif name == "node"
+ push!(tokens, XPathToken(XPATH_NODE_FN, "node()"))
+ else
+ error("Unknown XPath function: $name()")
+ end
+ i = j2 + 1
+ else
+ push!(tokens, XPathToken(XPATH_NAME, String(name)))
+ i = j
+ end
+
+ elseif isspace(c)
+ i += 1
+
+ else
+ error("Unexpected character '$(c)' in XPath: $(repr(s))")
+ end
+ end
+ tokens
+end
+
+#-----------------------------------------------------------------------------# Predicate evaluation
+
+const _RE_ATTR_PRED = r"^@([A-Za-z_:][\w.\-:]*)$"
+const _RE_ATTR_VAL_PRED = r"^@([A-Za-z_:][\w.\-:]*)\s*=\s*['\"]([^'\"]*)['\"]$"
+
+# Filter `nodes` by the body of a `[...]` predicate. Supports positional indices `[n]`
+# (1-based; out-of-range yields empty), `[last()]`, `[@attr]` (has-attribute), and
+# `[@attr='value']` / `[@attr="value"]` (attribute equals literal). Anything else errors.
+# `root` is accepted for symmetry with `_xpath_step` but is unused by current predicates.
+function _eval_predicate(predicate::AbstractString, nodes::Vector{Node{S}}, root::Node{S}) where S
+ s = strip(predicate)
+
+ # Positional: [n]
+ pos = tryparse(Int, s)
+ if !isnothing(pos)
+ 1 <= pos <= length(nodes) || return Node{S}[]
+ return [nodes[pos]]
+ end
+
+ # last()
+ if s == "last()"
+ isempty(nodes) && return Node{S}[]
+ return [nodes[end]]
+ end
+
+ # [@attr] — has attribute
+ m = match(_RE_ATTR_PRED, s)
+ if !isnothing(m)
+ attr_name = m.captures[1]
+ return filter(n -> n.nodetype === Element && haskey(n, attr_name), nodes)
+ end
+
+ # [@attr='value'] or [@attr="value"]
+ m = match(_RE_ATTR_VAL_PRED, s)
+ if !isnothing(m)
+ attr_name = m.captures[1]
+ attr_val = m.captures[2]
+ return filter(n -> n.nodetype === Element && get(n, attr_name, nothing) == attr_val, nodes)
+ end
+
+ error("Unsupported XPath predicate: [$predicate]")
+end
+
+#-----------------------------------------------------------------------------# Step evaluation
+
+# Apply a single non-predicate, non-descendant step to the current context and return the
+# new context. Handles XPATH_NAME, XPATH_WILDCARD, XPATH_DOT, XPATH_DOTDOT, XPATH_TEXT_FN,
+# XPATH_NODE_FN. XPATH_DESCENDANT is intentionally not handled here — the main evaluator
+# expands `//` to descendant-or-self before the next step. `root` is used by `..` to avoid
+# walking past the document root.
+function _xpath_step(nodes::Vector{Node{S}}, token::XPathToken, root::Node{S}) where S
+ result = Node{S}[]
+ k = token.kind
+
+ if k === XPATH_NAME
+ for n in nodes
+ for c in children(n)
+ c.nodetype === Element && c.tag == token.value && push!(result, c)
+ end
+ end
+
+ elseif k === XPATH_WILDCARD
+ for n in nodes
+ for c in children(n)
+ c.nodetype === Element && push!(result, c)
+ end
+ end
+
+ elseif k === XPATH_DOT
+ append!(result, nodes)
+
+ elseif k === XPATH_DOTDOT
+ for n in nodes
+ n === root && continue
+ p = _find_parent(n, root)
+ isnothing(p) || push!(result, p)
+ end
+
+ elseif k === XPATH_TEXT_FN
+ for n in nodes
+ for c in children(n)
+ c.nodetype === Text && push!(result, c)
+ end
+ end
+
+ elseif k === XPATH_NODE_FN
+ for n in nodes
+ append!(result, children(n))
+ end
+
+ elseif k === XPATH_DESCENDANT
+ # Handled by caller — collects all descendants before next step
+ error("XPATH_DESCENDANT should be handled by the evaluator, not _xpath_step")
+ end
+
+ result
+end
+
+# Append every descendant of `node` (children, grandchildren, ...) to `out` in document
+# order. Does not include `node` itself.
+function _descendants!(out::Vector{Node{S}}, node::Node{S}) where S
+ for c in children(node)
+ push!(out, c)
+ _descendants!(out, c)
+ end
+end
+
+# Implements XPath's descendant-or-self axis: for each input node, emit the node itself
+# followed by all of its descendants in document order.
+function _descendants(nodes::Vector{Node{S}}) where S
+ result = Node{S}[]
+ for n in nodes
+ push!(result, n) # descendant-or-self includes self
+ _descendants!(result, n)
+ end
+ result
+end
+
+#-----------------------------------------------------------------------------# Main evaluator
+
+"""
+ xpath(node::Node, expr::AbstractString) -> Vector{Node}
+
+Evaluate an XPath expression against a `Node` tree and return matching nodes.
+
+Supports a practical subset of XPath 1.0:
+- Absolute (`/root/child`) and relative (`child/sub`) paths
+- Recursive descent (`//tag`)
+- Wildcards (`*`), self (`.`), parent (`..`)
+- Positional predicates (`[1]`, `[last()]`)
+- Attribute predicates (`[@attr]`, `[@attr='value']`)
+- `text()` and `node()` functions
+- Attribute selection (`@attr`) — returns `Text` nodes containing attribute values
+
+# Examples
+```julia
+doc = parse(" ", Node)
+xpath(doc, "/root/a") # both elements
+xpath(doc, "/root/a[1]") # first
+xpath(doc, "//a[@x='2']") #
+xpath(doc, "/root/b/@x") # attribute value as Text node (empty here)
+```
+"""
+function xpath(node::Node{S}, expr::AbstractString) where S
+ tokens = _xpath_tokenize(expr)
+ isempty(tokens) && return Node{S}[]
+
+ # Determine root for .. navigation
+ root = node.nodetype === Document ? node : node
+
+ i = 1
+ # Start context
+ if tokens[1].kind === XPATH_ROOT
+ # Absolute path — start from the document or its root element
+ if node.nodetype === Document
+ current = Node{S}[node]
+ else
+ current = Node{S}[node]
+ end
+ i = 2
+ else
+ current = Node{S}[node]
+ end
+
+ while i <= length(tokens)
+ tok = tokens[i]
+
+ if tok.kind === XPATH_PREDICATE
+ current = _eval_predicate(tok.value, current, root)
+ i += 1
+
+ elseif tok.kind === XPATH_DESCENDANT
+ current = _descendants(current)
+ # // must be followed by a step
+ i += 1
+
+ elseif tok.kind === XPATH_ROOT
+ # / as separator between steps — skip
+ i += 1
+
+ elseif tok.kind === XPATH_ATTRIBUTE
+ # @attr in result position — return attribute values as Text nodes
+ result = Node{S}[]
+ for n in current
+ v = get(n, tok.value, nothing)
+ !isnothing(v) && push!(result, Node{S}(Text, nothing, nothing, v, nothing))
+ end
+ current = result
+ i += 1
+
+ else
+ current = _xpath_step(current, tok, root)
+ i += 1
+ end
+ end
+
+ current
+end
diff --git a/test/Project.toml b/test/Project.toml
index d4883bd..c1703f7 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/data/complex_dtd.xml b/test/data/complex_dtd.xml
new file mode 100644
index 0000000..cb69747
--- /dev/null
+++ b/test/data/complex_dtd.xml
@@ -0,0 +1,105 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ notation NOTATION (jpeg | png) #IMPLIED>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+]]>
+]>
diff --git a/test/data/preserve.xml b/test/data/preserve.xml
new file mode 100644
index 0000000..e77add1
--- /dev/null
+++ b/test/data/preserve.xml
@@ -0,0 +1,5 @@
+
+
+ This node has preserved space
+ with default children.
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 89978eb..4ab562c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,646 +1,3410 @@
using XML
-using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text, escape, unescape, OrderedDict, h
-using Downloads: download
+using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text
+using XML: escape, unescape, h, parse_dtd
+using XML: ParsedDTD, ElementDecl, AttDecl, EntityDecl, NotationDecl
using Test
-import AbstractTrees
-
-AbstractTrees.children(x::Node) = children(x)
-
-#-----------------------------------------------------------------------------# files
-xml_xsd = joinpath("data", "xml.xsd")
-kml_xsd = joinpath("data", "kml.xsd")
-books_xml = joinpath("data", "books.xml")
-example_kml = joinpath("data", "example.kml")
-simple_dtd = joinpath("data", "simple_dtd.xml")
-
-all_files = [xml_xsd, kml_xsd, books_xml, example_kml, simple_dtd]
-
-#-----------------------------------------------------------------------------# h
-@testset "h function" begin
- @test h.tag == XML.Element("tag")
- @test h.tag(id="id") == XML.Element("tag"; id="id")
- @test h.tag(1, 2, a="a", b="b") == XML.Element("tag", 1, 2; a="a", b="b")
-end
-
-#-----------------------------------------------------------------------------# escaping/unescaping
-@testset "escaping/unescaping" begin
- s = "This > string < has & some \" special ' characters"
- @test escape(s) == "This > string < has & some " special ' characters"
- @test escape(escape(s)) == escape(s)
- @test s == unescape(escape(s))
- @test s == unescape(unescape(escape(s)))
-
- n = Element("tag", Text(s))
- @test XML.simple_value(n) == s
-
- XML.escape!(n)
- @test XML.simple_value(n) == escape(s)
-
- XML.unescape!(n)
- @test XML.simple_value(n) == s
-end
-
-#-----------------------------------------------------------------------------# DTD
-# @testset "DTDBody and friends" begin
-# s = read(simple_dtd, String)
-# data = read(simple_dtd)
-
-# dtd = XML.DTDBody(data)
-# dtd2 = parse(s, XML.DTDBody)
-
-# @test length(dtd.elements) == length(dtd2.elements) == 0
-# @test length(dtd.attributes) == length(dtd2.attributes) == 0
-# @test length(dtd.entities) == length(dtd2.entities) == 3
-
-# o = read("data/tv.dtd", XML.DTDBody)
-# end
-
-#-----------------------------------------------------------------------------# Raw
-@testset "Raw tag/attributes/value" begin
- examples = [
- (xml = "",
- nodetype = DTD,
- tag=nothing,
- attributes=nothing,
- value="html"),
- (xml = "",
- nodetype = Declaration,
- tag=nothing,
- attributes=Dict("version" => "1.0", "key" => "value"),
- value=nothing),
- (xml = " ",
- nodetype = Element,
- tag="tag",
- attributes=Dict("_id" => "1", "x" => "abc"),
- value=nothing),
- (xml = "",
- nodetype = Comment,
- tag=nothing,
- attributes=nothing,
- value=" comment "),
- (xml = "",
- nodetype = CData,
- tag=nothing,
- attributes=nothing,
- value="cdata test"),
- ]
- for x in examples
- # @info "Testing: $(x.xml)"
- data = XML.next(XML.parse(x.xml, XML.Raw))
- @test XML.nodetype(data) == x.nodetype
- @test XML.tag(data) == x.tag
- @test XML.attributes(data) == x.attributes
- @test XML.value(data) == x.value
- end
-end
-
-@testset "Raw with books.xml" begin
- data = read(books_xml, XML.Raw)
- doc = collect(data)
- @test length(doc) > countlines(books_xml)
- # Check that the first 5 lines are correct
- first_5_lines = [
- XML.RawDeclaration => """""",
- XML.RawElementOpen => "",
- XML.RawElementOpen => "",
- XML.RawElementOpen => "",
- XML.RawText => "Gambardella, Matthew"
- ]
- for (i, (typ, str)) in enumerate(first_5_lines)
- dt = doc[i]
- @test dt.type == typ
- @test String(dt) == str
- end
- # Check that the last line is correct
- @test doc[end].type == XML.RawElementClose
- @test String(doc[end]) == " "
-
- @testset "next and prev" begin
- @test XML.prev(doc[1]) == data # can't use === here because prev returns a copy of ctx
- @test prev(data) === nothing
- @test XML.next(doc[end]) === nothing
-
- n = length(doc)
- next_res = [doc[1]]
- foreach(_ -> push!(next_res, XML.next(next_res[end])), 1:n-1)
-
- prev_res = [doc[end]]
- foreach(_ -> pushfirst!(prev_res, XML.prev(prev_res[1])), 1:n-1)
-
- idx = findall(next_res .!= prev_res)
-
- for (a,b) in zip(next_res, prev_res)
- @test a == b
+
+#==============================================================================#
+# ESCAPE / UNESCAPE #
+#==============================================================================#
+@testset "escape / unescape" begin
+ @testset "all five predefined entities" begin
+ @test escape("&") == "&"
+ @test escape("<") == "<"
+ @test escape(">") == ">"
+ @test escape("'") == "'"
+ @test escape("\"") == """
+ end
+
+ @testset "unescape reverses escape" begin
+ @test unescape("&") == "&"
+ @test unescape("<") == "<"
+ @test unescape(">") == ">"
+ @test unescape("'") == "'"
+ @test unescape(""") == "\""
+ end
+
+ @testset "roundtrip on mixed strings" begin
+ s = "This > string < has & some \" special ' characters"
+ @test unescape(escape(s)) == s
+ end
+
+ @testset "idempotent unescape" begin
+ s = "plain text with no entities"
+ @test unescape(s) == s
+ end
+
+ @testset "multiple entities in one string" begin
+ @test escape("a < b & c > d") == "a < b & c > d"
+ @test unescape("a < b & c > d") == "a < b & c > d"
+ end
+
+ @testset "empty string" begin
+ @test escape("") == ""
+ @test unescape("") == ""
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.1: Well-Formed XML Documents #
+#==============================================================================#
+@testset "Spec 2.1: Well-Formed XML Documents" begin
+ # The spec's simplest example:
+ #
+ # Hello, world!
+ xml = """Hello, world! """
+ doc = parse(xml, Node)
+ @test nodetype(doc) == Document
+ @test length(doc) == 2 # Declaration + Element
+ @test nodetype(doc[1]) == Declaration
+ @test nodetype(doc[2]) == Element
+ @test tag(doc[2]) == "greeting"
+ @test simple_value(doc[2]) == "Hello, world!"
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.4: Character Data and Markup #
+#==============================================================================#
+@testset "Spec 2.4: Character Data and Markup" begin
+ @testset "text content between tags" begin
+ doc = parse("Hello ", Node)
+ @test simple_value(doc[1]) == "Hello"
+ end
+
+ @testset "entity references in text are unescaped" begin
+ doc = parse("& < > ' " ", Node)
+ @test simple_value(doc[1]) == "& < > ' \""
+ end
+
+ @testset "mixed text and child elements" begin
+ doc = parse("Hello world !
", Node)
+ root = doc[1]
+ @test length(root) == 3
+ @test nodetype(root[1]) == Text
+ @test value(root[1]) == "Hello "
+ @test nodetype(root[2]) == Element
+ @test tag(root[2]) == "b"
+ @test simple_value(root[2]) == "world"
+ @test nodetype(root[3]) == Text
+ @test value(root[3]) == "!"
+ end
+
+ @testset "empty element has no text" begin
+ doc = parse(" ", Node)
+ @test length(children(doc[1])) == 0
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.5: Comments #
+#==============================================================================#
+@testset "Spec 2.5: Comments" begin
+ @testset "basic comment (spec example)" begin
+ # Spec example:
+ doc = parse(" ", Node)
+ c = doc[1][1]
+ @test nodetype(c) == Comment
+ @test value(c) == " declarations for & "
+ end
+
+ @testset "empty comment" begin
+ doc = parse(" ", Node)
+ c = doc[1][1]
+ @test nodetype(c) == Comment
+ @test value(c) == ""
+ end
+
+ @testset "comment before root element" begin
+ doc = parse(" ", Node)
+ @test nodetype(doc[1]) == Comment
+ @test value(doc[1]) == " before "
+ @test nodetype(doc[2]) == Element
+ end
+
+ @testset "comment after root element" begin
+ doc = parse(" ", Node)
+ @test nodetype(doc[1]) == Element
+ @test nodetype(doc[2]) == Comment
+ end
+
+ @testset "comment with markup-like content preserved verbatim" begin
+ doc = parse(" ", Node)
+ @test value(doc[1][1]) == " not a tag "
+ end
+
+ @testset "multiple comments" begin
+ doc = parse(" ", Node)
+ @test length(doc[1]) == 2
+ @test value(doc[1][1]) == " A "
+ @test value(doc[1][2]) == " B "
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.6: Processing Instructions #
+#==============================================================================#
+@testset "Spec 2.6: Processing Instructions" begin
+ @testset "xml-stylesheet PI (spec example)" begin
+ doc = parse(""" """, Node)
+ pi = doc[1]
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "xml-stylesheet"
+ @test contains(value(pi), "type=\"text/xsl\"")
+ end
+
+ @testset "PI with no content" begin
+ doc = parse(" ", Node)
+ pi = doc[1]
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "target"
+ @test value(pi) === nothing
+ end
+
+ @testset "PI inside element" begin
+ doc = parse(" ", Node)
+ pi = doc[1][1]
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "mypi"
+ @test value(pi) == "some data"
+ end
+
+ @testset "PI after root element" begin
+ doc = parse(" ", Node)
+ @test nodetype(doc[2]) == ProcessingInstruction
+ @test tag(doc[2]) == "post-process"
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.7: CDATA Sections #
+#==============================================================================#
+@testset "Spec 2.7: CDATA Sections" begin
+ @testset "CDATA preserves markup characters" begin
+ # Spec example
+ doc = parse("Hello, world!]]> ", Node)
+ cd = doc[1][1]
+ @test nodetype(cd) == CData
+ @test value(cd) == "Hello, world! "
+ end
+
+ @testset "empty CDATA" begin
+ doc = parse(" ", Node)
+ cd = doc[1][1]
+ @test nodetype(cd) == CData
+ @test value(cd) == ""
+ end
+
+ @testset "CDATA with ampersands and less-thans" begin
+ doc = parse(" d]]> ", Node)
+ @test value(doc[1][1]) == "a < b && c > d"
+ end
+
+ @testset "CDATA with special characters" begin
+ doc = parse(" ", Node)
+ @test value(doc[1][1]) == "line1\nline2\ttab"
+ end
+
+ @testset "CDATA mixed with text" begin
+ doc = parse("beforeafter ", Node)
+ @test length(doc[1]) == 3
+ @test nodetype(doc[1][1]) == Text
+ @test value(doc[1][1]) == "before"
+ @test nodetype(doc[1][2]) == CData
+ @test value(doc[1][2]) == "inside"
+ @test nodetype(doc[1][3]) == Text
+ @test value(doc[1][3]) == "after"
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.8: Prolog and Document Type Declaration #
+#==============================================================================#
+@testset "Spec 2.8: Prolog and Document Type Declaration" begin
+ @testset "XML declaration - version only" begin
+ doc = parse(""" """, Node)
+ decl = doc[1]
+ @test nodetype(decl) == Declaration
+ @test decl["version"] == "1.0"
+ end
+
+ @testset "XML declaration - version and encoding" begin
+ doc = parse(""" """, Node)
+ decl = doc[1]
+ @test decl["version"] == "1.0"
+ @test decl["encoding"] == "UTF-8"
+ end
+
+ @testset "XML declaration - all three pseudo-attributes" begin
+ doc = parse(""" """, Node)
+ decl = doc[1]
+ @test decl["version"] == "1.0"
+ @test decl["encoding"] == "UTF-8"
+ @test decl["standalone"] == "yes"
+ end
+
+ @testset "XML declaration with single quotes" begin
+ doc = parse(" ", Node)
+ @test doc[1]["version"] == "1.0"
+ end
+
+ @testset "no XML declaration" begin
+ doc = parse(" ", Node)
+ @test length(doc) == 1
+ @test nodetype(doc[1]) == Element
+ end
+
+ @testset "DOCTYPE - SYSTEM" begin
+ # Spec example
+ doc = parse(""" """, Node)
+ dtd = doc[1]
+ @test nodetype(dtd) == DTD
+ @test contains(value(dtd), "greeting")
+ @test contains(value(dtd), "SYSTEM")
+ @test contains(value(dtd), "hello.dtd")
+ end
+
+ @testset "DOCTYPE - with internal subset" begin
+ xml = """
+]>Hello, world! """
+ doc = parse(xml, Node)
+ dtd = doc[1]
+ @test nodetype(dtd) == DTD
+ @test contains(value(dtd), "greeting")
+ @test contains(value(dtd), "
+
+
+]> """
+ doc = parse(xml, Node)
+ @test nodetype(doc[1]) == DTD
+ @test contains(value(doc[1]), "ENTITY")
+ end
+
+ @testset "full prolog: declaration + DOCTYPE" begin
+ xml = """ """
+ doc = parse(xml, Node)
+ @test nodetype(doc[1]) == Declaration
+ @test nodetype(doc[2]) == DTD
+ @test nodetype(doc[3]) == Element
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.9: Standalone Document Declaration #
+#==============================================================================#
+@testset "Spec 2.9: Standalone Document Declaration" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["standalone"] == "yes"
+
+ doc2 = parse(""" """, Node)
+ @test doc2[1]["standalone"] == "no"
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 2.10: White Space Handling #
+#==============================================================================#
+@testset "Spec 2.10: White Space Handling" begin
+ @testset "parser preserves all text content verbatim" begin
+ doc = parse(" hello ", Node)
+ @test simple_value(doc[1]) == " hello "
+ end
+
+ @testset "parser preserves whitespace-only text" begin
+ doc = parse(" ", Node)
+ @test simple_value(doc[1]) == " "
+ end
+
+ @testset "parser preserves inter-element whitespace as Text nodes" begin
+ xml = "x \n y "
+ doc = parse(xml, Node)
+ @test length(doc[1]) == 3
+ @test value(doc[1][1][1]) == "x"
+ @test nodetype(doc[1][2]) == Text
+ @test value(doc[1][2]) == "\n "
+ @test value(doc[1][3][1]) == "y"
+ end
+
+ @testset "xml:space attribute is preserved during parsing" begin
+ doc = parse(""" text """, Node)
+ @test doc[1]["xml:space"] == "preserve"
+ @test value(doc[1][1][1]) == " text "
+ end
+
+ @testset "xml:space='preserve' affects write formatting" begin
+ # When xml:space="preserve", writer doesn't add indentation
+ el = Element("s", XML.Text(" pre "), Element("t"), XML.Text(" post "); var"xml:space"="preserve")
+ @test XML.write(el) == " pre post "
+ end
+
+ @testset "write formats with indentation by default" begin
+ el = Element("root", Element("a"), Element("b"))
+ s = XML.write(el)
+ @test contains(s, " ") # indented
+ @test contains(s, " ") # indented
+ end
+
+ @testset "Unicode non-breaking space is NOT XML whitespace" begin
+ nbsp = "\u00A0"
+ xml = "$(nbsp) y $(nbsp) "
+ doc = parse(xml, Node)
+ @test simple_value(doc[1]) == "$(nbsp) y $(nbsp)"
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 3.1: Start-Tags, End-Tags, Empty-Element Tags #
+#==============================================================================#
+@testset "Spec 3.1: Start-Tags, End-Tags, Empty-Element Tags" begin
+ @testset "element with attributes (spec example)" begin
+ #
+ doc = parse("""A dog. """, Node)
+ el = doc[1]
+ @test tag(el) == "termdef"
+ @test el["id"] == "dt-dog"
+ @test el["term"] == "dog"
+ @test value(el[1]) == "A dog."
+ end
+
+ @testset "self-closing tag (spec example)" begin
+ #
+ doc = parse(""" """, Node)
+ el = doc[1]
+ @test tag(el) == "IMG"
+ @test el["align"] == "left"
+ @test el["src"] == "http://www.w3.org/Icons/WWW/w3c_home"
+ @test length(children(el)) == 0
+ end
+
+ @testset "simple self-closing tag" begin
+ doc = parse(" ", Node)
+ @test tag(doc[1]) == "br"
+ @test length(children(doc[1])) == 0
+ end
+
+ @testset "self-closing tag with space before />" begin
+ doc = parse(" ", Node)
+ @test tag(doc[1]) == "br"
+ end
+
+ @testset "empty element with start and end tag" begin
+ doc = parse(" ", Node)
+ el = doc[1]
+ @test tag(el) == "empty"
+ @test isnothing(el.children)
+ end
+
+ @testset "nested elements" begin
+ doc = parse(" ", Node)
+ @test tag(doc[1]) == "a"
+ @test tag(doc[1][1]) == "b"
+ @test tag(doc[1][1][1]) == "c"
+ end
+
+ @testset "sibling elements" begin
+ doc = parse(" ", Node)
+ @test length(doc[1]) == 3
+ @test tag(doc[1][1]) == "a"
+ @test tag(doc[1][2]) == "b"
+ @test tag(doc[1][3]) == "c"
+ end
+
+ @testset "attributes with single quotes" begin
+ doc = parse(" ", Node)
+ @test doc[1]["a"] == "val"
+ end
+
+ @testset "attributes with double quotes" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "val"
+ end
+
+ @testset "mixed quote styles in attributes" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "1"
+ @test doc[1]["b"] == "2"
+ end
+
+ @testset "attribute with > in value" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "1>2"
+ end
+
+ @testset "attribute with entity reference" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "a&b"
+ end
+
+ @testset "multiple attributes accessible via attributes()" begin
+ doc = parse(""" """, Node)
+ attrs = attributes(doc[1])
+ @test attrs isa Attributes
+ @test attrs["first"] == "1"
+ @test attrs["second"] == "2"
+ @test attrs["third"] == "3"
+ end
+
+ @testset "whitespace around = in attributes" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "1"
+ end
+end
+
+#==============================================================================#
+# XML 1.0 SPEC SECTION 4.1: Entity References #
+#==============================================================================#
+@testset "Spec 4.1: Character and Entity References" begin
+ @testset "predefined entity references in text" begin
+ doc = parse("< ", Node)
+ @test simple_value(doc[1]) == "<"
+
+ doc = parse("> ", Node)
+ @test simple_value(doc[1]) == ">"
+
+ doc = parse("& ", Node)
+ @test simple_value(doc[1]) == "&"
+
+ doc = parse("' ", Node)
+ @test simple_value(doc[1]) == "'"
+
+ doc = parse("" ", Node)
+ @test simple_value(doc[1]) == "\""
+ end
+
+ @testset "predefined entities in attribute values" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["a"] == "<>&'\""
+ end
+
+ @testset "multiple entity references in one text node" begin
+ doc = parse("<tag> & "value" ", Node)
+ @test simple_value(doc[1]) == " & \"value\""
+ end
+end
+
+#==============================================================================#
+# NAMESPACES (Colon in Tag and Attribute Names) #
+#==============================================================================#
+@testset "Namespaces" begin
+ @testset "namespaced element" begin
+ doc = parse(""" """, Node)
+ @test tag(doc[1]) == "ns:root"
+ @test doc[1]["xmlns:ns"] == "http://example.com"
+ @test tag(doc[1][1]) == "ns:child"
+ end
+
+ @testset "default namespace" begin
+ doc = parse(""" """, Node)
+ @test doc[1]["xmlns"] == "http://example.com"
+ end
+
+ @testset "multiple namespace prefixes" begin
+ xml = """ """
+ doc = parse(xml, Node)
+ @test tag(doc[1][1]) == "a:x"
+ @test tag(doc[1][2]) == "b:y"
+ end
+end
+
+#==============================================================================#
+# NODE CONSTRUCTORS #
+#==============================================================================#
+@testset "Node Constructors" begin
+ @testset "Text" begin
+ t = Text("hello")
+ @test nodetype(t) == Text
+ @test value(t) == "hello"
+ @test tag(t) === nothing
+ @test attributes(t) === nothing
+ end
+
+ @testset "Comment" begin
+ c = Comment(" a comment ")
+ @test nodetype(c) == Comment
+ @test value(c) == " a comment "
+ end
+
+ @testset "CData" begin
+ cd = CData("raw ")
+ @test nodetype(cd) == CData
+ @test value(cd) == "raw "
+ end
+
+ @testset "DTD" begin
+ d = DTD("html")
+ @test nodetype(d) == DTD
+ @test value(d) == "html"
+ end
+
+ @testset "Declaration" begin
+ decl = Declaration(; version="1.0", encoding="UTF-8")
+ @test nodetype(decl) == Declaration
+ @test decl["version"] == "1.0"
+ @test decl["encoding"] == "UTF-8"
+ end
+
+ @testset "Declaration with no attributes" begin
+ decl = Declaration()
+ @test nodetype(decl) == Declaration
+ @test attributes(decl) === nothing
+ end
+
+ @testset "ProcessingInstruction with content" begin
+ pi = ProcessingInstruction("target", "data here")
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "target"
+ @test value(pi) == "data here"
+ end
+
+ @testset "ProcessingInstruction without content" begin
+ pi = ProcessingInstruction("target")
+ @test nodetype(pi) == ProcessingInstruction
+ @test tag(pi) == "target"
+ @test value(pi) === nothing
+ end
+
+ @testset "Element with tag only" begin
+ el = Element("div")
+ @test nodetype(el) == Element
+ @test tag(el) == "div"
+ @test length(children(el)) == 0
+ end
+
+ @testset "Element with children" begin
+ el = Element("div", Text("hello"), Element("span"))
+ @test length(el) == 2
+ @test nodetype(el[1]) == Text
+ @test nodetype(el[2]) == Element
+ end
+
+ @testset "Element with attributes" begin
+ el = Element("div"; class="main", id="content")
+ @test el["class"] == "main"
+ @test el["id"] == "content"
+ end
+
+ @testset "Element with children and attributes" begin
+ el = Element("a", "click here"; href="http://example.com")
+ @test tag(el) == "a"
+ @test el["href"] == "http://example.com"
+ @test value(el[1]) == "click here"
+ end
+
+ @testset "Element auto-converts non-Node children to Text" begin
+ el = Element("p", 42)
+ @test nodetype(el[1]) == Text
+ @test value(el[1]) == "42"
+ end
+
+ @testset "Document" begin
+ doc = Document(
+ Declaration(; version="1.0"),
+ Element("root")
+ )
+ @test nodetype(doc) == Document
+ @test length(doc) == 2
+ @test nodetype(doc[1]) == Declaration
+ @test nodetype(doc[2]) == Element
+ end
+
+ @testset "Document with all node types" begin
+ doc = Document(
+ Declaration(; version="1.0"),
+ DTD("root"),
+ Comment("comment"),
+ ProcessingInstruction("pi", "data"),
+ Element("root", CData("cdata"), Text("text"))
+ )
+ @test map(nodetype, children(doc)) == [Declaration, DTD, Comment, ProcessingInstruction, Element]
+ @test length(doc[end]) == 2
+ @test nodetype(doc[end][1]) == CData
+ @test value(doc[end][1]) == "cdata"
+ @test nodetype(doc[end][2]) == Text
+ @test value(doc[end][2]) == "text"
+ end
+
+ @testset "invalid constructions" begin
+ @test_throws Exception Text("a", "b") # too many args
+ @test_throws Exception Comment("a"; x="1") # no attrs
+ @test_throws Exception CData("a"; x="1") # no attrs
+ @test_throws Exception DTD("a"; x="1") # no attrs
+ @test_throws Exception Element() # need tag
+ @test_throws Exception Declaration("bad") # no positional args
+ @test_throws Exception Document(; x="1") # no attrs
+ @test_throws Exception ProcessingInstruction() # need target
+ @test_throws Exception ProcessingInstruction("a", "b", "c") # too many args
+ end
+end
+
+#==============================================================================#
+# h CONSTRUCTOR #
+#==============================================================================#
+@testset "h constructor" begin
+ @testset "h(tag)" begin
+ el = h("div")
+ @test nodetype(el) == Element
+ @test tag(el) == "div"
+ end
+
+ @testset "h(tag, children...)" begin
+ el = h("div", "hello")
+ @test simple_value(el) == "hello"
+ end
+
+ @testset "h(tag; attrs...)" begin
+ el = h("div"; class="main")
+ @test el["class"] == "main"
+ end
+
+ @testset "h(tag, children...; attrs...)" begin
+ el = h("div", "hello"; class="main")
+ @test el["class"] == "main"
+ @test value(el[1]) == "hello"
+ end
+
+ @testset "h.tag syntax" begin
+ el = h.div("hello"; class="main")
+ @test tag(el) == "div"
+ @test el["class"] == "main"
+ @test value(el[1]) == "hello"
+ end
+
+ @testset "h.tag with no args" begin
+ el = h.br()
+ @test tag(el) == "br"
+ @test length(children(el)) == 0
+ end
+
+ @testset "h.tag with only attrs" begin
+ el = h.img(; src="image.png")
+ @test tag(el) == "img"
+ @test el["src"] == "image.png"
+ end
+
+ @testset "nested h constructors" begin
+ el = h.div(
+ h.h1("Title"),
+ h.p("Paragraph")
+ )
+ @test tag(el) == "div"
+ @test length(el) == 2
+ @test tag(el[1]) == "h1"
+ @test tag(el[2]) == "p"
+ end
+
+ @testset "h with symbol tag" begin
+ el = h(:div)
+ @test tag(el) == "div"
+ end
+end
+
+#==============================================================================#
+# NODE INTERFACE #
+#==============================================================================#
+@testset "Node Interface" begin
+ doc = parse("""text """, Node)
+
+ @testset "nodetype" begin
+ @test nodetype(doc) == Document
+ @test nodetype(doc[1]) == Declaration
+ @test nodetype(doc[2]) == Element
+ end
+
+ @testset "tag" begin
+ @test tag(doc) === nothing
+ @test tag(doc[2]) == "root"
+ @test tag(doc[2][1]) == "child"
+ end
+
+ @testset "attributes" begin
+ @test attributes(doc) === nothing
+ @test attributes(doc[2])["attr"] == "val"
+ end
+
+ @testset "value" begin
+ @test value(doc) === nothing
+ @test value(doc[2][1][1]) == "text"
+ end
+
+ @testset "children" begin
+ @test length(children(doc)) == 2
+ @test length(children(doc[2])) == 1
+ end
+
+ @testset "is_simple" begin
+ @test is_simple(doc[2][1]) == true
+ @test is_simple(doc[2]) == false
+ end
+
+ @testset "simple_value" begin
+ @test simple_value(doc[2][1]) == "text"
+ @test_throws ErrorException simple_value(doc[2])
+ end
+
+ @testset "simple_value for CData child" begin
+ el = Element("x", CData("data"))
+ @test is_simple(el)
+ @test simple_value(el) == "data"
+ end
+end
+
+#==============================================================================#
+# NODE INDEXING #
+#==============================================================================#
+@testset "Node Indexing" begin
+ doc = parse(" ", Node)
+ root = doc[1]
+
+ @testset "integer indexing" begin
+ @test tag(root[1]) == "a"
+ @test tag(root[2]) == "b"
+ @test tag(root[3]) == "c"
+ end
+
+ @testset "colon indexing" begin
+ all = root[:]
+ @test length(all) == 3
+ end
+
+ @testset "lastindex" begin
+ @test tag(root[end]) == "c"
+ end
+
+ @testset "only" begin
+ single = parse(" ", Node)
+ @test tag(only(single[1])) == "only"
+ end
+
+ @testset "length" begin
+ @test length(root) == 3
+ end
+
+ @testset "attribute indexing" begin
+ el = parse(""" """, Node)[1]
+ @test el["a"] == "1"
+ @test el["b"] == "2"
+ @test_throws KeyError el["nonexistent"]
+ end
+
+ @testset "haskey" begin
+ el = parse(""" """, Node)[1]
+ @test haskey(el, "a") == true
+ @test haskey(el, "b") == false
+ end
+
+ @testset "keys" begin
+ el = parse(""" """, Node)[1]
+ @test collect(keys(el)) == ["a", "b"]
+ end
+
+ @testset "keys on element with no attributes" begin
+ el = parse(" ", Node)[1]
+ @test isempty(keys(el))
+ end
+end
+
+#==============================================================================#
+# NODE MUTATION #
+#==============================================================================#
+@testset "Node Mutation" begin
+ @testset "setindex! child" begin
+ el = Element("root", Element("old"))
+ el[1] = Element("new")
+ @test tag(el[1]) == "new"
+ end
+
+ @testset "setindex! child with auto-conversion" begin
+ el = Element("root", Text("old"))
+ el[1] = "new text"
+ @test value(el[1]) == "new text"
+ end
+
+ @testset "setindex! attribute" begin
+ el = Element("root"; a="1")
+ el["a"] = "2"
+ @test el["a"] == "2"
+ end
+
+ @testset "setindex! new attribute" begin
+ el = Element("root"; a="1")
+ el["b"] = "2"
+ @test el["b"] == "2"
+ end
+
+ @testset "push! child" begin
+ el = Element("root")
+ push!(el, Element("child"))
+ @test length(el) == 1
+ @test tag(el[1]) == "child"
+ end
+
+ @testset "push! with auto-conversion" begin
+ el = Element("root")
+ push!(el, "text")
+ @test nodetype(el[1]) == Text
+ @test value(el[1]) == "text"
+ end
+
+ @testset "pushfirst! child" begin
+ el = Element("root", Element("second"))
+ pushfirst!(el, Element("first"))
+ @test tag(el[1]) == "first"
+ @test tag(el[2]) == "second"
+ end
+
+ @testset "push! on non-container node errors" begin
+ t = Text("hello")
+ @test_throws ErrorException push!(t, "more")
+ end
+end
+
+#==============================================================================#
+# NODE EQUALITY #
+#==============================================================================#
+@testset "Node Equality" begin
+ @testset "identical elements are equal" begin
+ a = Element("div", Text("hello"); class="main")
+ b = Element("div", Text("hello"); class="main")
+ @test a == b
+ end
+
+ @testset "different tag names are not equal" begin
+ @test Element("a") != Element("b")
+ end
+
+ @testset "different attributes are not equal" begin
+ @test Element("a"; x="1") != Element("a"; x="2")
+ end
+
+ @testset "different children are not equal" begin
+ @test Element("a", Text("x")) != Element("a", Text("y"))
+ end
+
+ @testset "different node types are not equal" begin
+ @test Text("x") != Comment("x")
+ end
+
+ @testset "empty attributes vs nothing" begin
+ a = Element("a")
+ b = Element("a")
+ @test a == b
+ end
+
+ @testset "parse equality" begin
+ xml = "text "
+ @test parse(xml, Node) == parse(xml, Node)
+ end
+end
+
+#==============================================================================#
+# XML WRITING #
+#==============================================================================#
+@testset "XML Writing" begin
+ @testset "write Text" begin
+ el = Element("p", "hello & goodbye")
+ @test XML.write(el) == "hello & goodbye
"
+ end
+
+ @testset "write Element with attributes" begin
+ el = Element("div"; class="main", id="content")
+ s = XML.write(el)
+ @test contains(s, "
")
+ end
+
+ @testset "write self-closing element" begin
+ @test XML.write(Element("br")) == " "
+ end
+
+ @testset "write element with single text child (inline)" begin
+ @test XML.write(Element("p", "hello")) == "hello
"
+ end
+
+ @testset "write element with multiple children (indented)" begin
+ el = Element("div", Element("a"), Element("b"))
+ s = XML.write(el)
+ @test contains(s, "")
+ @test contains(s, "
")
+ @test contains(s, "
")
+ @test contains(s, "
")
+ end
+
+ @testset "write Comment" begin
+ el = Element("root", Comment(" comment "))
+ @test contains(XML.write(el), "")
+ end
+
+ @testset "write CData" begin
+ el = Element("root", CData("raw "))
+ @test contains(XML.write(el), "]]>")
+ end
+
+ @testset "write ProcessingInstruction with content" begin
+ pi = ProcessingInstruction("target", "data")
+ @test XML.write(pi) == ""
+ end
+
+ @testset "write ProcessingInstruction without content" begin
+ pi = ProcessingInstruction("target")
+ @test XML.write(pi) == ""
+ end
+
+ @testset "write Declaration" begin
+ decl = Declaration(; version="1.0", encoding="UTF-8")
+ s = XML.write(decl)
+ @test contains(s, "")
+ end
+
+ @testset "write DTD" begin
+ dtd = DTD("html")
+ @test XML.write(dtd) == ""
+ end
+
+ @testset "write Document" begin
+ doc = Document(Declaration(; version="1.0"), Element("root"))
+ s = XML.write(doc)
+ @test startswith(s, "")
+ end
+
+ @testset "write escapes special characters in text" begin
+ el = Element("p", "a < b & c > d")
+ @test XML.write(el) == "a < b & c > d
"
+ end
+
+ @testset "write escapes special characters in attribute values" begin
+ el = Element("x"; a="a\"b")
+ @test contains(XML.write(el), "a=\"a"b\"")
+ end
+
+ @testset "indentsize parameter" begin
+ el = Element("root", Element("child"))
+ s2 = XML.write(el; indentsize=2)
+ s4 = XML.write(el; indentsize=4)
+ @test contains(s2, " ")
+ @test contains(s4, " ")
+ end
+
+ @testset "write xml:space='preserve' respects whitespace" begin
+ el = Element("root", Element("p", Text(" hello "); var"xml:space"="preserve"))
+ s = XML.write(el)
+ @test contains(s, "> hello
")
+ end
+end
+
+#==============================================================================#
+# WRITE TO FILE / READ FROM FILE #
+#==============================================================================#
+@testset "File I/O" begin
+ @testset "write and read back" begin
+ doc = Document(
+ Declaration(; version="1.0"),
+ Element("root", Element("child", "text"))
+ )
+ temp = tempname() * ".xml"
+ XML.write(temp, doc)
+ content = read(temp, String)
+ @test contains(content, "")
+ @test contains(content, "