diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5e073ac..b52ab78 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -26,7 +26,7 @@ jobs: - os: macOS-latest arch: x86 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} @@ -41,9 +41,13 @@ jobs: ${{ runner.os }}-test-${{ env.cache-name }}- ${{ runner.os }}-test- ${{ runner.os }}- + - uses: actions/cache@v4 + with: + path: test/data/w3c + key: w3c-xmlconf-v20130923 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v5 with: - file: lcov.info + files: lcov.info diff --git a/.gitignore b/.gitignore index b000475..929dfc2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ *Manifest.toml -*generated_xsd.jl -*.xml *.gz +*.tar *.DS_Store +*.claude +test/data/w3c/ +benchmarks/data/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..13d6e29 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,158 @@ +# Changelog + +All notable changes to XML.jl will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- New streaming tokenizer (`XMLTokenizer` module) for fine-grained XML token iteration. +- XPath support via `xpath(node, path)`. +- `test/test_libxml2_testcases.jl`: 243 test cases borrowed from the [libxml2](https://github.com/GNOME/libxml2) test suite covering CDATA, comments, processing instructions, attributes, namespaces, DTD internal subsets, entity references, whitespace handling, Unicode, error cases, and real-world document patterns. 
+- `AbstractTrees` package extension: loading both `XML` and `AbstractTrees` enables `print_tree`, `PreOrderDFS`, `Leaves`, etc. on `Node` and `LazyNode`. + +### Fixed +- **Tokenizer: multi-byte UTF-8 in attribute values** — Parsing attribute values containing multi-byte UTF-8 characters (e.g., `value="café"`) could produce a `StringIndexError` because `attr_value()` used byte arithmetic (`ncodeunits - 1`) instead of `prevind` to strip quotes. The same issue existed in `_read_attr_value!`. +- **Tokenizer: quotes inside DTD comments** — A `"` or `'` character inside a `<!-- ... -->` comment within a DTD internal subset caused the tokenizer to misinterpret it as a quoted string delimiter, leading to an "Unterminated quoted string" error. The DOCTYPE body parser now correctly skips comment content. + +## [0.3.8] + +### Fixed +- `XML.write` now respects `xml:space="preserve"` and suppresses indentation for elements with this attribute ([#49]). + +## [0.3.7] + +### Fixed +- Resolved remaining issues from [#45] and fixed [#46] (whitespace preservation edge cases) ([#47]). + +## [0.3.6] + +### Added +- `XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation ([#45]). + +### Fixed +- `String` type ambiguity on Julia nightly resolved ([#38]). + +## [0.3.5] + +### Fixed +- `depth` and `parent` functions corrected to work properly with the DOM tree API ([#37]). +- `escape` updated to no longer be idempotent — every `&` is now escaped, matching spec behavior ([#32], addressing [#31]). +- `pushfirst!` support added for `Node` children ([#29]). + +## [0.3.4] + +### Fixed +- Fixed [#26]. +- CI updated to use `julia-actions/cache@v4` and `lts` Julia version. + +## [0.3.3] + +### Added +- `h` constructor for concise element creation (e.g., `h.div("hello"; class="main")`). + +### Fixed +- Path definition error in README example ([#20]). + +## [0.3.2] + +### Fixed +- Minor typos. + +## [0.3.1] + +### Added +- Julia 1.6 compatibility ([#16]). 
+ +### Changed +- Smarter escaping logic. + +## [0.3.0] + +### Changed +- Attribute internal representation changed from `Dict` to `OrderedDict` (later reverted to `Vector{Pair}`). + +## [0.2.3] + +### Fixed +- Parse method fix. + +## [0.2.2] + +### Added +- DTD parsing via `parse_dtd`. +- `is_simple` and `simple_value` exports. +- `setindex!` methods for modifying attributes. +- `unescape` function. + +### Fixed +- DOCTYPE parsing made case-insensitive. + +## [0.2.1] + +### Fixed +- Write output fixes. + +## [0.2.0] + +### Changed +- Major rewrite: introduced `NodeType` enum, `Node{S}` parametric struct, callable `NodeType` constructors, and `XML.write`. +- Processing instruction support. +- Benchmarks added. + +## [0.1.3] + +### Changed +- Improved print output for `AbstractXMLNode`. + +## [0.1.2] + +### Added +- AbstractTrees 0.4 compatibility ([#5]). + +## [0.1.1] + +### Added +- `Node` implementation with `print_tree`. +- Color output in REPL display. +- Stopped stripping whitespace from text nodes. + +## [0.1.0] + +- Initial release. 
+ +[Unreleased]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.8...HEAD +[0.3.8]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.7...v0.3.8 +[0.3.7]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.6...v0.3.7 +[0.3.6]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.5...v0.3.6 +[0.3.5]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.4...v0.3.5 +[0.3.4]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.3...v0.3.4 +[0.3.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.2...v0.3.3 +[0.3.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.1...v0.3.2 +[0.3.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.3.0...v0.3.1 +[0.3.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.3...v0.3.0 +[0.2.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.2...v0.2.3 +[0.2.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.1...v0.2.2 +[0.2.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.2.0...v0.2.1 +[0.2.0]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.3...v0.2.0 +[0.1.3]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.2...v0.1.3 +[0.1.2]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.1...v0.1.2 +[0.1.1]: https://github.com/JuliaComputing/XML.jl/compare/v0.1.0...v0.1.1 +[0.1.0]: https://github.com/JuliaComputing/XML.jl/releases/tag/v0.1.0 + +[#5]: https://github.com/JuliaComputing/XML.jl/pull/5 +[#16]: https://github.com/JuliaComputing/XML.jl/pull/16 +[#20]: https://github.com/JuliaComputing/XML.jl/pull/20 +[#26]: https://github.com/JuliaComputing/XML.jl/issues/26 +[#29]: https://github.com/JuliaComputing/XML.jl/pull/29 +[#31]: https://github.com/JuliaComputing/XML.jl/issues/31 +[#32]: https://github.com/JuliaComputing/XML.jl/pull/32 +[#37]: https://github.com/JuliaComputing/XML.jl/pull/37 +[#38]: https://github.com/JuliaComputing/XML.jl/pull/38 +[#43]: https://github.com/JuliaComputing/XML.jl/issues/43 +[#45]: https://github.com/JuliaComputing/XML.jl/pull/45 +[#46]: 
https://github.com/JuliaComputing/XML.jl/issues/46 +[#47]: https://github.com/JuliaComputing/XML.jl/pull/47 +[#49]: https://github.com/JuliaComputing/XML.jl/pull/49 diff --git a/Project.toml b/Project.toml index 49b96c0..a42a821 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,14 @@ name = "XML" uuid = "72c71f33-b9b6-44de-8c94-c961784809e2" +version = "0.4.0" authors = ["Josh Day and contributors"] -version = "0.3.8" -[deps] -Mmap = "a63ad114-7e13-5084-954f-fe012c677804" -OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +[weakdeps] +AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" + +[extensions] +XMLAbstractTreesExt = "AbstractTrees" [compat] -OrderedCollections = "1.4, 1.5" -julia = "1.6" +AbstractTrees = "0.4" +julia = "1.9" diff --git a/README.md b/README.md index ddb1156..067c06d 100644 --- a/README.md +++ b/README.md @@ -4,39 +4,8 @@

Read and write XML in pure Julia.

-

- -# Introduction - -This package offers fast data structures for reading and writing XML files with a consistent interface: - -
- -### `Node`/`LazyNode` Interface: - -``` -nodetype(node) → XML.NodeType (an enum type) -tag(node) → String or Nothing -attributes(node) → OrderedDict{String, String} or Nothing -value(node) → String or Nothing -children(node) → Vector{typeof(node)} -is_simple(node) → Bool (whether node is simple .e.g. item) -simple_value(node) → e.g. "item" from item) -``` -
-### Extended Interface for `LazyNode` - -``` -depth(node) → Int -next(node) → typeof(node) -prev(node) → typeof(node) -parent(node) → typeof(node) -``` - -

- # Quickstart ```julia @@ -58,79 +27,76 @@ doc[end][2] # Second child of root # Node Element (6 children) ``` -

- -# Data Structures that Represent XML Nodes +
-## Preliminary: `NodeType` +# `Node` Interface -- Each item in an XML DOM is classified by its `NodeType`. -- Every `XML.jl` struct defines a `nodetype(x)` method that returns its `NodeType`. +Every node in the XML DOM is represented by `Node`, a single type parametrized on its string storage. -| NodeType | XML Representation | `Node` Constructor | -|----------|--------------------|------------------| -| `Document` | An entire document | `Document(children...)` -| `DTD` | `` | `DTD(...) ` -| `Declaration` | `` | `Declaration(; attrs...)` -| `ProcessingInstruction` | `` | `ProcessingInstruction(tag; attrs...)` -| `Comment` | `` | `Comment(text)` -| `CData` | `` | `CData(text)` -| `Element` | ` children... ` | `Element(tag, children...; attrs...)` -| `Text` | the `text` part of `text` | `Text(text)` +``` +nodetype(node) -> XML.NodeType (an enum) +tag(node) -> String or Nothing +attributes(node) -> XML.Attributes{String} or Nothing +value(node) -> String or Nothing +children(node) -> Vector{Node} +is_simple(node) -> Bool (e.g. text) +simple_value(node) -> e.g. "text" from text +```
-## `Node`: Probably What You're Looking For +## `NodeType` -- `read`-ing a `Node` loads the entire XML DOM in memory. -- See the table above for convenience constructors. -- `Node`s have some additional methods that aid in construction/mutation: +Each item in an XML DOM is classified by its `NodeType`: -```julia -# Add a child: -push!(parent::Node, child::Node) - -# Replace a child: -parent[2] = child - -# Add/change an attribute: -node["key"] = value +| NodeType | XML Representation | Constructor | +|----------|--------------------|-------------| +| `Document` | An entire document | `Document(children...)` | +| `DTD` | `<!DOCTYPE ...>` | `DTD(...)` | +| `Declaration` | `<?xml ...?>` | `Declaration(; attrs...)` | +| `ProcessingInstruction` | `<?tag ...?>` | `ProcessingInstruction(tag; attrs...)` | +| `Comment` | `<!-- ... -->` | `Comment(text)` | +| `CData` | `<![CDATA[...]]>` | `CData(text)` | +| `Element` | `<tag attrs> children... </tag>` | `Element(tag, children...; attrs...)` | +| `Text` | the `text` part of `<tag>text</tag>` | `Text(text)` | -node["key"] -``` +
-- `Node` is an immutable type. However, you can easily create a copy with one or more field values changed by using the `Node(::Node, children...; attrs...)` constructor where `children` are appended to the source node's children and `attrs` are appended to the node's attributes. +## Mutation ```julia -node = XML.Element("tag", "child") -# Node Element (1 child) +push!(parent, child) # Add a child +parent[2] = child # Replace a child +node["key"] = "value" # Add/change an attribute +node["key"] # Get an attribute +``` -simple_value(node) -# "child" +
-node2 = Node(node, "added"; id="my-id") -# Node Element (2 children) +## Tree Navigation -node2.children -# 2-element Vector{Node}: -# Node Text "child" -# Node Text "added" +```julia +depth(child, root) # Depth of child relative to root +parent(child, root) # Parent of child within root's tree +siblings(child, root) # Siblings of child within root's tree ``` -### Writing `Element` `Node`s with `XML.h` +
+ +## Writing Elements with `XML.h` Similar to [Cobweb.jl](https://github.com/JuliaComputing/Cobweb.jl#-creating-nodes-with-cobwebh), `XML.h` enables you to write elements with a simpler syntax: ```julia using XML: h -julia> node = h.parent( - h.child("first child content", id="id1"), - h.child("second child content", id="id2") - ) +node = h.parent( + h.child("first child content", id="id1"), + h.child("second child content", id="id2") +) # Node Element (2 children) -julia> print(XML.write(node)) +print(XML.write(node)) # # first child content # second child content @@ -139,111 +105,228 @@ julia> print(XML.write(node))
-## `XML.LazyNode`: For Fast Iteration through an XML File - -A lazy data structure that just keeps track of the position in the raw data (`Vector{UInt8}`) to read from. - -- You can iterate over a `LazyNode` to "read" through an XML file: - -```julia -doc = read(filename, LazyNode) - -foreach(println, doc) -# LazyNode Declaration -# LazyNode Element -# LazyNode Element -# LazyNode Element -# LazyNode Text "Gambardella, Matthew" -# LazyNode Element -# ⋮ -``` - -<br><br> - # Reading ```julia -# Reading from file: +# From a file: read(filename, Node) -read(filename, LazyNode) - -# Parsing from string: -parse(Node, str) -parse(LazyNode, str) +# From a string: +parse(str, Node) ``` -<br><br> +<br> # Writing ```julia XML.write(filename::String, node) # write to file +XML.write(io::IO, node) # write to stream +XML.write(node) # return String +``` + +`XML.write` respects `xml:space="preserve"` on elements, suppressing automatic indentation. + +<br> -XML.write(io::IO, node) # write to stream +# XPath -XML.write(node) # String +Query nodes using a subset of XPath 1.0 via `xpath(node, path)`: + +```julia +doc = parse(""" +<root> + <a id="1"><b>hello</b></a> + <a id="2"><b>world</b></a> +</root> +""", Node) + +root = doc[end] + +xpath(root, "//b") # All <b> descendants +xpath(root, "a[@id='2']/b") # <b> inside <a id="2"> +xpath(root, "a[1]") # First <a> child +xpath(root, "//b/text()") # Text nodes inside all <b>s ``` +### Supported syntax + +| Expression | Description | +|------------|-------------| +| `/` | Root / path separator | +| `tag` | Child element by name | +| `*` | Any child element | +| `//` | Descendant-or-self (recursive) | +| `.` | Current node | +| `..` | Parent node | +| `[n]` | Positional predicate (1-based) | +| `[@attr]` | Has-attribute predicate | +| `[@attr='v']` | Attribute-value predicate | +| `text()` | Text node children | +| `node()` | All node children | +| `@attr` | Attribute value (returns strings) | -<br><br> +<br> -# Performance +# Streaming 
Tokenizer -- XML.jl performs comparatively to [EzXML.jl](https://github.com/JuliaIO/EzXML.jl), which wraps the C library [libxml2](https://gitlab.gnome.org/GNOME/libxml2/-/wikis/home). -- See the `benchmarks/suite.jl` for the code to produce these results. -- The following output was generated in a Julia session with the following `versioninfo`: +For large files or when you need fine-grained control, `XML.XMLTokenizer` provides a streaming tokenizer that yields tokens without building a DOM. Token kinds live in the `XML.XMLTokenizer.TokenKinds` baremodule (e.g. `TokenKinds.OPEN_TAG`, `TokenKinds.TEXT`). -``` -julia> versioninfo() -Julia Version 1.9.4 -Commit 8e5136fa297 (2023-11-14 08:46 UTC) -Build Info: - Official https://julialang.org/ release -Platform Info: - OS: macOS (arm64-apple-darwin22.4.0) - CPU: 10 × Apple M1 Pro - WORD_SIZE: 64 - LIBM: libopenlibm - LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1) - Threads: 8 on 8 virtual cores +```julia +using XML.XMLTokenizer: tokenize + +for token in tokenize("<root><child attr=\"val\">text</child></root>") + println(token.kind, " => ", repr(String(token.raw))) +end +# OPEN_TAG => "<root" +# TAG_CLOSE => ">" +# OPEN_TAG => "<child" +# ATTR_NAME => "attr" +# ATTR_VALUE => "\"val\"" +# TAG_CLOSE => ">" +# TEXT => "text" +# CLOSE_TAG => "</child" +# TAG_CLOSE => ">" +# CLOSE_TAG => "</root" +# TAG_CLOSE => ">" ``` +<br> -### Reading an XML File +# `LazyNode` -``` - XML.LazyNode 0.009583 - XML.Node ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1071.32 - EzXML.readxml ■■■■■■■■■ 284.346 - XMLDict.xml_dict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 1231.47 +For read-only access without building a full DOM tree, use `LazyNode`. 
It stores only a reference to the source string and re-tokenizes on demand, using significantly less memory: + +```julia +doc = parse(xml_string, LazyNode) +doc = read("file.xml", LazyNode) ``` -### Writing an XML File +`LazyNode` supports the same read-only interface as `Node`: `nodetype`, `tag`, `attributes`, `value`, `children`, `is_simple`, `simple_value`, plus integer and string indexing. -``` - Write: XML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 289.638 - Write: EzXML ■■■■■■■■■■■■■ 93.4631 -``` +For streaming and high-throughput workloads, several extra accessors avoid materializing intermediate collections: -### Lazily Iterating over Each Node -``` - LazyNode ■■■■■■■■■ 51.752 - EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 226.271 +```julia +sourcetext(n) # zero-copy SubString view of the node's raw source bytes +eachchildnode(n) # lazy iterator over children — no Vector allocation +children!(buf, n) # collect children into a reusable buffer +eachattribute(n) # lazy iterator over attribute name=>value pairs +is_simple_value(n) # combined is_simple + simple_value (one tokenizer pass) +get(n, key, default) # single-attribute read without building Attributes +XML.write(n) # zero-copy: returns node's original source text +XML.write(n; normalize=true) # re-parse + pretty-print, collapses source whitespace ``` -### Collecting All Names/Tags in an XML File -``` - XML.LazyNode ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 210.482 - EzXML.StreamReader ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 276.238 - EzXML.readxml ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 263.269 +### Memory-mapped files + +For very large files, combine `LazyNode` with memory mapping to avoid reading the entire file into heap memory: + +```julia +using XML, Mmap, StringViews + +doc = open("very_large.xml") do io + sv = StringView(Mmap.mmap(io)) + parse(sv, LazyNode) +end ``` <br> + +# AbstractTrees Integration + +Loading [`AbstractTrees`](https://github.com/JuliaCollections/AbstractTrees.jl) 
alongside XML enables tree-walking utilities (`print_tree`, `PreOrderDFS`, `Leaves`, etc.) on both `Node` and `LazyNode`: + +```julia +using XML, AbstractTrees + +doc = parse("<a><b/><c><d/></c></a>", Node) +print_tree(doc) +# Document +# └─ <a> +# ├─ <b> +# └─ <c> +# └─ <d> + +for n in PreOrderDFS(doc) + nodetype(n) == Element && println(tag(n)) +end +``` + <br> -# Possible Gotchas +# Benchmarks + +Benchmark source: [benchmarks.jl](benchmarks/benchmarks.jl). Test data: `books.xml` (small, ~4 KB) and a generated XMark auction XML (medium, ~14 MB). + + + +``` + Parse (small) — median time (ms) + + XML.jl ■■■■■■■ 0.0374 + XML.jl (SS) ■■■■■■■ 0.0339 + EzXML ■■■■ 0.0218 + LightXML ■■■■ 0.0218 + XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.200 + + + Parse (medium) — median time (ms) + + XML.jl ■■■■■■■■■■■■■■ 185.0 + XML.jl (SS) ■■■■■■■■■■■■■ 168.0 + EzXML ■■■■■■ 81.5 + LightXML ■■■■■■■■ 107.0 + XMLDict ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 520.0 + + + Write (small) — median time (ms) + + XML.jl ■■■■ 0.00929 + EzXML ■■■■ 0.0103 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.101 + + + Write (medium) — median time (ms) + + XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 48.0 + EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 52.6 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 56.1 + -- XML.jl doesn't automatically escape special characters (`<`, `>`, `&`, `"`, and `'` ) for you. However, we provide utility functions for doing the conversions back and forth: - - `XML.escape(::String)` and `XML.unescape(::String)` - - `XML.escape!(::Node)` and `XML.unescape!(::Node)`. 
+ Read file — median time (ms) + + XML.jl ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 193.0 + EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■ 121.0 + LightXML ■■■■■■■■■■■■■■■■■■■■ 95.6 + + + Collect tags (small) — median time (ms) + + XML.jl ■■■■■■ 0.000586 + EzXML ■■■■■■■■■■■■■■■■■■■■■■ 0.00205 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 0.00368 + + + Collect tags (medium) — median time (ms) + + XML.jl ■■■■■■■■■■■■■■■■■■ 13.1 + EzXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 29.4 + LightXML ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 23.2 +``` + +```julia +versioninfo() +# Julia Version 1.12.6 +# Commit 15346901f00 (2026-04-09 19:20 UTC) +# Build Info: +# Official https://julialang.org release +# Platform Info: +# OS: macOS (arm64-apple-darwin24.0.0) +# CPU: 10 × Apple M1 Pro +# WORD_SIZE: 64 +# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1) +# GC: Built with stock GC +# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores) +# Environment: +# JULIA_NUM_THREADS = auto +``` diff --git a/benchmarks/Project.toml b/benchmarks/Project.toml index ed90996..043988c 100644 --- a/benchmarks/Project.toml +++ b/benchmarks/Project.toml @@ -2,7 +2,8 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" -OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179" UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" XML = "72c71f33-b9b6-44de-8c94-c961784809e2" XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5" diff --git a/benchmarks/XMarkGenerator.jl b/benchmarks/XMarkGenerator.jl new file mode 100644 index 0000000..7f780a0 --- /dev/null +++ b/benchmarks/XMarkGenerator.jl @@ -0,0 +1,377 @@ +""" + XMarkGenerator + +XMark-inspired XML benchmark data generator. Produces well-formed XML documents modeling an +internet auction site, following the XMark benchmark DTD structure. 
+ + include("XMarkGenerator.jl") + using .XMarkGenerator + + xml = generate_xmark(1.0) # return String (~14 MB) + generate_xmark("out.xml", 5.0) # write to file (~68 MB) + generate_xmark(stdout, 0.1; seed=123) # write to IO (~1.4 MB) +""" +module XMarkGenerator + +using Random + +export generate_xmark + +#-----------------------------------------------------------------# Word lists +const WORDS = [ + "about", "above", "across", "after", "again", "against", "along", "already", "also", + "always", "among", "another", "answer", "around", "asked", "away", "back", "because", + "become", "been", "before", "began", "behind", "being", "below", "between", "body", + "book", "both", "brought", "build", "built", "business", "came", "cannot", "carry", + "cause", "certain", "change", "children", "city", "close", "come", "complete", "could", + "country", "course", "cover", "current", "dark", "days", "deep", "development", + "different", "direction", "does", "done", "door", "down", "draw", "during", "each", + "early", "earth", "east", "education", "effort", "eight", "either", "else", "end", + "enough", "even", "every", "example", "experience", "face", "fact", "family", "feel", + "field", "find", "first", "five", "follow", "food", "force", "form", "found", "four", + "from", "full", "gave", "general", "give", "going", "gone", "good", "government", + "great", "green", "ground", "group", "grow", "half", "hand", "happen", "hard", "have", + "head", "help", "here", "high", "himself", "hold", "home", "hope", "house", "however", + "hundred", "idea", "important", "inch", "include", "increase", "island", "just", "keep", + "kind", "knew", "know", "land", "large", "last", "later", "learn", "left", "less", + "letter", "life", "light", "like", "line", "list", "little", "live", "long", "look", + "lost", "made", "main", "make", "many", "mark", "matter", "mean", "might", "mind", + "miss", "money", "morning", "most", "mother", "move", "much", "music", "must", "name", + "near", "need", "never", 
"next", "night", "nothing", "notice", "number", "often", + "once", "only", "open", "order", "other", "over", "page", "paper", "part", "past", + "pattern", "people", "perhaps", "period", "person", "picture", "place", "plan", "plant", + "play", "point", "position", "possible", "power", "present", "problem", "produce", + "product", "program", "public", "pull", "purpose", "question", "quite", "reach", "read", + "real", "receive", "record", "remember", "rest", "result", "right", "river", "room", + "round", "rule", "same", "school", "second", "seem", "sentence", "service", "seven", + "several", "shall", "short", "should", "show", "side", "since", "sing", "size", "small", + "social", "some", "song", "soon", "south", "space", "stand", "start", "state", "still", + "stood", "story", "strong", "study", "such", "sure", "system", "table", "take", "tell", + "test", "their", "them", "then", "there", "these", "thing", "think", "those", "thought", + "three", "through", "time", "together", "took", "toward", "travel", "tree", "true", + "turn", "under", "unit", "until", "upon", "usually", "value", "very", "voice", "walk", + "want", "watch", "water", "well", "went", "were", "west", "what", "where", "which", + "while", "white", "whole", "will", "with", "without", "woman", "word", "work", "world", + "would", "write", "year", "young", +] +const FIRST_NAMES = ["James", "John", "Robert", "Michael", "William", "David", "Richard", + "Joseph", "Thomas", "Charles", "Mary", "Patricia", "Jennifer", "Linda", "Barbara", + "Elizabeth", "Susan", "Jessica", "Sarah", "Karen"] +const LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", + "Davis", "Rodriguez", "Martinez", "Wilson", "Anderson", "Taylor", "Thomas", "Hernandez", + "Moore", "Martin", "Jackson", "Thompson", "White"] +const COUNTRIES = ["United States", "Germany", "France", "Japan", "Australia", "Brazil", + "Canada", "India", "China", "Mexico", "Argentina", "Spain", "Italy", "United Kingdom", + "Netherlands", 
"Sweden", "Norway", "Finland", "Denmark", "Belgium"] +const CITIES = ["New York", "London", "Paris", "Tokyo", "Sydney", "Berlin", "Rome", + "Madrid", "Amsterdam", "Toronto", "Moscow", "Beijing", "Seoul", "Mumbai", "Cairo", + "Dublin", "Prague", "Vienna", "Warsaw", "Budapest"] +const STREETS = ["Main", "Oak", "Elm", "Maple", "Pine", "Cedar", "Birch", "Walnut", + "Cherry", "Ash", "Spruce", "Willow", "Poplar", "Laurel", "Juniper"] +const EDUCATIONS = ["High School", "College", "Graduate", "Associate", "Master", "Doctorate"] +const GENDERS = ["male", "female"] +const PAYMENTS = ["Creditcard", "Money order", "Personal check", "Cash"] +const SHIPPING = ["Will ship only within country", "Will ship internationally", + "Buyer pays fixed shipping costs", "Free shipping", "See description for shipping"] +const REGIONS = ["africa", "asia", "australia", "europe", "namerica", "samerica"] + +#-----------------------------------------------------------------# Random data helpers +rand_word(rng) = rand(rng, WORDS) +rand_date(rng) = string(rand(rng, 1999:2025), "/", lpad(rand(rng, 1:12), 2, '0'), "/", lpad(rand(rng, 1:28), 2, '0')) +rand_time(rng) = string(lpad(rand(rng, 0:23), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0'), ":", lpad(rand(rng, 0:59), 2, '0')) +rand_price(rng) = string(rand(rng, 1:9999), ".", lpad(rand(rng, 0:99), 2, '0')) +rand_phone(rng) = string("+", rand(rng, 1:99), " (", rand(rng, 100:999), ") ", rand(rng, 1000000:9999999)) +rand_zip(rng) = string(lpad(rand(rng, 0:99999), 5, '0')) +rand_cc(rng) = join(rand(rng, 1000:9999, 4), " ") +rand_email(rng) = string(lowercase(rand(rng, FIRST_NAMES)), rand(rng, 1:999), "@", lowercase(rand(rng, LAST_NAMES)), ".com") + +#-----------------------------------------------------------------# XML writing helpers +function xml_escape_char(io::IO, c::Char) + if c == '&'; print(io, "&amp;") + elseif c == '<'; print(io, "&lt;") + elseif c == '>'; print(io, "&gt;") + elseif c == '"'; print(io, "&quot;") + else; print(io, c) + end +end + +function 
write_escaped(io::IO, s::AbstractString) + for c in s + xml_escape_char(io, c) + end +end + +function write_text_content(rng, io; min_words=10, max_words=50) + n = rand(rng, min_words:max_words) + for i in 1:n + i > 1 && print(io, ' ') + w = rand_word(rng) + r = rand(rng) + if r < 0.03 + print(io, "<bold>", w, "</bold>") + elseif r < 0.06 + print(io, "<emph>", w, "</emph>") + elseif r < 0.08 + print(io, "<keyword>", w, "</keyword>") + else + print(io, w) + end + end +end + +function write_description(rng, io, indent) + println(io, indent, "<description>") + if rand(rng) < 0.7 + print(io, indent, " <text>") + write_text_content(rng, io; min_words=15, max_words=80) + println(io, "</text>") + else + println(io, indent, " <parlist>") + for _ in 1:rand(rng, 2:6) + print(io, indent, " <listitem><text>") + write_text_content(rng, io; min_words=8, max_words=40) + println(io, "</text></listitem>") + end + println(io, indent, " </parlist>") + end + println(io, indent, "</description>") +end + +function write_annotation(rng, io, indent, n_people) + println(io, indent, "<annotation>") + println(io, indent, " <author person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + write_description(rng, io, string(indent, " ")) + println(io, indent, " <happiness>", rand(rng, 1:10), "</happiness>") + println(io, indent, "</annotation>") +end + +#-----------------------------------------------------------------# Section writers +function write_item(rng, io, id, n_categories) + featured = rand(rng) < 0.1 ? 
" featured=\"yes\"" : "" + println(io, " <item id=\"", string("item",id), "\"", featured, ">") + println(io, " <location>", rand(rng, CITIES), "</location>") + println(io, " <quantity>", rand(rng, 1:50), "</quantity>") + println(io, " <name>", rand_word(rng), " ", rand_word(rng), " ", rand_word(rng), "</name>") + println(io, " <payment>", rand(rng, PAYMENTS), "</payment>") + write_description(rng, io, " ") + println(io, " <shipping>", rand(rng, SHIPPING), "</shipping>") + for _ in 1:rand(rng, 1:3) + println(io, " <incategory category=\"", string("category",rand(rng, 1:n_categories)), "\"/>") + end + println(io, " <mailbox>") + for _ in 1:rand(rng, 0:5) + println(io, " <mail>") + println(io, " <from>", rand_email(rng), "</from>") + println(io, " <to>", rand_email(rng), "</to>") + println(io, " <date>", rand_date(rng), "</date>") + print(io, " <text>") + write_text_content(rng, io; min_words=10, max_words=60) + println(io, "</text>") + println(io, " </mail>") + end + println(io, " </mailbox>") + println(io, " </item>") +end + +function write_categories(rng, io, n) + println(io, " <categories>") + for i in 1:n + println(io, " <category id=\"", string("category",i), "\">") + println(io, " <name>", rand_word(rng), " ", rand_word(rng), "</name>") + write_description(rng, io, " ") + println(io, " </category>") + end + println(io, " </categories>") +end + +function write_catgraph(rng, io, n_edges, n_categories) + println(io, " <catgraph>") + for _ in 1:n_edges + from = string("category",rand(rng, 1:n_categories)) + to = string("category",rand(rng, 1:n_categories)) + println(io, " <edge from=\"", from, "\" to=\"", to, "\"/>") + end + println(io, " </catgraph>") +end + +function write_people(rng, io, n, n_categories, n_open) + println(io, " <people>") + for i in 1:n + println(io, " <person id=\"", string("person",i), "\">") + println(io, " <name>", rand(rng, FIRST_NAMES), " ", rand(rng, LAST_NAMES), "</name>") + println(io, " <emailaddress>", rand_email(rng), 
"</emailaddress>") + if rand(rng) < 0.8 + println(io, " <phone>", rand_phone(rng), "</phone>") + end + if rand(rng) < 0.7 + println(io, " <address>") + println(io, " <street>", rand(rng, 1:9999), " ", rand(rng, STREETS), " St</street>") + println(io, " <city>", rand(rng, CITIES), "</city>") + println(io, " <country>", rand(rng, COUNTRIES), "</country>") + if rand(rng) < 0.5 + println(io, " <province>", rand_word(rng), "</province>") + end + println(io, " <zipcode>", rand_zip(rng), "</zipcode>") + println(io, " </address>") + end + if rand(rng) < 0.5 + println(io, " <homepage>http://www.", lowercase(rand(rng, LAST_NAMES)), ".com/~", + lowercase(rand(rng, FIRST_NAMES)), "</homepage>") + end + if rand(rng) < 0.6 + println(io, " <creditcard>", rand_cc(rng), "</creditcard>") + end + if rand(rng) < 0.7 + income = rand(rng) < 0.8 ? string(" income=\"", rand(rng, 10000.0:0.01:250000.0), "\"") : "" + println(io, " <profile", income, ">") + for _ in 1:rand(rng, 0:4) + println(io, " <interest category=\"", string("category",rand(rng, 1:n_categories)), "\"/>") + end + if rand(rng) < 0.8 + println(io, " <education>", rand(rng, EDUCATIONS), "</education>") + end + if rand(rng) < 0.7 + println(io, " <gender>", rand(rng, GENDERS), "</gender>") + end + println(io, " <business>", rand_word(rng), "</business>") + if rand(rng) < 0.8 + println(io, " <age>", rand(rng, 18:85), "</age>") + end + println(io, " </profile>") + end + if n_open > 0 && rand(rng) < 0.3 + println(io, " <watches>") + for _ in 1:rand(rng, 1:5) + println(io, " <watch open_auction=\"", string("open_auction",rand(rng, 1:n_open)), "\"/>") + end + println(io, " </watches>") + end + println(io, " </person>") + end + println(io, " </people>") +end + +function write_open_auctions(rng, io, n, n_items, n_people) + println(io, " <open_auctions>") + for i in 1:n + println(io, " <open_auction id=\"", string("open_auction",i), "\">") + println(io, " <initial>", rand_price(rng), "</initial>") + if rand(rng) < 0.5 + println(io, " 
<reserve>", rand_price(rng), "</reserve>") + end + for _ in 1:rand(rng, 0:12) + println(io, " <bidder>") + println(io, " <date>", rand_date(rng), "</date>") + println(io, " <time>", rand_time(rng), "</time>") + println(io, " <personref person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + println(io, " <increase>", rand_price(rng), "</increase>") + println(io, " </bidder>") + end + println(io, " <current>", rand_price(rng), "</current>") + if rand(rng) < 0.3 + println(io, " <privacy>", rand(rng, ["Yes", "No"]), "</privacy>") + end + println(io, " <itemref item=\"", string("item",rand(rng, 1:n_items)), "\"/>") + println(io, " <seller person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + write_annotation(rng, io, " ", n_people) + println(io, " <quantity>", rand(rng, 1:10), "</quantity>") + println(io, " <type>", rand(rng, ["Regular", "Featured"]), "</type>") + println(io, " <interval>") + println(io, " <start>", rand_date(rng), "</start>") + println(io, " <end>", rand_date(rng), "</end>") + println(io, " </interval>") + println(io, " </open_auction>") + end + println(io, " </open_auctions>") +end + +function write_closed_auctions(rng, io, n, n_open, n_items, n_people) + println(io, " <closed_auctions>") + for i in 1:n + println(io, " <closed_auction>") + println(io, " <seller person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + println(io, " <buyer person=\"", string("person",rand(rng, 1:n_people)), "\"/>") + # Use item IDs that don't overlap with open auctions + item_id = n_open + i + item_id = item_id <= n_items ? 
item_id : rand(rng, 1:n_items) + println(io, " <itemref item=\"", string("item",item_id), "\"/>") + println(io, " <price>", rand_price(rng), "</price>") + println(io, " <date>", rand_date(rng), "</date>") + println(io, " <quantity>", rand(rng, 1:10), "</quantity>") + println(io, " <type>", rand(rng, ["Regular", "Featured"]), "</type>") + if rand(rng) < 0.7 + write_annotation(rng, io, " ", n_people) + end + println(io, " </closed_auction>") + end + println(io, " </closed_auctions>") +end + +#-----------------------------------------------------------------# Main entry points +""" + generate_xmark([io_or_filename], factor; seed=42) + +Generate an XMark-style auction XML document. `factor` scales all entity counts linearly. + +Approximate output sizes (may vary slightly): +- `factor=0.1` → ~1.4 MB +- `factor=1.0` → ~14 MB +- `factor=2.0` → ~27 MB +- `factor=5.0` → ~68 MB +""" +function generate_xmark(io::IO, factor::Real; seed::Int=42) + factor > 0 || throw(ArgumentError("factor must be positive, got $factor")) + rng = Xoshiro(seed) + + n_per_region = max(1, round(Int, 500 * factor)) + n_people = max(1, round(Int, 5000 * factor)) + n_categories = max(1, round(Int, 200 * factor)) + n_open = max(1, round(Int, 2000 * factor)) + n_closed = max(1, round(Int, 1500 * factor)) + n_edges = max(1, round(Int, 1000 * factor)) + n_items = n_per_region * 6 + + # Clamp auctions to available items + n_open = min(n_open, n_items) + n_closed = min(n_closed, max(1, n_items - n_open)) + + println(io, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>") + println(io, "<site>") + + # Regions with items + println(io, " <regions>") + item_id = 0 + for region in REGIONS + println(io, " <", region, ">") + for _ in 1:n_per_region + item_id += 1 + write_item(rng, io, item_id, n_categories) + end + println(io, " </", region, ">") + end + println(io, " </regions>") + + write_categories(rng, io, n_categories) + write_catgraph(rng, io, n_edges, n_categories) + write_people(rng, io, n_people, n_categories, 
n_open) + write_open_auctions(rng, io, n_open, n_items, n_people) + write_closed_auctions(rng, io, n_closed, n_open, n_items, n_people) + + println(io, "</site>") + nothing +end + +function generate_xmark(filename::AbstractString, factor::Real; seed::Int=42) + open(filename, "w") do io + generate_xmark(io, factor; seed) + end + filename +end + +function generate_xmark(factor::Real; seed::Int=42) + io = IOBuffer() + generate_xmark(io, factor; seed) + String(take!(io)) +end + +end # module diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl new file mode 100644 index 0000000..7bd2cb1 --- /dev/null +++ b/benchmarks/benchmarks.jl @@ -0,0 +1,527 @@ +using XML +using XML: Element, nodetype, tag, children +using EzXML: EzXML +using XMLDict: XMLDict +using LightXML: LightXML +using BenchmarkTools +using DataFrames +using InteractiveUtils + +include("XMarkGenerator.jl") +using .XMarkGenerator + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 +BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000 + +#-----------------------------------------------------------------------------# Test data +# Small file (~120 lines) +small_file = joinpath(@__DIR__, "..", "test", "data", "books.xml") +small_xml = read(small_file, String) + +# Medium file (generated XMark auction XML, ~14 MB) +medium_file = joinpath(@__DIR__, "data", "xmark.xml") +if !isfile(medium_file) + mkpath(dirname(medium_file)) + @info "Generating XMark benchmark XML..." + generate_xmark(medium_file, 1.0) +end +medium_xml = read(medium_file, String) + +df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[]) + +macro add_benchmark(kind, name, expr...) + esc(:(let + @info string($kind, " - ", $name) + bench = @benchmark $(expr...) 
+ push!(df, (; kind=$kind, name=$name, bench)) + end)) +end + +const SSNode = Node{SubString{String}} + +#-----------------------------------------------------------------------------# Parse (small) +@add_benchmark "Parse (small)" "XML.jl" parse($small_xml, Node) +@add_benchmark "Parse (small)" "XML.jl (SS)" parse($small_xml, SSNode) +@add_benchmark "Parse (small)" "EzXML" EzXML.parsexml($small_xml) +@add_benchmark "Parse (small)" "LightXML" LightXML.parse_string($small_xml) +@add_benchmark "Parse (small)" "XMLDict" XMLDict.xml_dict($small_xml) + +#-----------------------------------------------------------------------------# Parse (medium) +@add_benchmark "Parse (medium)" "XML.jl" parse($medium_xml, Node) +@add_benchmark "Parse (medium)" "XML.jl (SS)" parse($medium_xml, SSNode) +@add_benchmark "Parse (medium)" "EzXML" EzXML.parsexml($medium_xml) +@add_benchmark "Parse (medium)" "LightXML" LightXML.parse_string($medium_xml) +@add_benchmark "Parse (medium)" "XMLDict" XMLDict.xml_dict($medium_xml) + +#-----------------------------------------------------------------------------# Write (small) +@add_benchmark "Write (small)" "XML.jl" XML.write(o) setup=(o = parse(small_xml, Node)) +@add_benchmark "Write (small)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(small_xml)) +@add_benchmark "Write (small)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(small_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true)) + +#-----------------------------------------------------------------------------# Write (medium) +@add_benchmark "Write (medium)" "XML.jl" XML.write(o) setup=(o = parse(medium_xml, Node)) +@add_benchmark "Write (medium)" "EzXML" sprint(print, o) setup=(o = EzXML.parsexml(medium_xml)) +@add_benchmark "Write (medium)" "LightXML" LightXML.save_file(o, f) setup=(o = LightXML.parse_string(medium_xml); f = tempname()) teardown=(LightXML.free(o); rm(f, force=true)) + 
+#-----------------------------------------------------------------------------# Read from file +@add_benchmark "Read file" "XML.jl" read($medium_file, Node) +@add_benchmark "Read file" "EzXML" EzXML.readxml($medium_file) +@add_benchmark "Read file" "LightXML" LightXML.parse_file($medium_file) + +#-----------------------------------------------------------------------------# Collect element tags +function xml_collect_tags(node) + out = String[] + _xml_collect_tags!(out, node) + out +end +function _xml_collect_tags!(out, node) + for c in children(node) + if nodetype(c) === Element + push!(out, tag(c)) + _xml_collect_tags!(out, c) + end + end +end + +function ezxml_collect_tags(node::EzXML.Node) + out = String[] + _ezxml_collect_tags!(out, node) + out +end +function _ezxml_collect_tags!(out, node::EzXML.Node) + for child in EzXML.eachelement(node) + push!(out, child.name) + _ezxml_collect_tags!(out, child) + end +end + +function lightxml_collect_tags(root::LightXML.XMLElement) + out = String[] + _lightxml_collect_tags!(out, root) + out +end +function _lightxml_collect_tags!(out, el::LightXML.XMLElement) + for child in LightXML.child_elements(el) + push!(out, LightXML.name(child)) + _lightxml_collect_tags!(out, child) + end +end + +@add_benchmark "Collect tags (small)" "XML.jl" xml_collect_tags(o) setup=(o = parse(small_xml, Node)) +@add_benchmark "Collect tags (small)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(small_xml)) +@add_benchmark "Collect tags (small)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(small_xml)) teardown=(LightXML.free(o)) + +@add_benchmark "Collect tags (medium)" "XML.jl" xml_collect_tags(o) setup=(o = parse(medium_xml, Node)) +@add_benchmark "Collect tags (medium)" "EzXML" ezxml_collect_tags(o.root) setup=(o = EzXML.parsexml(medium_xml)) +@add_benchmark "Collect tags (medium)" "LightXML" lightxml_collect_tags(LightXML.root(o)) setup=(o = LightXML.parse_string(medium_xml)) 
teardown=(LightXML.free(o)) + +#-----------------------------------------------------------------------------# XLSX-pattern fixtures +# These fixtures mirror the shapes that XLSX.jl exercises: +# - `sst_xml` matches `xl/sharedStrings.xml` (lots of small `<si><t>…</t></si>` entries +# separated by whitespace — the layout that exposes the LazyNode write/normalize choice) +# - `ws_xml` matches `xl/sheetN.xml` (a `<sheetData>` with many `<row>`s of `<c r=… s=… t=…><v>…</v></c>`) + +@info "Generating XLSX-pattern fixtures..." + +sst_xml = let buf = IOBuffer() + print(buf, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") + print(buf, "<sst xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" count=\"50000\" uniqueCount=\"50000\">\n") + for i in 1:50000 + print(buf, " <si><t>shared string value number ", i, "</t></si>\n") + end + print(buf, "</sst>") + String(take!(buf)) +end + +ws_xml = let buf = IOBuffer() + print(buf, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") + print(buf, "<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">\n") + print(buf, "<sheetData>\n") + for r in 1:3000 + print(buf, " <row r=\"", r, "\">") + for c in 1:15 + col = Char(UInt32('A') + c - 1) + print(buf, "<c r=\"", col, r, "\" s=\"3\" t=\"n\"><v>", r * c, "</v></c>") + end + print(buf, "</row>\n") + end + print(buf, "</sheetData></worksheet>") + String(take!(buf)) +end + +# String-heavy worksheet: cells reference the shared string table (`t="s"`, `<v>` = SST +# index). This is the most common real-world shape and the one where the `has_entities` +# short-circuit and zero-copy accessors matter most for XLSX.jl `readtable`. 
+ws_str_xml = let buf = IOBuffer() + print(buf, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") + print(buf, "<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">\n") + print(buf, "<sheetData>\n") + for r in 1:5000 + print(buf, " <row r=\"", r, "\">") + for c in 1:8 + col = Char(UInt32('A') + c - 1) + print(buf, "<c r=\"", col, r, "\" s=\"2\" t=\"s\"><v>", (r * c) % 50000, "</v></c>") + end + print(buf, "</row>\n") + end + print(buf, "</sheetData></worksheet>") + String(take!(buf)) +end + +# Entity-heavy SST: every <t> needs decoding, exercising the `has_entities` slow path. +sst_entity_xml = let buf = IOBuffer() + print(buf, "<sst count=\"50000\" uniqueCount=\"50000\">") + for i in 1:50000 + print(buf, "<si><t>A & B <tag> #", i, "</t></si>") + end + print(buf, "</sst>") + String(take!(buf)) +end + +@info " sst_xml: $(round(length(sst_xml) / 1024 / 1024, digits=2)) MB ($(50000) <si>)" +@info " ws_xml: $(round(length(ws_xml) / 1024 / 1024, digits=2)) MB ($(3000) <row> × $(15) <c>)" +@info " ws_str_xml: $(round(length(ws_str_xml) / 1024 / 1024, digits=2)) MB ($(5000) <row> × $(8) string <c>)" +@info " sst_entity_xml: $(round(length(sst_entity_xml) / 1024 / 1024, digits=2)) MB (entity-heavy)" + +# Helper: walk a Node-based <si> subtree and concatenate its <t> text content. 
+function _node_unformatted(io::IO, el::Node{String}) + XML.tag(el) == "rPh" && return + if XML.tag(el) == "t" + if XML.is_simple(el) + write(io, XML.simple_value(el)) + else + v = XML.value(el) + isnothing(v) || write(io, v) + end + return + end + for c in XML.children(el) + _node_unformatted(io, c) + end +end +_node_unformatted(el::Node{String}) = sprint(_node_unformatted, el) + +#-----------------------------------------------------------------------------# Parse: XLSX shapes +@add_benchmark "Parse SST (LazyNode)" "XML.jl" parse($sst_xml, LazyNode) +@add_benchmark "Parse SST (LazyNode)" "Node (for ref)" parse($sst_xml, Node) +@add_benchmark "Parse worksheet (LazyNode)" "XML.jl" parse($ws_xml, LazyNode) +@add_benchmark "Parse worksheet (LazyNode)" "Node (for ref)" parse($ws_xml, Node) + +#-----------------------------------------------------------------------------# SST loading (XLSX.jl sst.jl pattern) +# Mirrors `sst_load!`: stream <si> children, capture raw XML + unformatted text per entry. 
+ +@add_benchmark "SST: write each <si>" "LazyNode + write (zero-copy)" begin + out = String[] + sst_el = doc[end] + for si in XML.eachchildnode(sst_el) + XML.nodetype(si) === XML.Element || continue + push!(out, XML.write(si)) + end + out +end setup=(doc = parse(sst_xml, LazyNode)) + +@add_benchmark "SST: write each <si>" "LazyNode + write (normalize)" begin + out = String[] + sst_el = doc[end] + for si in XML.eachchildnode(sst_el) + XML.nodetype(si) === XML.Element || continue + push!(out, XML.write(si; normalize=true)) + end + out +end setup=(doc = parse(sst_xml, LazyNode)) + +@add_benchmark "SST: write each <si>" "Node (for ref)" begin + out = String[] + sst_el = doc[end] + for si in XML.children(sst_el) + XML.tag(si) == "si" || continue + push!(out, XML.write(si)) + end + out +end setup=(doc = parse(sst_xml, Node)) + +@add_benchmark "SST: unformatted text" "LazyNode + is_simple_value" begin + out = Vector{Union{Nothing,SubString{String},String}}() + sst_el = doc[end] + for si in XML.eachchildnode(sst_el) + XML.nodetype(si) === XML.Element || continue + for t in XML.eachchildnode(si) + XML.nodetype(t) === XML.Element || continue + XML.tag(t) == "t" || continue + push!(out, XML.is_simple_value(t)) + end + end + out +end setup=(doc = parse(sst_xml, LazyNode)) + +@add_benchmark "SST: unformatted text" "Node (for ref)" begin + out = String[] + sst_el = doc[end] + for si in XML.children(sst_el) + XML.tag(si) == "si" || continue + push!(out, _node_unformatted(si)) + end + out +end setup=(doc = parse(sst_xml, Node)) + +#-----------------------------------------------------------------------------# Worksheet: nested row/cell loops (XLSX.jl cell.jl pattern) +# Mirrors `Cell(c::LazyNode, ws)` and `get_rowcells!`: iterate <row>, then <c>, then attrs + <v>. 
+ +@add_benchmark "Worksheet: collect rows" "children() (fresh Vector each call)" begin + sd = doc[end][1] # <sheetData> + XML.children(sd) +end setup=(doc = parse(ws_xml, LazyNode)) + +@add_benchmark "Worksheet: collect rows" "children!(buf, n) (reused buffer)" begin + sd = doc[end][1] + XML.children!(buf, sd) +end setup=(doc = parse(ws_xml, LazyNode); buf = XML.LazyNode{String}[]) + +@add_benchmark "Worksheet: attribute scan" "eachattribute" begin + n = 0 + sd = doc[end][1] + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + for (k, v) in XML.eachattribute(c) + n += sizeof(v) + end + end + end + n +end setup=(doc = parse(ws_xml, LazyNode)) + +@add_benchmark "Worksheet: attribute scan" "attributes() (materialize dict)" begin + n = 0 + sd = doc[end][1] + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + a = XML.attributes(c) + isnothing(a) && continue + for (_, v) in a + n += sizeof(v) + end + end + end + n +end setup=(doc = parse(ws_xml, LazyNode)) + +@add_benchmark "Worksheet: single attr fetch" "get(c, \"r\", \"\")" begin + n = 0 + sd = doc[end][1] + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + n += sizeof(get(c, "r", "")) + end + end + n +end setup=(doc = parse(ws_xml, LazyNode)) + +@add_benchmark "Worksheet: single attr fetch" "attributes(c)[\"r\"]" begin + n = 0 + sd = doc[end][1] + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + a = XML.attributes(c) + isnothing(a) && continue + n += sizeof(a["r"]) + end + end + n +end setup=(doc = parse(ws_xml, LazyNode)) + +@add_benchmark "Worksheet: <v> value" 
"is_simple_value" begin + n = 0 + sd = doc[end][1] + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + for v in XML.eachchildnode(c) + XML.nodetype(v) === XML.Element || continue + val = XML.is_simple_value(v) + isnothing(val) || (n += sizeof(val)) + end + end + end + n +end setup=(doc = parse(ws_xml, LazyNode)) + +@add_benchmark "Worksheet: <v> value" "is_simple + simple_value" begin + n = 0 + sd = doc[end][1] + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + for v in XML.eachchildnode(c) + XML.nodetype(v) === XML.Element || continue + if XML.is_simple(v) + n += sizeof(XML.simple_value(v)) + end + end + end + end + n +end setup=(doc = parse(ws_xml, LazyNode)) + +#-----------------------------------------------------------------------------# End-to-end XLSX.jl hot loops +# The micro-benchmarks above isolate single operations. These mirror the *combined* work +# XLSX.jl actually does per entry, so a regression in any sub-operation (parse, accessor, +# entity short-circuit, iterator allocation) shows up where it matters for spreadsheet read +# performance. + +# Mirrors XLSX.jl `sst.jl` `unformatted_text` / `gather_strings!`: recursively walk an +# <si> subtree concatenating <t> text content. +function _xlsx_unformatted(io::IO, e::XML.LazyNode) + t = XML.tag(e) + t == "rPh" && return nothing + if t == "t" + v = XML.is_simple_value(e) + isnothing(v) || write(io, v) + else + for ch in XML.eachchildnode(e) + XML.nodetype(ch) === XML.Element && _xlsx_unformatted(io, ch) + end + end + nothing +end + +# Mirrors XLSX.jl `sst.jl` `sst_load!`: stream <si>, capture raw XML + unformatted text. +@add_benchmark "XLSX sst_load! 
(end-to-end)" "LazyNode" begin + sst_el = doc[end] + shared = String[] + unformatted = String[] + for si in XML.eachchildnode(sst_el) + XML.nodetype(si) === XML.Element || continue + XML.tag(si) == "si" || continue + push!(shared, XML.write(si)) + io = IOBuffer() + _xlsx_unformatted(io, si) + push!(unformatted, String(take!(io))) + end + (length(shared), length(unformatted)) +end setup=(doc = parse(sst_xml, LazyNode)) + +# Mirrors XLSX.jl `cell.jl` `Cell(c, ws)` + `get_rowcells!`: per cell, read the r/s/t +# attributes and the <v> value, exactly as the reader does. Numeric worksheet. +@add_benchmark "XLSX cell read (end-to-end)" "numeric ws" begin + sd = doc[end][1] + ncells = 0 + acc = 0 + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + ref = get(c, "r", "") + t = get(c, "t", "") + s = get(c, "s", "") + acc += sizeof(ref) + sizeof(t) + sizeof(s) + for child in XML.eachchildnode(c) + XML.nodetype(child) === XML.Element || continue + if XML.tag(child) == "v" + v = XML.is_simple_value(child) + isnothing(v) || (acc += sizeof(v)) + end + end + ncells += 1 + end + end + (ncells, acc) +end setup=(doc = parse(ws_xml, LazyNode)) + +# Same loop on the string-heavy worksheet (t="s", SST-indexed) — the common real shape +# and the one most sensitive to the entity short-circuit / zero-copy accessors. 
+@add_benchmark "XLSX cell read (end-to-end)" "string ws" begin + sd = doc[end][1] + ncells = 0 + acc = 0 + for row in XML.eachchildnode(sd) + XML.nodetype(row) === XML.Element || continue + for c in XML.eachchildnode(row) + XML.nodetype(c) === XML.Element || continue + ref = get(c, "r", "") + t = get(c, "t", "") + s = get(c, "s", "") + acc += sizeof(ref) + sizeof(t) + sizeof(s) + for child in XML.eachchildnode(c) + XML.nodetype(child) === XML.Element || continue + if XML.tag(child) == "v" + v = XML.is_simple_value(child) + isnothing(v) || (acc += sizeof(v)) + end + end + ncells += 1 + end + end + (ncells, acc) +end setup=(doc = parse(ws_str_xml, LazyNode)) + +# Realistic-string SST: entries containing characters that DO need entity decoding, so the +# `has_entities` slow path is exercised (catches regressions in the decode branch). +@add_benchmark "XLSX sst_load! (end-to-end)" "LazyNode (entity-heavy)" begin + sst_el = doc[end] + n = 0 + for si in XML.eachchildnode(sst_el) + XML.nodetype(si) === XML.Element || continue + XML.tag(si) == "si" || continue + for t in XML.eachchildnode(si) + XML.nodetype(t) === XML.Element || continue + v = XML.is_simple_value(t) + isnothing(v) || (n += sizeof(v)) + end + end + n +end setup=(doc = parse(sst_entity_xml, LazyNode)) + +#-----------------------------------------------------------------------------# Write benchmarks_results.md +_fmt_ms(t) = string(round(t, sigdigits=3), " ms") + +function _compare_indicator(xml_ms, other_ms) + ratio = xml_ms / other_ms + pct = abs(round((ratio - 1) * 100, digits=1)) + ratio > 1.05 ? "(XML.jl $(pct)% slower)" : ratio < 0.95 ? 
"(XML.jl $(pct)% faster)" : "(~same)" +end + +outfile = joinpath(@__DIR__, "benchmarks_results.md") +open(outfile, "w") do io + println(io, "# XML.jl Benchmarks\n") + println(io, "```") + for kind in unique(df.kind) + g = groupby(df, :kind) + haskey(g, (;kind)) || continue + sub = g[(;kind)] + println(io, kind) + # Find XML.jl baseline (first row starting with "XML.jl") + xml_row = findfirst(r -> startswith(r.name, "XML.jl") && !contains(r.name, "(SS)"), eachrow(sub)) + xml_ms = isnothing(xml_row) ? nothing : median(sub[xml_row, :bench]).time / 1e6 + for row in eachrow(sub) + ms = median(row.bench).time / 1e6 + indicator = "" + if !isnothing(xml_ms) && !startswith(row.name, "XML.jl") + indicator = " " * _compare_indicator(xml_ms, ms) + end + println(io, "\t", rpad(row.name, 16), lpad(_fmt_ms(ms), 12), indicator) + end + println(io) + end + println(io, "```") + + println(io, "\n```julia") + println(io, "versioninfo()") + buf = IOBuffer() + InteractiveUtils.versioninfo(buf) + for line in eachline(IOBuffer(take!(buf))) + println(io, "# ", line) + end + println(io, "```") +end + +println("Results written to $outfile") diff --git a/benchmarks/benchmarks_results.md b/benchmarks/benchmarks_results.md new file mode 100644 index 0000000..60c6ae0 --- /dev/null +++ b/benchmarks/benchmarks_results.md @@ -0,0 +1,101 @@ +# XML.jl Benchmarks + +``` +Parse (small) + XML.jl 0.0378 ms + XML.jl (SS) 0.0349 ms + EzXML 0.0224 ms (XML.jl 68.8% slower) + LightXML 0.022 ms (XML.jl 72.3% slower) + XMLDict 0.209 ms (XML.jl 81.9% faster) + +Parse (medium) + XML.jl 201.0 ms + XML.jl (SS) 190.0 ms + EzXML 80.3 ms (XML.jl 150.7% slower) + LightXML 114.0 ms (XML.jl 76.1% slower) + XMLDict 608.0 ms (XML.jl 66.9% faster) + +Write (small) + XML.jl 0.00957 ms + EzXML 0.0108 ms (XML.jl 11.7% faster) + LightXML 0.105 ms (XML.jl 90.9% faster) + +Write (medium) + XML.jl 48.3 ms + EzXML 36.9 ms (XML.jl 30.9% slower) + LightXML 56.2 ms (XML.jl 14.1% faster) + +Read file + XML.jl 191.0 ms + EzXML 115.0 ms 
(XML.jl 67.2% slower) + LightXML 97.4 ms (XML.jl 96.6% slower) + +Collect tags (small) + XML.jl 0.000602 ms + EzXML 0.0021 ms (XML.jl 71.4% faster) + LightXML 0.00381 ms (XML.jl 84.2% faster) + +Collect tags (medium) + XML.jl 12.7 ms + EzXML 16.3 ms (XML.jl 21.8% faster) + LightXML 23.5 ms (XML.jl 45.9% faster) + +Parse SST (LazyNode) + XML.jl 5.29e-6 ms + Node (for ref) 45.8 ms (XML.jl 100.0% faster) + +Parse worksheet (LazyNode) + XML.jl 5.21e-6 ms + Node (for ref) 69.6 ms (XML.jl 100.0% faster) + +SST: write each <si> + LazyNode + write (zero-copy) 93.0 ms + LazyNode + write (normalize) 157.0 ms + Node (for ref) 9.83 ms + +SST: unformatted text + LazyNode + is_simple_value 102.0 ms + Node (for ref) 5.31 ms + +Worksheet: collect rows + children() (fresh Vector each call) 87.9 ms + children!(buf, n) (reused buffer) 87.9 ms + +Worksheet: attribute scan + eachattribute 87.8 ms + attributes() (materialize dict) 87.2 ms + +Worksheet: single attr fetch + get(c, "r", "") 87.6 ms + attributes(c)["r"] 88.0 ms + +Worksheet: <v> value + is_simple_value 87.1 ms + is_simple + simple_value 87.8 ms + +XLSX sst_load! (end-to-end) + LazyNode 149.0 ms + LazyNode (entity-heavy) 113.0 ms + +XLSX cell read (end-to-end) + numeric ws 87.9 ms + string ws 80.2 ms + +``` + +```julia +versioninfo() +# Julia Version 1.12.6 +# Commit 15346901f00 (2026-04-09 19:20 UTC) +# Build Info: +# Official https://julialang.org release +# Platform Info: +# OS: macOS (arm64-apple-darwin24.0.0) +# CPU: 10 × Apple M1 Pro +# WORD_SIZE: 64 +# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1) +# GC: Built with stock GC +# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores) +# Environment: +# JULIA_NUM_THREADS = auto +``` diff --git a/benchmarks/compare.jl b/benchmarks/compare.jl new file mode 100644 index 0000000..4bdc22a --- /dev/null +++ b/benchmarks/compare.jl @@ -0,0 +1,290 @@ +#= Compare current dev XML.jl against the last released version. 
+ +Usage: + julia benchmarks/compare.jl [tag] + +`tag` defaults to the latest git tag (e.g. v0.3.8). + +This script: +1. Runs benchmarks using the current (dev) code +2. Checks out the release tag into a temp worktree +3. Runs the same benchmarks against that version +4. Prints a side-by-side comparison +=# + +using BenchmarkTools, Serialization, InteractiveUtils + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5 +BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000 + +const ROOT = dirname(@__DIR__) + +const RELEASE_TAG = if length(ARGS) >= 1 + ARGS[1] +else + tags = readlines(`git -C $ROOT tag --sort=version:refname`) + filter!(t -> startswith(t, "v"), tags) + last(tags) +end + +const SMALL_FILE = joinpath(ROOT, "test", "data", "books.xml") +const SMALL_XML = read(SMALL_FILE, String) + +# Generate medium file if needed +include(joinpath(ROOT, "benchmarks", "XMarkGenerator.jl")) +using .XMarkGenerator +const MEDIUM_FILE = joinpath(ROOT, "benchmarks", "data", "xmark.xml") +if !isfile(MEDIUM_FILE) + mkpath(dirname(MEDIUM_FILE)) + @info "Generating XMark benchmark XML..." 
+ generate_xmark(MEDIUM_FILE, 1.0) +end +const MEDIUM_XML = read(MEDIUM_FILE, String) + +#-----------------------------------------------------------------------------# Helpers +function _collect_tags!(out, node) + for c in XML.children(node) + if XML.nodetype(c) === XML.Element + push!(out, XML.tag(c)) + _collect_tags!(out, c) + end + end +end + +function bench_collect_tags(node) + out = String[] + _collect_tags!(out, node) + out +end + +#-----------------------------------------------------------------------------# Run dev benchmarks +println("="^60) +println(" XML.jl Benchmark Comparison") +println(" Current (dev) vs $RELEASE_TAG") +println("="^60) +println() + +print("Running dev benchmarks...") +flush(stdout) + +using XML + +dev_results = Dict{String, BenchmarkTools.Trial}() + +const SSNode = Node{SubString{String}} + +dev_small = parse(SMALL_XML, Node) +dev_small_ss = parse(SMALL_XML, SSNode) +dev_medium = parse(MEDIUM_XML, Node) +dev_medium_ss = parse(MEDIUM_XML, SSNode) + +dev_results["Parse (small), String"] = @benchmark parse($SMALL_XML, Node) +dev_results["Parse (small), SubString"] = @benchmark parse($SMALL_XML, SSNode) +dev_results["Parse (medium), String"] = @benchmark parse($MEDIUM_XML, Node) +dev_results["Parse (medium), SubString"] = @benchmark parse($MEDIUM_XML, SSNode) +dev_results["Write (small)"] = @benchmark XML.write($dev_small) +dev_results["Write (medium)"] = @benchmark XML.write($dev_medium) +dev_results["Read file (medium), String"] = @benchmark read($MEDIUM_FILE, Node) +dev_results["Read file (medium), SubString"] = @benchmark parse(read($MEDIUM_FILE, String), SSNode) +dev_results["Collect tags (small), String"] = @benchmark bench_collect_tags($dev_small) +dev_results["Collect tags (small), SubString"] = @benchmark bench_collect_tags($dev_small_ss) +dev_results["Collect tags (medium), String"] = @benchmark bench_collect_tags($dev_medium) +dev_results["Collect tags (medium), SubString"] = @benchmark bench_collect_tags($dev_medium_ss) + +# 
LazyNode benchmarks +dev_lazy_small = parse(SMALL_XML, LazyNode) +dev_lazy_medium = parse(MEDIUM_XML, LazyNode) + +dev_results["Parse (small), LazyNode"] = @benchmark parse($SMALL_XML, LazyNode) +dev_results["Parse (medium), LazyNode"] = @benchmark parse($MEDIUM_XML, LazyNode) +dev_results["Write (small), LazyNode"] = @benchmark XML.write($(dev_lazy_small[1])) +dev_results["Write (medium), LazyNode"] = @benchmark XML.write($(dev_lazy_medium[1])) +dev_results["sourcetext, small"] = @benchmark sourcetext($(dev_lazy_small[1])) +dev_results["sourcetext, medium"] = @benchmark sourcetext($(dev_lazy_medium[1])) +dev_lazy_medium_root = let ch = children(dev_lazy_medium) + i = findfirst(c -> nodetype(c) === Element, ch) + ch[i] +end +dev_results["children vs eachchildnode, children"] = @benchmark children($dev_lazy_medium_root) +dev_results["children vs eachchildnode, eachchildnode"] = @benchmark collect(eachchildnode($dev_lazy_medium_root)) + +# SST-like benchmark: many children, write each one +const SST_N = 10_000 +const SST_XML = "<sst>" * join("""<si><t>string_$i</t></si>""" for i in 1:SST_N) * "</sst>" +dev_sst_node = parse(SST_XML, Node) +dev_sst_lazy = parse(SST_XML, LazyNode) +dev_sst_root_node = only(children(dev_sst_node)) +dev_sst_root_lazy = only(children(dev_sst_lazy)) + +function bench_sst_node(xml) + root = only(children(parse(xml, Node))) + out = String[] + for c in XML.children(root) + XML.nodetype(c) === XML.Element && push!(out, XML.write(c)) + end + out +end +function bench_sst_lazy_children(xml) + root = only(children(parse(xml, LazyNode))) + out = String[] + for c in XML.children(root) + XML.nodetype(c) === XML.Element && push!(out, XML.write(c)) + end + out +end +function bench_sst_lazy_eachchildnode(xml) + root = only(children(parse(xml, LazyNode))) + out = String[] + for c in XML.eachchildnode(root) + XML.nodetype(c) === XML.Element && push!(out, XML.write(c)) + end + out +end + +dev_results["SST (parse+iterate+write), Node"] = @benchmark 
bench_sst_node($SST_XML) +dev_results["SST (parse+iterate+write), LazyNode+children"] = @benchmark bench_sst_lazy_children($SST_XML) +dev_results["SST (parse+iterate+write), LazyNode+eachchildnode"] = @benchmark bench_sst_lazy_eachchildnode($SST_XML) + +println(" done") + +#-----------------------------------------------------------------------------# Run release benchmarks via temp worktree + separate process +print("Setting up $RELEASE_TAG worktree...") +flush(stdout) + +worktree_dir = mktempdir() +run(pipeline(`git -C $ROOT worktree add $worktree_dir $RELEASE_TAG`, stdout=devnull, stderr=devnull)) +println(" done") + +release_results_file = joinpath(worktree_dir, "_results.jls") + +release_script = joinpath(worktree_dir, "_bench.jl") +write(release_script, """ +using Pkg +Pkg.activate(; temp=true) +Pkg.develop(path=$(repr(worktree_dir))) +Pkg.add("BenchmarkTools") +Pkg.add("Serialization") + +using BenchmarkTools, Serialization, XML + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5 +BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000 + +small_xml = read($(repr(SMALL_FILE)), String) +medium_xml = read($(repr(MEDIUM_FILE)), String) +results = Dict{String, BenchmarkTools.Trial}() + +results["Parse (small)"] = @benchmark parse(\$small_xml, Node) + +try + SSNode = Node{SubString{String}} + results["Parse (small, SS)"] = @benchmark parse(\$small_xml, SSNode) + results["Parse (medium, SS)"] = @benchmark parse(\$medium_xml, SSNode) +catch +end + +results["Parse (medium)"] = @benchmark parse(\$medium_xml, Node) + +small_node = parse(small_xml, Node) +medium_node = parse(medium_xml, Node) +results["Write (small)"] = @benchmark XML.write(\$small_node) +results["Write (medium)"] = @benchmark XML.write(\$medium_node) +results["Read file (medium)"] = @benchmark read($(repr(MEDIUM_FILE)), Node) + +function _collect_tags!(out, node) + for c in XML.children(node) + if XML.nodetype(c) === XML.Element + push!(out, XML.tag(c)) + _collect_tags!(out, c) + end + end +end +function 
bench_collect_tags(node) + out = String[] + _collect_tags!(out, node) + out +end +results["Collect tags (small)"] = @benchmark bench_collect_tags(\$small_node) +results["Collect tags (medium)"] = @benchmark bench_collect_tags(\$medium_node) + +try + lazy_small = parse(small_xml, LazyNode) + lazy_medium = parse(medium_xml, LazyNode) + results["Parse (small), LazyNode"] = @benchmark parse(\$small_xml, LazyNode) + results["Parse (medium), LazyNode"] = @benchmark parse(\$medium_xml, LazyNode) +catch +end + +serialize($(repr(release_results_file)), results) +""") + +print("Running $RELEASE_TAG benchmarks...") +flush(stdout) +run(pipeline(`julia $release_script`, stdout=devnull, stderr=devnull)) +release_results = deserialize(release_results_file) +println(" done") + +# Cleanup worktree +run(pipeline(`git -C $ROOT worktree remove --force $worktree_dir`, stdout=devnull, stderr=devnull)) + +#-----------------------------------------------------------------------------# Write compare_results.md +_fmt_ms(t) = string(round(t, sigdigits=3), " ms") + +function _compare_indicator(dev_ms, rel_ms) + change = (dev_ms / rel_ms - 1) * 100 + pct = abs(round(change, digits=1)) + change < -5 ? "($(pct)% faster)" : change > 5 ? 
"($(pct)% slower)" : "(~same)" +end + +groups = [ + ("Parse (small)", "Parse (small)", ["Parse (small), String", "Parse (small), SubString", "Parse (small), LazyNode"]), + ("Parse (medium)", "Parse (medium)", ["Parse (medium), String", "Parse (medium), SubString", "Parse (medium), LazyNode"]), + ("Write (small)", "Write (small)", ["Write (small)", "Write (small), LazyNode"]), + ("Write (medium)", "Write (medium)", ["Write (medium)", "Write (medium), LazyNode"]), + ("Read file (medium)", "Read file (medium)", ["Read file (medium), String", "Read file (medium), SubString"]), + ("Collect tags (small)", "Collect tags (small)", ["Collect tags (small), String", "Collect tags (small), SubString"]), + ("Collect tags (medium)","Collect tags (medium)", ["Collect tags (medium), String", "Collect tags (medium), SubString"]), + ("sourcetext", nothing, ["sourcetext, small", "sourcetext, medium"]), + ("children vs eachchildnode (medium)", nothing, ["children vs eachchildnode, children", "children vs eachchildnode, eachchildnode"]), + ("SST-like: parse+iterate+write (10k)", nothing, ["SST (parse+iterate+write), Node", "SST (parse+iterate+write), LazyNode+children", "SST (parse+iterate+write), LazyNode+eachchildnode"]), +] + +outfile = joinpath(@__DIR__, "compare_results.md") +open(outfile, "w") do io + println(io, "# XML.jl Benchmark Comparison: dev vs $RELEASE_TAG\n") + println(io, "```") + for (title, rel_key, dev_keys) in groups + rel_ms = (!isnothing(rel_key) && haskey(release_results, rel_key)) ? median(release_results[rel_key]).time / 1e6 : nothing + any(k -> haskey(dev_results, k), dev_keys) || (isnothing(rel_ms) && continue) + + println(io, title) + if !isnothing(rel_ms) + println(io, "\t", rpad(RELEASE_TAG, 16), lpad(_fmt_ms(rel_ms), 12)) + end + for dk in dev_keys + haskey(dev_results, dk) || continue + dev_ms = median(dev_results[dk]).time / 1e6 + label = occursin(", ", dk) ? 
split(dk, ", "; limit=2)[2] : "dev" + ms_str = lpad(_fmt_ms(dev_ms), 12) + padlen = max(16, length(label) + 2) + if isnothing(rel_ms) + println(io, "\t", rpad(label, padlen), ms_str) + else + println(io, "\t", rpad(label, padlen), ms_str, " ", _compare_indicator(dev_ms, rel_ms)) + end + end + println(io) + end + println(io, "```") + + println(io, "\n```julia") + println(io, "versioninfo()") + buf = IOBuffer() + InteractiveUtils.versioninfo(buf) + for line in eachline(IOBuffer(take!(buf))) + println(io, "# ", line) + end + println(io, "```") +end + +println("Results written to $outfile") diff --git a/benchmarks/compare_results.md b/benchmarks/compare_results.md new file mode 100644 index 0000000..dffbcae --- /dev/null +++ b/benchmarks/compare_results.md @@ -0,0 +1,71 @@ +# XML.jl Benchmark Comparison: dev vs v0.3.8 + +``` +Parse (small) + v0.3.8 0.139 ms + String 0.0409 ms (70.6% faster) + SubString 0.033 ms (76.3% faster) + LazyNode 6.33e-6 ms (100.0% faster) + +Parse (medium) + v0.3.8 829.0 ms + String 200.0 ms (75.8% faster) + SubString 163.0 ms (80.4% faster) + LazyNode 6.33e-6 ms (100.0% faster) + +Write (small) + v0.3.8 0.032 ms + dev 0.0215 ms (32.6% faster) + LazyNode 0.000217 ms (99.3% faster) + +Write (medium) + v0.3.8 156.0 ms + dev 99.2 ms (36.3% faster) + LazyNode 0.000273 ms (100.0% faster) + +Read file (medium) + v0.3.8 755.0 ms + String 193.0 ms (74.4% faster) + SubString 179.0 ms (76.3% faster) + +Collect tags (small) + v0.3.8 0.00064 ms + String 0.000714 ms (11.7% slower) + SubString 0.00211 ms (230.3% slower) + +Collect tags (medium) + v0.3.8 21.6 ms + String 13.3 ms (38.7% faster) + SubString 20.3 ms (6.2% faster) + +sourcetext + small 0.000191 ms + medium 0.000248 ms + +children vs eachchildnode (medium) + children 76.8 ms + eachchildnode 80.4 ms + +SST-like: parse+iterate+write (10k) + Node 9.01 ms + LazyNode+children 9.78 ms + LazyNode+eachchildnode 10.4 ms + +``` + +```julia +versioninfo() +# Julia Version 1.12.6 +# Commit 15346901f00 
(2026-04-09 19:20 UTC) +# Build Info: +# Official https://julialang.org release +# Platform Info: +# OS: macOS (arm64-apple-darwin24.0.0) +# CPU: 10 × Apple M1 Pro +# WORD_SIZE: 64 +# LLVM: libLLVM-18.1.7 (ORCJIT, apple-m1) +# GC: Built with stock GC +# Threads: 8 default, 1 interactive, 8 GC (on 8 virtual cores) +# Environment: +# JULIA_NUM_THREADS = auto +``` diff --git a/benchmarks/dict_benchmarks.jl b/benchmarks/dict_benchmarks.jl new file mode 100644 index 0000000..7dd90a3 --- /dev/null +++ b/benchmarks/dict_benchmarks.jl @@ -0,0 +1,71 @@ +using XML +using BenchmarkTools + +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5 + +#-----------------------------------------------------------------------------# Setup +sizes = [2, 5, 10, 20] + +function make_xml(n::Int) + attrs = join((" attr$i=\"value$i\"" for i in 1:n)) + "<root$attrs/>" +end + +function make_pairs(n::Int) + Pair{String,String}["attr$i" => "value$i" for i in 1:n] +end + +pt(t) = BenchmarkTools.prettytime(t) + +function printrow(n, op, t_dict, t_attr) + pct = round(100 * (t_dict - t_attr) / t_dict, digits=1) + label = pct > 0 ? 
"$(pct)% faster" : "$(-pct)% slower" + println(rpad("$n attrs", 10), " | ", rpad(op, 22), " | ", + rpad("Dict $(pt(t_dict))", 22), " | ", + rpad("Attributes $(pt(t_attr))", 26), " | ", label) +end + +#-----------------------------------------------------------------------------# Benchmarks +println("=" ^ 110) +println(" Attributes vs Dict Benchmarks") +println("=" ^ 110) +println(rpad("Size", 10), " | ", rpad("Operation", 22), " | ", + rpad("Dict", 22), " | ", rpad("Attributes", 26), " | Change") +println("-" ^ 110) + +for n in sizes + pairs = make_pairs(n) + d = Dict(pairs) + a = XML.Attributes(pairs) + key_mid = "attr$(n ÷ 2 + 1)" + key_last = "attr$n" + + tests = [ + ("construct", () -> @benchmark(Dict($pairs)), () -> @benchmark(XML.Attributes($pairs))), + ("getindex [mid]", () -> @benchmark($d[$key_mid]), () -> @benchmark($a[$key_mid])), + ("getindex [last]", () -> @benchmark($d[$key_last]), () -> @benchmark($a[$key_last])), + ("get [miss]", () -> @benchmark(get($d, "nope", nothing)), () -> @benchmark(get($a, "nope", nothing))), + ("haskey [hit]", () -> @benchmark(haskey($d, $key_mid)), () -> @benchmark(haskey($a, $key_mid))), + ("keys", () -> @benchmark(collect(keys($d))), () -> @benchmark(keys($a))), + ("iterate", () -> @benchmark(sum(length(v) for (_,v) in $d)), () -> @benchmark(sum(length(v) for (_,v) in $a))), + ] + + for (op, bench_dict, bench_attr) in tests + t_dict = median(bench_dict()).time + t_attr = median(bench_attr()).time + printrow(n, op, t_dict, t_attr) + end + println("-" ^ 110) +end + +#-----------------------------------------------------------------------------# End-to-end: attributes() call on parsed Node +println() +println(rpad("Size", 10), " | ", rpad("Operation", 22), " | Time") +println("-" ^ 50) +for n in sizes + doc = parse(make_xml(n), Node) + el = doc[1] + t = median(@benchmark(attributes($el))).time + println(rpad("$n attrs", 10), " | ", rpad("attributes(node)", 22), " | ", pt(t)) +end +println() diff --git a/benchmarks/suite.jl 
b/benchmarks/suite.jl deleted file mode 100644 index e06dc61..0000000 --- a/benchmarks/suite.jl +++ /dev/null @@ -1,74 +0,0 @@ -using Pkg -Pkg.activate(@__DIR__) - -using XML -using EzXML: EzXML -using XMLDict: XMLDict -using BenchmarkTools -using DataFrames -using UnicodePlots -using OrderedCollections: OrderedDict - - -BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 -BenchmarkTools.DEFAULT_PARAMETERS.samples = 20000 - - -# nasa.xml was downloaded from: -# http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/www/repository.html#nasa -file = joinpath(@__DIR__, "nasa.xml") - -df = DataFrame(kind=String[], name=String[], bench=BenchmarkTools.Trial[]) - -macro add_benchmark(kind, name, expr...) - esc(:(let - @info string($kind, " - ", $name) - bench = @benchmark $(expr...) - push!(df, (; kind=$kind, name=$name, bench)) - end)) -end - -#-----------------------------------------------------------------------------# Write -@add_benchmark "Write" "XML.write" XML.write($(tempname()), o) setup = (o = read(file, Node)) -@add_benchmark "Write" "EzXML.writexml" EzXML.write($(tempname()), o) setup = (o = EzXML.readxml(file)) - -#-----------------------------------------------------------------------------# Read -@add_benchmark "Read" "XML.LazyNode" read($file, LazyNode) -@add_benchmark "Read" "XML.Node" read($file, Node) -@add_benchmark "Read" "EzXML.readxml" EzXML.readxml($file) -@add_benchmark "Read" "XMLDict.xml_dict" XMLDict.xml_dict(read($file, String)) - -#-----------------------------------------------------------------------------# Lazy Iteration -@add_benchmark "Lazy Iteration" "LazyNode" for x in read($file, LazyNode); end -@add_benchmark "Lazy Iteration" "EzXML.StreamReader" (reader = open(EzXML.StreamReader, $file); for x in reader; end; close(reader)) - -#-----------------------------------------------------------------------------# Lazy Iteration: Collect Tags -@add_benchmark "Collect Tags" "LazyNode" [tag(x) for x in o] setup = (o = read(file, LazyNode)) 
-@add_benchmark "Collect Tags" "EzXML.StreamReader" [r.name for x in r if x == EzXML.READER_ELEMENT] setup=(r=open(EzXML.StreamReader, file)) teardown=(close(r)) - -function get_tags(o::EzXML.Node) - out = String[] - for node in EzXML.eachelement(o) - push!(out, node.name) - for tag in get_tags(node) - push!(out, tag) - end - end - out -end -@add_benchmark "Collect Tags" "EzXML.readxml" get_tags(o.root) setup=(o = EzXML.readxml(file)) - - -#-----------------------------------------------------------------------------# Plots -function plot(df, kind) - g = groupby(df, :kind) - sub = g[(;kind)] - x = map(row -> "$(row.name)", eachrow(sub)) - y = map(x -> median(x).time / 1000^2, sub.bench) - display(barplot(x, y, title = "$kind Time (ms)", border=:none, width=50)) -end - -plot(df, "Read") -plot(df, "Write") -plot(df, "Lazy Iteration") -plot(df, "Collect Tags") diff --git a/ext/XMLAbstractTreesExt.jl b/ext/XMLAbstractTreesExt.jl new file mode 100644 index 0000000..60add31 --- /dev/null +++ b/ext/XMLAbstractTreesExt.jl @@ -0,0 +1,71 @@ +module XMLAbstractTreesExt + +using XML: XML, Node, LazyNode, NodeType, Element, Text, CData, Comment, + Declaration, DTD, Document, ProcessingInstruction, + nodetype, tag, value, attributes +import AbstractTrees + +#-----------------------------------------------------------------------------# children +AbstractTrees.children(n::Node) = XML.children(n) +AbstractTrees.children(n::LazyNode) = XML.children(n) + +#-----------------------------------------------------------------------------# nodevalue +AbstractTrees.nodevalue(n::Node) = n +AbstractTrees.nodevalue(n::LazyNode) = n + +#-----------------------------------------------------------------------------# printnode +# Single-line label for `print_tree`; mirrors the REPL `show` for each NodeType but +# without trailing child-count annotations (AbstractTrees draws the structure). 
+_printnode(io::IO, n::Union{Node, LazyNode}) = _printnode(io, n, nodetype(n)) + +function _printnode(io::IO, n, ::Val{Element}) + print(io, '<', tag(n)) + attrs = attributes(n) + if !isnothing(attrs) + for (k, v) in attrs + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, '>') +end + +_printnode(io::IO, n, ::Val{Text}) = show(io, value(n)) +_printnode(io::IO, n, ::Val{Comment}) = print(io, "<!--", value(n), "-->") +_printnode(io::IO, n, ::Val{CData}) = print(io, "<![CDATA[", value(n), "]]>") +_printnode(io::IO, n, ::Val{DTD}) = print(io, "<!DOCTYPE ", value(n), '>') + +function _printnode(io::IO, n, ::Val{Declaration}) + print(io, "<?xml") + attrs = attributes(n) + if !isnothing(attrs) + for (k, v) in attrs + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, "?>") +end + +function _printnode(io::IO, n, ::Val{ProcessingInstruction}) + print(io, "<?", tag(n)) + v = value(n) + !isnothing(v) && print(io, ' ', v) + print(io, "?>") +end + +_printnode(io::IO, n, ::Val{Document}) = print(io, "Document") + +# Dispatch helper: avoid an Enum branch chain by tag-dispatching on Val{NodeType}. 
+_printnode(io::IO, n, nt::NodeType) = _printnode(io, n, Val(nt)) + +AbstractTrees.printnode(io::IO, n::Node) = _printnode(io, n) +AbstractTrees.printnode(io::IO, n::LazyNode) = _printnode(io, n) + +#-----------------------------------------------------------------------------# traits +AbstractTrees.NodeType(::Type{<:Node}) = AbstractTrees.HasNodeType() +AbstractTrees.NodeType(::Type{<:LazyNode}) = AbstractTrees.HasNodeType() +AbstractTrees.nodetype(::Type{N}) where {N <: Node} = N +AbstractTrees.nodetype(::Type{L}) where {L <: LazyNode} = L + +AbstractTrees.ChildIndexing(::Type{<:Node}) = AbstractTrees.IndexedChildren() + +end # module diff --git a/src/XML.jl b/src/XML.jl index 273bfda..a431541 100644 --- a/src/XML.jl +++ b/src/XML.jl @@ -1,31 +1,66 @@ module XML -using Mmap -using OrderedCollections: OrderedDict - export - # Core Types: - Node, LazyNode, - # Interface: - children, nodetype, tag, attributes, value, is_simple, simplevalue, simple_value, - # Extended Interface for LazyNode: - parent, depth, next, prev + Node, LazyNode, NodeType, Attributes, + CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text, + nodetype, tag, attributes, value, children, children!, eachchildnode, eachattribute, + is_simple, simple_value, is_simple_value, sourcetext, + depth, siblings, + xpath, + h + +include("XMLTokenizer.jl") +using .XMLTokenizer: + XMLTokenizer, tokenize, tag_name, attr_value, pi_target, + TokenKinds, Token, Tokenizer, TokenizerState #-----------------------------------------------------------------------------# escape/unescape -const escape_chars = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", "'" => "&apos;", '"' => "&quot;") +const ESCAPE_CHARS = ('&' => "&amp;", '<' => "&lt;", '>' => "&gt;", '\'' => "&apos;", '"' => "&quot;") + +""" + escape(x::AbstractString) -> String + +Escape the five XML predefined entities: `&` `<` `>` `'` `"`. + +!!! note "Changed in v0.4" + `escape` is no longer idempotent.
In previous versions, already-escaped sequences like + `&amp;` were left untouched. Now every `&` is escaped, so `escape("&amp;")` produces + `"&amp;amp;"`. Call `escape` only on raw, unescaped text. +""" +escape(x::AbstractString) = replace(x, ESCAPE_CHARS...) + +# Replace a numeric character reference with its Unicode character. +# Numeric character references encode characters by code point: decimal (`&#233;` → é) or hex (`&#xE9;` → é). +function _unescape_charref(ref::AbstractString) + is_hex = length(ref) > 3 && ref[3] in ('x', 'X') + digits = SubString(ref, is_hex ? 4 : 3, length(ref) - 1) + cp = tryparse(UInt32, digits; base = is_hex ? 16 : 10) + !isnothing(cp) && isvalid(Char, cp) ? string(Char(cp)) : ref +end + +""" + unescape(x::AbstractString) -> String + unescape(x::SubString{String}) -> Union{SubString{String}, String} + +Unescape XML entities in `x`: the five predefined entities (`&` `<` `>` `'` +`"`) and numeric character references (`&#123;`, `&#xAB;`). Each reference is processed +exactly once (no double-unescaping). + +When `x` is a `SubString{String}` containing no `&`, the input is returned unchanged with +no allocation — the common case for typical XML attribute and text content.
+""" function unescape(x::AbstractString) - result = x - for (pat, r) in reverse.(escape_chars) - result = replace(result, pat => r) - end - return result + s = string(x) + occursin('&', s) || return s + occursin("&#", s) && (s = replace(s, r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref)) + replace(s, "<" => "<", ">" => ">", "'" => "'", """ => "\"", "&" => "&") end -function escape(x::String) - result = replace(x, r"&(?!amp;|quot;|apos;|gt;|lt;)" => "&") - for (pat, r) in escape_chars[2:end] - result = replace(result, pat => r) - end - return result + +function unescape(x::SubString{String}) + occursin('&', x) || return x + s = String(x) + occursin("&#", s) && (s = replace(s, r"&#[xX]?[0-9a-fA-F]+;" => _unescape_charref)) + replace(s, "<" => "<", ">" => ">", "'" => "'", """ => "\"", "&" => "&") end #-----------------------------------------------------------------------------# NodeType @@ -34,9 +69,9 @@ end - Document # prolog & root Element - DTD # <!DOCTYPE ...> - Declaration # <?xml attributes... ?> - - ProcessingInstruction # <?NAME attributes... ?> + - ProcessingInstruction # <?NAME content... ?> - Comment # <!-- ... --> - - CData # <![CData[...]]> + - CData # <![CDATA[...]]> - Element # <NAME attributes... > children... </NAME> - Text # text @@ -45,381 +80,1131 @@ NodeTypes can be used to construct XML.Nodes: Document(children...) DTD(value) Declaration(; attributes) - ProcessingInstruction(tag, attributes) + ProcessingInstruction(tag, content) Comment(text) CData(text) Element(tag, children...; attributes) Text(text) """ -@enum(NodeType, CData, Comment, Declaration, Document, DTD, Element, ProcessingInstruction, Text) +@enum NodeType::UInt8 CData Comment Declaration Document DTD Element ProcessingInstruction Text +#-----------------------------------------------------------------------------# Attributes +""" + Attributes{S} <: AbstractDict{S, S} + +An ordered dictionary of XML attributes backed by a `Vector{Pair{S, S}}`. +Returned by [`attributes`](@ref). 
Preserves insertion order and supports the +full `AbstractDict` interface (`get`, `haskey`, `keys`, `values`, iteration, etc.). +""" +struct Attributes{S} <: AbstractDict{S, S} + entries::Vector{Pair{S, S}} +end + +Base.length(a::Attributes) = length(a.entries) +Base.iterate(a::Attributes, state...) = iterate(a.entries, state...) + +function Base.getindex(a::Attributes, key::AbstractString) + for (k, v) in a.entries + k == key && return v + end + throw(KeyError(key)) +end -#-----------------------------------------------------------------------------# includes -include("raw.jl") -include("dtd.jl") +function Base.get(a::Attributes, key::AbstractString, default) + for (k, v) in a.entries + k == key && return v + end + default +end + +function Base.haskey(a::Attributes, key::AbstractString) + any(p -> first(p) == key, a.entries) +end -abstract type AbstractXMLNode end +Base.keys(a::Attributes) = first.(a.entries) +Base.values(a::Attributes) = last.(a.entries) -#-----------------------------------------------------------------------------# LazyNode +#-----------------------------------------------------------------------------# Node """ - LazyNode(file::AbstractString) - LazyNode(data::XML.Raw) + Node{S} + +In-memory DOM node parameterized on the string storage type `S` (typically `String`, or +`SubString{String}` for zero-copy parsing). Every kind of XML node — `Element`, `Text`, +`Comment`, `CData`, `ProcessingInstruction`, `Declaration`, `DTD`, `Document` — is +represented by a single `Node{S}` whose [`NodeType`](@ref) determines which fields are +populated. -A Lazy representation of an XML node. + parse(xml, Node) # parse a string into a Node{String} + parse(xml, Node{SubString{String}}) # zero-copy variant + read(filename, Node) # read & parse a file + +Use the accessor functions ([`nodetype`](@ref), [`tag`](@ref), [`attributes`](@ref), +[`value`](@ref), [`children`](@ref)) rather than the raw fields when navigating a tree. 
+Integer indexing returns children (`node[1]`); string indexing returns attribute values +(`node["class"]`). """ -mutable struct LazyNode <: AbstractXMLNode - raw::Raw - tag::Union{Nothing, String} - attributes::Union{Nothing, OrderedDict{String, String}} - value::Union{Nothing, String} -end -LazyNode(raw::Raw) = LazyNode(raw, nothing, nothing, nothing) +struct Node{S} + nodetype::NodeType + tag::Union{Nothing, S} + attributes::Union{Nothing, Vector{Pair{S, S}}} + value::Union{Nothing, S} + children::Union{Nothing, Vector{Node{S}}} -function Base.getproperty(o::LazyNode, x::Symbol) - x === :raw && return getfield(o, :raw) - x === :nodetype && return nodetype(o.raw) - x === :tag && return isnothing(getfield(o, x)) ? setfield!(o, x, tag(o.raw)) : getfield(o, x) - x === :attributes && return isnothing(getfield(o, x)) ? setfield!(o, x, attributes(o.raw)) : getfield(o, x) - x === :value && return isnothing(getfield(o, x)) ? setfield!(o, x, value(o.raw)) : getfield(o, x) - x === :depth && return depth(o.raw) - x === :children && return LazyNode.(children(o.raw)) - error("type LazyNode has no field $(x)") + function Node{S}(nodetype::NodeType, tag, attributes, value, children) where {S} + if nodetype in (Text, Comment, CData, DTD) + isnothing(tag) && isnothing(attributes) && !isnothing(value) && isnothing(children) || + error("$nodetype nodes only accept a value.") + elseif nodetype === Element + !isnothing(tag) && isnothing(value) || + error("Element nodes require a tag and no value.") + elseif nodetype === Declaration + isnothing(tag) && isnothing(value) && isnothing(children) || + error("Declaration nodes only accept attributes.") + elseif nodetype === ProcessingInstruction + !isnothing(tag) && isnothing(attributes) && isnothing(children) || + error("ProcessingInstruction nodes require a tag and only accept a value.") + elseif nodetype === Document + isnothing(tag) && isnothing(attributes) && isnothing(value) || + error("Document nodes only accept children.") + end + 
new{S}(nodetype, tag, attributes, value, children) + end end -Base.propertynames(o::LazyNode) = (:raw, :nodetype, :tag, :attributes, :value, :depth, :children) -Base.show(io::IO, o::LazyNode) = _show_node(io, o) +#-----------------------------------------------------------------------------# interface +""" + nodetype(node) -> NodeType + +Return the [`NodeType`](@ref) of `node` (`Element`, `Text`, `Comment`, `CData`, +`ProcessingInstruction`, `Declaration`, `DTD`, or `Document`). +""" +nodetype(o::Node) = o.nodetype + +""" + tag(node) -> Union{String, SubString{String}, Nothing} + +Return the tag name of `node`. Defined for `Element` (element name) and +`ProcessingInstruction` (target name); returns `nothing` for other node types. +""" +tag(o::Node) = o.tag + +""" + attributes(node::Node) -> Union{Nothing, Attributes{String}} + +Return the attributes of an `Element` or `Declaration` node as an [`Attributes`](@ref) dict, +or `nothing` if the node has no attributes. + +!!! note "Changed in v0.4" + In previous versions, `attributes` returned an `OrderedDict` from OrderedCollections.jl. + It now returns an [`Attributes`](@ref), an ordered `AbstractDict` backed by a + `Vector{Pair}`. +""" +attributes(o::Node) = isnothing(o.attributes) ? nothing : Attributes(o.attributes) + +""" + value(node) -> Union{String, SubString{String}, Nothing} + +Return the textual content of `node`. Defined for `Text`, `Comment`, `CData`, `DTD`, and +`ProcessingInstruction`; returns `nothing` for `Element`, `Declaration`, and `Document` +(use [`children`](@ref) for those). +""" +value(o::Node) = o.value + +""" + children(node) -> Vector{Node} or () + +Return the child nodes of `node` in document order. Returns an empty tuple `()` for nodes +that cannot have children (e.g. `Text`, `Comment`, `CData`). +""" +children(o::Node) = something(o.children, ()) + +""" + is_simple(node) -> Bool + +Return `true` if `node` is an `Element` with no attributes and exactly one `Text` or +`CData` child — i.e. 
the `<tag>content</tag>` pattern with no nested markup. See also +[`simple_value`](@ref). +""" +is_simple(o::Node) = o.nodetype === Element && + (isnothing(o.attributes) || isempty(o.attributes)) && + !isnothing(o.children) && length(o.children) == 1 && + o.children[1].nodetype in (Text, CData) + +""" + simple_value(node) -> String + +Return the textual content of a simple element (see [`is_simple`](@ref)). Errors if +`node` is not simple. +""" +simple_value(o::Node) = is_simple(o) ? o.children[1].value : + error("`simple_value` is only defined for simple nodes.") + +""" + is_simple_value(node) -> Union{Nothing, String, SubString{String}} + +Combined predicate-and-accessor: return the simple text/CData value of `node` if it is a +simple element (see [`is_simple`](@ref)), or `nothing` otherwise. Avoids the redundant +tokenization that `is_simple(n) ? simple_value(n) : ...` does on `LazyNode`. +""" +is_simple_value(o::Node) = is_simple(o) ? o.children[1].value : nothing -Base.read(io::IO, ::Type{LazyNode}) = LazyNode(read(io, Raw)) -Base.read(filename::AbstractString, ::Type{LazyNode}) = LazyNode(read(filename, Raw)) -Base.parse(x::AbstractString, ::Type{LazyNode}) = LazyNode(parse(x, Raw)) +#-----------------------------------------------------------------------------# tree navigation -children(o::LazyNode) = LazyNode.(children(o.raw)) -parent(o::LazyNode) = LazyNode(parent(o.raw)) -depth(o::LazyNode) = depth(o.raw) +""" + parent(child::Node, root::Node) -> Node -Base.IteratorSize(::Type{LazyNode}) = Base.SizeUnknown() -Base.eltype(::Type{LazyNode}) = LazyNode +Return the parent of `child` within the tree rooted at `root`. -function Base.iterate(o::LazyNode, state=o) - n = next(state) - return isnothing(n) ? nothing : (n, n) +Since `Node` does not store parent pointers, this performs a tree search from `root`. +Throws an error if `child` is not found or if `child === root`. 
+""" +function Base.parent(child::Node, root::Node) + child === root && error("Root node has no parent.") + result = _find_parent(child, root) + isnothing(result) && error("Node not found in tree.") + result end -function next(o::LazyNode) - n = next(o.raw) - isnothing(n) && return nothing - n.type === RawElementClose ? next(LazyNode(n)) : LazyNode(n) +# Depth-first search for `child` within `current`; returns the containing node or nothing. +function _find_parent(child::Node, current::Node) + for c in children(current) + c === child && return current + result = _find_parent(child, c) + isnothing(result) || return result + end + nothing end -function prev(o::LazyNode) - n = prev(o.raw) - isnothing(n) && return nothing - n.type === RawElementClose ? prev(LazyNode(n)) : LazyNode(n) + +""" + depth(child::Node, root::Node) -> Int + +Return the depth of `child` within the tree rooted at `root` (root has depth 0). + +Since `Node` does not store parent pointers, this performs a tree search from `root`. +Throws an error if `child` is not found in the tree. +""" +function depth(child::Node, root::Node) + child === root && return 0 + result = _find_depth(child, root, 0) + isnothing(result) && error("Node not found in tree.") + result +end + +# Depth-first search returning the depth of `child` relative to `current` (where children +# of `current` are at depth `d + 1`), or nothing if not found. +function _find_depth(child::Node, current::Node, d::Int) + for c in children(current) + c === child && return d + 1 + result = _find_depth(child, c, d + 1) + isnothing(result) || return result + end + nothing end -#-----------------------------------------------------------------------------# Node """ - Node(nodetype, tag, attributes, value, children) - Node(node::Node; kw...) 
# copy node with keyword overrides - Node(node::LazyNode) # un-lazy the LazyNode + siblings(child::Node, root::Node) -> Vector{Node} + +Return the siblings of `child` (other children of the same parent) within the tree rooted +at `root`. The returned vector does not include `child` itself. -A representation of an XML DOM node. For simpler construction, use `(::NodeType)(args...)` +Throws an error if `child` is the root or is not found in the tree. """ -struct Node <: AbstractXMLNode - nodetype::NodeType - tag::Union{Nothing, String} - attributes::Union{Nothing, OrderedDict{String, String}} - value::Union{Nothing, String} - children::Union{Nothing, Vector{Node}} - - function Node(nodetype::NodeType, tag=nothing, attributes=nothing, value=nothing, children=nothing) - new(nodetype, - isnothing(tag) ? nothing : string(tag), - isnothing(attributes) ? nothing : OrderedDict(string(k) => string(v) for (k, v) in pairs(attributes)), - isnothing(value) ? nothing : string(value), - isnothing(children) ? nothing : - children isa Node ? [children] : - children isa Vector{Node} ? children : - children isa Vector ? map(Node, children) : - children isa Tuple ? map(Node, collect(children)) : - [Node(children)] - ) +function siblings(child::Node, root::Node) + p = parent(child, root) + [c for c in children(p) if c !== child] +end + +include("xpath.jl") +include("lazynode.jl") + + +#-----------------------------------------------------------------------------# _to_node +# Coerce a positional argument to a Node{String}: identity for nodes, wrap non-nodes as +# Text. The middle method rejects non-String parameterizations to keep mixed-storage trees +# from being silently constructed. 
+_to_node(n::Node{String}) = n +_to_node(n::Node) = throw(ArgumentError("Expected Node{String}, got $(typeof(n))")) +_to_node(x) = Node{String}(Text, nothing, nothing, string(x), nothing) + +#-----------------------------------------------------------------------------# NodeType constructors +# Make each NodeType variant callable as a constructor: `Element("div", ...)`, +# `Text("hi")`, etc. Dispatches on `T` to validate args/kwargs and build the right Node. +function (T::NodeType)(args...; attrs...) + S = String + if T in (Text, Comment, CData, DTD) + length(args) == 1 || error("$T nodes require exactly one value argument.") + !isempty(attrs) && error("$T nodes do not accept attributes.") + Node{S}(T, nothing, nothing, string(only(args)), nothing) + elseif T === Element + isempty(args) && error("Element nodes require at least a tag.") + t = string(first(args)) + a = Pair{S,S}[String(k) => String(v) for (k, v) in pairs(attrs)] + c = Node{S}[_to_node(x) for x in args[2:end]] + Node{S}(T, t, a, nothing, c) + elseif T === Declaration + !isempty(args) && error("Declaration nodes only accept keyword attributes.") + a = isempty(attrs) ? nothing : [String(k) => String(v) for (k, v) in pairs(attrs)] + Node{S}(T, nothing, a, nothing, nothing) + elseif T === ProcessingInstruction + length(args) >= 1 || error("ProcessingInstruction nodes require a target.") + length(args) <= 2 || error("ProcessingInstruction nodes accept a target and optional content.") + !isempty(attrs) && error("ProcessingInstruction nodes do not accept attributes.") + t = string(args[1]) + v = length(args) == 2 ? string(args[2]) : nothing + Node{S}(T, t, nothing, v, nothing) + elseif T === Document + !isempty(attrs) && error("Document nodes do not accept attributes.") + c = Node{S}[_to_node(x) for x in args] + Node{S}(T, nothing, nothing, nothing, c) end end -function Node(o::Node, x...; kw...) - attrs = !isnothing(kw) ? 
- merge( - OrderedDict(string(k) => string(v) for (k, v) in pairs(kw)), - isnothing(o.attributes) ? OrderedDict{String,String}() : o.attributes - ) : - o.attributes - children = isempty(x) ? o.children : vcat(isnothing(o.children) ? [] : o.children, collect(x)) - Node(o.nodetype, o.tag, attrs, o.value, children) +#-----------------------------------------------------------------------------# equality +# Treat `nothing` and an empty collection as equivalent so that an absent attribute / +# children field compares equal to an explicitly empty one. +_eq(::Nothing, ::Nothing) = true +_eq(::Nothing, b) = isempty(b) +_eq(a, ::Nothing) = isempty(a) +_eq(a, b) = a == b + +# Attribute equality is order-insensitive per XML spec. +function _attrs_eq(a, b) + a_empty = isnothing(a) || isempty(a) + b_empty = isnothing(b) || isempty(b) + a_empty && b_empty && return true + (a_empty != b_empty) && return false + length(a) != length(b) && return false + for p in a + p in b || return false + end + true end -function Node(node::LazyNode) - nodetype = node.nodetype - tag = node.tag - attributes = node.attributes - value = node.value - c = XML.children(node) - Node(nodetype, tag, attributes, value, isempty(c) ? 
nothing : map(Node, c)) +function Base.:(==)(a::Node, b::Node) + a.nodetype == b.nodetype && + a.tag == b.tag && + _attrs_eq(a.attributes, b.attributes) && + a.value == b.value && + _eq(a.children, b.children) end -Node(data::Raw) = Node(LazyNode(data)) +#-----------------------------------------------------------------------------# indexing +Base.getindex(o::Node, i::Integer) = children(o)[i] +Base.getindex(o::Node, ::Colon) = children(o) +Base.lastindex(o::Node) = lastindex(children(o)) +Base.only(o::Node) = only(children(o)) +Base.length(o::Node) = length(children(o)) + +function Base.get(o::Node, key::AbstractString, default) + isnothing(o.attributes) && return default + for (k, v) in o.attributes + k == key && return v + end + default +end -# Anything that's not Vector{UInt8} or a (Lazy)Node is converted to a Text Node -Node(x) = Node(Text, nothing, nothing, string(x), nothing) +const _MISSING_ATTR = gensym(:missing_attr) -h(tag::Union{Symbol, String}, children...; kw...) = Node(Element, tag, kw, nothing, children) -Base.getproperty(::typeof(h), tag::Symbol) = h(tag) -(o::Node)(children...; kw...) = Node(o, Node.(children)...; kw...) +function Base.getindex(o::Node, key::AbstractString) + val = get(o, key, _MISSING_ATTR) + val === _MISSING_ATTR && throw(KeyError(key)) + val +end -# NOT in-place for Text Nodes -function escape!(o::Node, warn::Bool=true) - if o.nodetype == Text - warn && @warn "escape!() called on a Text Node creates a new node." - return Text(escape(o.value)) +function Base.haskey(o::Node, key::AbstractString) + get(o, key, _MISSING_ATTR) !== _MISSING_ATTR +end + +Base.keys(o::Node) = isnothing(o.attributes) ? 
() : first.(o.attributes) + +#-----------------------------------------------------------------------------# mutation +function Base.setindex!(o::Node, val, i::Integer) + isnothing(o.children) && error("Node has no children.") + o.children[i] = _to_node(val) +end + +function Base.setindex!(o::Node, val, key::AbstractString) + isnothing(o.attributes) && error("Node has no attributes.") + v = string(val) + for i in eachindex(o.attributes) + if first(o.attributes[i]) == key + o.attributes[i] = key => v + return v + end end - isnothing(o.children) && return o - map!(x -> escape!(x, false), o.children, o.children) - o + push!(o.attributes, key => v) + v end -function unescape!(o::Node, warn::Bool=true) - if o.nodetype == Text - warn && @warn "unescape!() called on a Text Node creates a new node." - return Text(unescape(o.value)) + +function Base.push!(a::Node, b) + isnothing(a.children) && error("Node does not accept children.") + push!(a.children, _to_node(b)) + a +end + +function Base.pushfirst!(a::Node, b) + isnothing(a.children) && error("Node does not accept children.") + pushfirst!(a.children, _to_node(b)) + a +end + +#-----------------------------------------------------------------------------# show (REPL) +function Base.show(io::IO, o::Node) + nt = o.nodetype + print(io, nt) + if nt === Text + print(io, ' ', repr(o.value)) + elseif nt === Element + print(io, " <", o.tag) + if !isnothing(o.attributes) + for (k, v) in o.attributes + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, '>') + n = length(children(o)) + n > 0 && print(io, n == 1 ? 
" (1 child)" : " ($n children)") + elseif nt === DTD + print(io, " <!DOCTYPE ", o.value, '>') + elseif nt === Declaration + print(io, " <?xml") + if !isnothing(o.attributes) + for (k, v) in o.attributes + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, "?>") + elseif nt === ProcessingInstruction + print(io, " <?", o.tag) + !isnothing(o.value) && print(io, ' ', o.value) + print(io, "?>") + elseif nt === Comment + print(io, " <!--", o.value, "-->") + elseif nt === CData + print(io, " <![CDATA[", o.value, "]]>") + elseif nt === Document + n = length(children(o)) + n > 0 && print(io, n == 1 ? " (1 child)" : " ($n children)") end - isnothing(o.children) && return o - map!(x -> unescape!(x, false), o.children, o.children) - o end +#-----------------------------------------------------------------------------# show (text/xml) -Base.read(filename::AbstractString, ::Type{Node}) = Node(read(filename, Raw)) -Base.read(io::IO, ::Type{Node}) = Node(read(io, Raw)) -Base.parse(x::AbstractString, ::Type{Node}) = Node(parse(x, Raw)) +# Write XML-escaped content directly to IO (single pass, no intermediate string) +function _write_escaped(io::IO, s::String) + start = 1 + i = 1 + n = ncodeunits(s) + @inbounds while i <= n + b = codeunit(s, i) + esc = if b == UInt8('&'); "&" + elseif b == UInt8('<'); "<" + elseif b == UInt8('>'); ">" + elseif b == UInt8('"'); """ + elseif b == UInt8('\''); "'" + else + i += 1 + continue + end + i > start && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (i - start) % UInt) + print(io, esc) + i += 1 + start = i + end + start <= n && GC.@preserve s Base.unsafe_write(io, pointer(s, start), (n - start + 1) % UInt) + nothing +end -Base.setindex!(o::Node, val, i::Integer) = o.children[i] = Node(val) -Base.push!(a::Node, b::Node) = push!(a.children, b) -Base.pushfirst!(a::Node, b::Node) = pushfirst!(a.children, b) +# Cached indentation strings to avoid repeated allocation +const _MAX_CACHED_INDENT = 64 +const _INDENT_STRINGS = [" " ^ n 
for n in 0:_MAX_CACHED_INDENT] +@inline function _indent_str(n::Int) + 0 <= n <= _MAX_CACHED_INDENT && return @inbounds _INDENT_STRINGS[n + 1] + " " ^ n +end -Base.setindex!(o::Node, val, key::AbstractString) = (o.attributes[key] = string(val)) -Base.getindex(o::Node, val::AbstractString) = o.attributes[val] -Base.haskey(o::Node, key::AbstractString) = isnothing(o.attributes) ? false : haskey(o.attributes, key) -Base.keys(o::Node) = isnothing(o.attributes) ? () : keys(o.attributes) +# Serialize `key="escaped-value"` pairs for an attributes vector (no leading space outside). +# Uses byte-level `Base.write` instead of `print` to avoid the varargs-print dispatch +# overhead that shows up under profile when an element has many attributes. +function _print_attrs(io::IO, attributes) + isnothing(attributes) && return + for (k, v) in attributes + Base.write(io, UInt8(' ')) + Base.write(io, k) + Base.write(io, UInt8('=')) + Base.write(io, UInt8('"')) + _write_escaped(io, v) + Base.write(io, UInt8('"')) + end +end -Base.show(io::IO, o::Node) = _show_node(io, o) +# Whitespace-only Text — emitted by the parser to round-trip source whitespace; pretty +# printing regenerates indentation from the tree shape and drops these. +@inline function _is_ignorable_text(node::Node) + node.nodetype === Text && !isnothing(node.value) && all(isspace, node.value) +end -#-----------------------------------------------------------------------------# Node Constructors -function (T::NodeType)(args...; attr...) 
- if T === Document - !isempty(attr) && error("Document nodes do not have attributes.") - Node(T, nothing, nothing, nothing, args) - elseif T === DTD - !isempty(attr) && error("DTD nodes only accept a value.") - length(args) > 1 && error("DTD nodes only accept a value.") - Node(T, nothing, nothing, only(args)) - elseif T === Declaration - !isempty(args) && error("Declaration nodes only accept attributes") - Node(T, nothing, attr) - elseif T === ProcessingInstruction - length(args) == 1 || error("ProcessingInstruction nodes require a tag and attributes.") - Node(T, only(args), attr) - elseif T === Comment - !isempty(attr) && error("Comment nodes do not have attributes.") - length(args) > 1 && error("Comment nodes only accept a single input.") - Node(T, nothing, nothing, only(args)) - elseif T === CData - !isempty(attr) && error("CData nodes do not have attributes.") - length(args) > 1 && error("CData nodes only accept a single input.") - Node(T, nothing, nothing, only(args)) - elseif T === Text - !isempty(attr) && error("Text nodes do not have attributes.") - length(args) > 1 && error("Text nodes only accept a single input.") - Node(T, nothing, nothing, only(args)) - elseif T === Element - tag = first(args) - Node(T, tag, attr, nothing, args[2:end]) - else - error("Unreachable reached while trying to create a Node via (::NodeType)(args...; kw...).") +# Mixed content = at least one Text/CData child carrying actual (non-whitespace) data. +# In that case the original whitespace is significant and we must not reformat. +function _has_significant_text(children) + for c in children + nt = c.nodetype + if nt === Text + (!isnothing(c.value) && !all(isspace, c.value)) && return true + elseif nt === CData + return true + end end + false end -#-----------------------------------------------------------------------------# !!! common !!! -# Everything below here is common to all data structures +# Main XML serializer. 
`depth` controls indentation; `preserve` propagates `xml:space= +# "preserve"` semantics down the subtree so we don't reformat whitespace-sensitive content. +function _write_xml(io::IO, node::Node, depth::Int=0, indent::Int=2, preserve::Bool=false) + pad = preserve ? "" : _indent_str(indent * depth) + nt = node.nodetype + if nt === Text + _write_escaped(io, node.value) + elseif nt === Element + # Check xml:space on this element + child_preserve = preserve + if !isnothing(node.attributes) + for (k, v) in node.attributes + k == "xml:space" && (child_preserve = v == "preserve") + end + end + Base.write(io, pad) + Base.write(io, UInt8('<')) + Base.write(io, node.tag) + _print_attrs(io, node.attributes) + ch = node.children + if isnothing(ch) || isempty(ch) + Base.write(io, UInt8('/')) + Base.write(io, UInt8('>')) + elseif length(ch) == 1 && only(ch).nodetype === Text + Base.write(io, UInt8('>')) + _write_xml(io, only(ch), 0, 0, child_preserve) + Base.write(io, UInt8('<')) + Base.write(io, UInt8('/')) + Base.write(io, node.tag) + Base.write(io, UInt8('>')) + else + # If real Text or any CData lives among the children, treat as mixed + # content and preserve the original layout. Otherwise pretty-print + # and skip whitespace-only Text children — those were emitted by the + # parser purely to round-trip source whitespace, and the writer + # regenerates indentation from the tree shape. 
+ effective_preserve = child_preserve || _has_significant_text(ch) + if effective_preserve + Base.write(io, UInt8('>')) + else + Base.write(io, UInt8('>')) + Base.write(io, UInt8('\n')) + end + for child in ch + if !effective_preserve && _is_ignorable_text(child) + continue + end + _write_xml(io, child, depth + 1, indent, effective_preserve) + effective_preserve || Base.write(io, UInt8('\n')) + end + effective_preserve || Base.write(io, pad) + Base.write(io, UInt8('<')) + Base.write(io, UInt8('/')) + Base.write(io, node.tag) + Base.write(io, UInt8('>')) + end + elseif nt === Declaration + Base.write(io, pad) + Base.write(io, "<?xml") + _print_attrs(io, node.attributes) + Base.write(io, "?>") + elseif nt === ProcessingInstruction + Base.write(io, pad) + Base.write(io, "<?") + Base.write(io, node.tag) + if !isnothing(node.value) + Base.write(io, UInt8(' ')) + Base.write(io, node.value) + end + Base.write(io, "?>") + elseif nt === Comment + Base.write(io, pad) + Base.write(io, "<!--") + Base.write(io, node.value) + Base.write(io, "-->") + elseif nt === CData + Base.write(io, pad) + Base.write(io, "<![CDATA[") + Base.write(io, node.value) + Base.write(io, "]]>") + elseif nt === DTD + Base.write(io, pad) + Base.write(io, "<!DOCTYPE ") + Base.write(io, node.value) + Base.write(io, UInt8('>')) + elseif nt === Document + ch = node.children + if !isnothing(ch) + # Drop whitespace-only Text between top-level nodes when pretty + # printing (XML grammar disallows text at document level, so any + # such Text comes from inter-node whitespace in the source). + visible = preserve ? 
ch : filter(!_is_ignorable_text, ch) + n_visible = length(visible) + for (i, child) in enumerate(visible) + _write_xml(io, child, 0, indent, preserve) + i < n_visible && Base.write(io, UInt8('\n')) + end + end + end +end +Base.show(io::IO, ::MIME"text/xml", node::Node) = _write_xml(io, node) -#-----------------------------------------------------------------------------# interface fallbacks -nodetype(o) = o.nodetype -tag(o) = o.tag -attributes(o) = o.attributes -value(o) = o.value -children(o::T) where {T} = isnothing(o.children) ? () : o.children +#-----------------------------------------------------------------------------# write / read +write(node::Node; indentsize::Int=2) = (io = IOBuffer(); _write_xml(io, node, 0, indentsize); String(take!(io))) +write(filename::AbstractString, node::Node; kw...) = open(io -> write(io, node; kw...), filename, "w") +write(io::IO, node::Node; indentsize::Int=2) = _write_xml(io, node, 0, indentsize) -depth(o) = missing -parent(o) = missing -next(o) = missing -prev(o) = missing +Base.read(filename::AbstractString, ::Type{Node}) = parse(read(filename, String), Node) +Base.read(io::IO, ::Type{Node}) = parse(read(io, String), Node) -is_simple(o) = nodetype(o) == Element && (isnothing(attributes(o)) || isempty(attributes(o))) && - length(children(o)) == 1 && nodetype(only(o)) in (Text, CData) +#-----------------------------------------------------------------------------# parse +Base.parse(::Type{Node}, xml::AbstractString) = parse(xml, Node) -simple_value(o) = is_simple(o) ? 
value(only(o)) : error("`XML.simple_value` is only defined for simple nodes.") +function Base.parse(xml::AbstractString, ::Type{Node}) + _parse(String(xml), String, unescape) +end -Base.@deprecate_binding simplevalue simple_value +function Base.parse(xml::AbstractString, ::Type{Node{SubString{String}}}) + _parse(String(xml), SubString{String}, identity) +end -#-----------------------------------------------------------------------------# nodes_equal -function nodes_equal(a, b) - out = XML.tag(a) == XML.tag(b) - out &= XML.nodetype(a) == XML.nodetype(b) - out &= XML.attributes(a) == XML.attributes(b) - out &= XML.value(a) == XML.value(b) - out &= length(XML.children(a)) == length(XML.children(b)) - out &= all(nodes_equal(ai, bi) for (ai,bi) in zip(XML.children(a), XML.children(b))) - return out +# Convert a parser substring to the requested storage type — copy to a fresh String, or +# keep the zero-copy SubString view. +_to(::Type{String}, s::AbstractString) = String(s) +_to(::Type{SubString{String}}, s::SubString{String}) = s + +# Collapse an empty Vector to `nothing` so Node fields store "absent" canonically. +_nothingify(v::Vector) = isempty(v) ? nothing : v + +# Decode the raw bytes of a TEXT/ATTR_VALUE token into the parser's storage type. When the +# tokenizer guarantees no `&` was seen (`has_entities=false`), we skip the entity-decode +# pass entirely. The `convert_text=identity` specialization (SubString parse) skips the +# runtime branch as well — both arms would return the same value. +@inline _text_value(::Type{S}, raw, _, ::typeof(identity)) where {S} = _to(S, raw) +@inline _text_value(::Type{S}, raw, has_entities, convert_text::F) where {S, F} = + has_entities ? convert_text(raw) : _to(S, raw) + +# Token-stream → Node{S} builder. `convert_text` is `unescape` for parsed content (with +# entity decoding) and `identity` for zero-copy SubString parsing where the caller opts +# to keep raw escapes. 
+function _parse(xml::String, ::Type{S}, convert_text::F) where {S, F} + tags = S[] + attrs_stack = Vector{Pair{S,S}}[] + children_stack = Vector{Vector{Node{S}}}() + push!(children_stack, Node{S}[]) + + pending_attr_name = SubString(xml, 1, 0) + decl_attrs = nothing + pending_pi_tag = SubString(xml, 1, 0) + pending_pi_value = nothing + in_close_tag = false + + for token in tokenize(xml) + k = token.kind + + if k === TokenKinds.TEXT + v = _text_value(S, token.raw, token.has_entities, convert_text) + push!(last(children_stack), Node{S}(Text, nothing, nothing, v, nothing)) + + elseif k === TokenKinds.OPEN_TAG + push!(tags, _to(S, tag_name(token))) + push!(attrs_stack, Pair{S,S}[]) + push!(children_stack, Node{S}[]) + + elseif k === TokenKinds.SELF_CLOSE + t = pop!(tags) + a = pop!(attrs_stack) + pop!(children_stack) + push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, nothing)) + + elseif k === TokenKinds.TAG_CLOSE + in_close_tag && (in_close_tag = false) + + elseif k === TokenKinds.CLOSE_TAG + close_name = tag_name(token) + isempty(tags) && error("Closing tag </$close_name> with no matching open tag.") + t = pop!(tags) + t == close_name || error("Mismatched tags: expected </$t>, got </$close_name>.") + a = pop!(attrs_stack) + c = pop!(children_stack) + push!(last(children_stack), Node{S}(Element, t, _nothingify(a), nothing, isempty(c) ? 
nothing : c)) + in_close_tag = true + + elseif k === TokenKinds.ATTR_NAME + pending_attr_name = token.raw + + elseif k === TokenKinds.ATTR_VALUE + val = _text_value(S, attr_value(token), token.has_entities, convert_text) + name = _to(S, pending_attr_name) + if decl_attrs !== nothing + any(p -> first(p) == name, decl_attrs) && error("Duplicate attribute: $name") + push!(decl_attrs, name => val) + elseif !isempty(attrs_stack) + any(p -> first(p) == name, last(attrs_stack)) && error("Duplicate attribute: $name") + push!(last(attrs_stack), name => val) + end + + elseif k === TokenKinds.XML_DECL_OPEN + decl_attrs = Pair{S,S}[] + + elseif k === TokenKinds.XML_DECL_CLOSE + a = isempty(decl_attrs) ? nothing : decl_attrs + push!(last(children_stack), Node{S}(Declaration, nothing, a, nothing, nothing)) + decl_attrs = nothing + + elseif k === TokenKinds.COMMENT_CONTENT + push!(last(children_stack), Node{S}(Comment, nothing, nothing, _to(S, token.raw), nothing)) + + elseif k === TokenKinds.CDATA_CONTENT + push!(last(children_stack), Node{S}(CData, nothing, nothing, _to(S, token.raw), nothing)) + + elseif k === TokenKinds.DOCTYPE_CONTENT + push!(last(children_stack), Node{S}(DTD, nothing, nothing, _to(S, lstrip(token.raw)), nothing)) + + elseif k === TokenKinds.PI_OPEN + pending_pi_tag = pi_target(token) + pending_pi_value = nothing + + elseif k === TokenKinds.PI_CONTENT + content = strip(token.raw) + pending_pi_value = isempty(content) ? nothing : _to(S, content) + + elseif k === TokenKinds.PI_CLOSE + push!(last(children_stack), Node{S}(ProcessingInstruction, _to(S, pending_pi_tag), nothing, pending_pi_value, nothing)) + end + end + + !isempty(tags) && error("Unclosed tags: $(join(tags, ", "))") + doc_children = only(children_stack) + Node{S}(Document, nothing, nothing, nothing, isempty(doc_children) ? 
nothing : doc_children) end -Base.:(==)(a::AbstractXMLNode, b::AbstractXMLNode) = nodes_equal(a, b) +#-----------------------------------------------------------------------------# h (HTML/XML element builder) +""" + h(tag, children...; attrs...) + h.tag(children...; attrs...) -#-----------------------------------------------------------------------------# parse -Base.parse(::Type{T}, str::AbstractString) where {T <: AbstractXMLNode} = parse(str, T) +Convenience constructor for `Element` nodes. -#-----------------------------------------------------------------------------# indexing -Base.getindex(o::Union{Raw, AbstractXMLNode}) = o -Base.getindex(o::Union{Raw, AbstractXMLNode}, i::Integer) = children(o)[i] -Base.getindex(o::Union{Raw, AbstractXMLNode}, ::Colon) = children(o) -Base.lastindex(o::Union{Raw, AbstractXMLNode}) = lastindex(children(o)) - -Base.only(o::Union{Raw, AbstractXMLNode}) = only(children(o)) - -Base.length(o::AbstractXMLNode) = length(children(o)) - -#-----------------------------------------------------------------------------# printing -function _show_node(io::IO, o) - printstyled(io, typeof(o), ' '; color=:light_black) - !ismissing(depth(o)) && printstyled(io, "(depth=", depth(o), ") ", color=:light_black) - printstyled(io, nodetype(o), ; color=:light_green) - if o.nodetype === Text - printstyled(io, ' ', repr(value(o))) - elseif o.nodetype === Element - printstyled(io, " <", tag(o), color=:light_cyan) - _print_attrs(io, o; color=:light_yellow) - printstyled(io, '>', color=:light_cyan) - _print_n_children(io, o) - elseif o.nodetype === DTD - printstyled(io, " <!DOCTYPE "; color=:light_cyan) - printstyled(io, value(o), color=:light_black) - printstyled(io, '>', color=:light_cyan) - elseif o.nodetype === Declaration - printstyled(io, " <?xml", color=:light_cyan) - _print_attrs(io, o; color=:light_yellow) - printstyled(io, "?>", color=:light_cyan) - elseif o.nodetype === ProcessingInstruction - printstyled(io, " <?", tag(o), color=:light_cyan) - 
_print_attrs(io, o; color=:light_yellow) - printstyled(io, "?>", color=:light_cyan) - elseif o.nodetype === Comment - printstyled(io, " <!--", color=:light_cyan) - printstyled(io, value(o), color=:light_black) - printstyled(io, "-->", color=:light_cyan) - elseif o.nodetype === CData - printstyled(io, " <![CData[", color=:light_cyan) - printstyled(io, value(o), color=:light_black) - printstyled(io, "]]>", color=:light_cyan) - elseif o.nodetype === Document - _print_n_children(io, o) - elseif o.nodetype === UNKNOWN - printstyled(io, "Unknown", color=:light_cyan) - _print_n_children(io, o) - else - error("Unreachable reached") + h("div", "hello"; class="main") # <div class="main">hello</div> + h.div("hello"; class="main") # same thing +""" +function h(tag::Union{Symbol, AbstractString}, children...; attrs...) + t = String(tag) + a = Pair{String,String}[String(k) => String(v) for (k, v) in pairs(attrs)] + c = Node{String}[_to_node(x) for x in children] + Node{String}(Element, t, a, nothing, c) +end + +Base.getproperty(::typeof(h), tag::Symbol) = h(tag) + +function (o::Node)(args...; attrs...) + o.nodetype === Element || error("Only Element nodes are callable.") + old_children = something(o.children, ()) + old_attrs = isnothing(o.attributes) ? () : (Symbol(k) => v for (k, v) in o.attributes) + h(o.tag, old_children..., args...; old_attrs..., attrs...) +end + +#-----------------------------------------------------------------------------# DTD parsing +struct ElementDecl + name::String + content::String # "EMPTY", "ANY", or content model like "(#PCDATA)" or "(a,b,c)*" +end + +struct AttDecl + element::String + name::String + type::String # "CDATA", "ID", "(val1|val2)", "NOTATION (a|b)", etc. 
+ default::String # "#REQUIRED", "#IMPLIED", "#FIXED \"val\"", or "\"val\"" +end + +struct EntityDecl + name::String + value::Union{Nothing, String} # replacement text (internal entities) + external_id::Union{Nothing, String} # "SYSTEM \"uri\"" or "PUBLIC \"pubid\" \"uri\"" + parameter::Bool +end + +struct NotationDecl + name::String + external_id::String +end + +struct ParsedDTD + root::String + system_id::Union{Nothing, String} + public_id::Union{Nothing, String} + elements::Vector{ElementDecl} + attributes::Vector{AttDecl} + entities::Vector{EntityDecl} + notations::Vector{NotationDecl} +end + +# DTD parsing helpers — each returns (parsed_piece, new_pos) so calls compose. + +# A byte that can appear in an XML Name (letters, digits, `_`, `-`, `.`, `:`). +@inline _dtd_is_name_char(c::Char) = + ('a' <= c <= 'z') || ('A' <= c <= 'Z') || ('0' <= c <= '9') || + c == '_' || c == '-' || c == '.' || c == ':' + +# Advance past any whitespace. +function _dtd_skip_ws(s, pos) + while pos <= ncodeunits(s) && isspace(s[pos]) + pos += 1 end + pos end -function _print_attrs(io::IO, o; color=:normal) - attr = attributes(o) - isnothing(attr) && return nothing - for (k,v) in attr - # printstyled(io, ' ', k, '=', '"', v, '"'; color) - print(io, ' ', k, '=', '"', v, '"') +# Read an XML Name token; errors if no name characters are present. +function _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + start = pos + while pos <= ncodeunits(s) && _dtd_is_name_char(s[pos]) + pos += 1 end + start == pos && error("Expected name at position $pos in DTD") + SubString(s, start, pos - 1), pos end -function _print_n_children(io::IO, o::Node) - n = length(children(o)) - text = n == 0 ? "" : n == 1 ? " (1 child)" : " ($n children)" - printstyled(io, text, color=:light_black) -end -_print_n_children(io::IO, o) = nothing - -#-----------------------------------------------------------------------------# write_xml -write(x; kw...) 
= (io = IOBuffer(); write(io, x; kw...); String(take!(io))) - -write(filename::AbstractString, x; kw...) = open(io -> write(io, x; kw...), filename, "w") - -function write(io::IO, x, ctx::Vector{Bool}=[false]; indentsize::Int=2, depth::Int=1) - indent = ' ' ^ indentsize - nodetype = XML.nodetype(x) - tag = XML.tag(x) - value = XML.value(x) - children = XML.children(x) - - padding = indent ^ max(0, depth - 1) - !ctx[end] && print(io, padding) - - if nodetype === Text - print(io, value) - - elseif nodetype === Element - push!(ctx, ctx[end]) - update_ctx!(ctx, x) - print(io, '<', tag) - _print_attrs(io, x) - print(io, isempty(children) ? '/' : "", '>') - if !isempty(children) - if length(children) == 1 && XML.nodetype(only(children)) === Text - write(io, only(children), ctx; indentsize=0) - print(io, "</", tag, '>') - else - !ctx[end] && println(io) - foreach(children) do child - write(io, child, ctx; indentsize, depth=depth + 1) - !ctx[end] && println(io) - end - print(io, !ctx[end] ? padding : "", "</", tag, '>') + +# Read a `"..."` or `'...'` string and return the contents without the surrounding quotes. +function _dtd_read_quoted(s, pos) + pos = _dtd_skip_ws(s, pos) + q = s[pos] + (q == '"' || q == '\'') || error("Expected quoted string at position $pos in DTD") + pos += 1 + start = pos + while pos <= ncodeunits(s) && s[pos] != q + pos += 1 + end + val = SubString(s, start, pos - 1) + pos += 1 + val, pos +end + +# Read a balanced parenthesized expression (e.g. `(a|b|(c,d))`), returning the full +# substring including the outer `(` and `)`. Skips over quoted strings inside. 
+function _dtd_read_parens(s, pos) + pos = _dtd_skip_ws(s, pos) + s[pos] == '(' || error("Expected '(' at position $pos in DTD") + depth = 1 + start = pos + pos += 1 + while pos <= ncodeunits(s) && depth > 0 + c = s[pos] + if c == '(' + depth += 1 + elseif c == ')' + depth -= 1 + elseif c == '"' || c == '\'' + pos += 1 + while pos <= ncodeunits(s) && s[pos] != c + pos += 1 end end - pop!(ctx) + pos += 1 + end + SubString(s, start, pos - 1), pos +end - elseif nodetype === DTD - print(io, "<!DOCTYPE ", value, '>') +# Advance past the next `>` that terminates a markup declaration, ignoring `>` inside +# quoted strings. +function _dtd_skip_to_close(s, pos) + while pos <= ncodeunits(s) && s[pos] != '>' + c = s[pos] + if c == '"' || c == '\'' + pos += 1 + while pos <= ncodeunits(s) && s[pos] != c + pos += 1 + end + end + pos += 1 + end + pos <= ncodeunits(s) ? pos + 1 : pos +end - elseif nodetype === Declaration - print(io, "<?xml") - _print_attrs(io, x) - print(io, "?>") +# Parse `<!ELEMENT name content>` — content is either a name (EMPTY/ANY) or a parens +# group with an optional `*`/`+`/`?` quantifier appended. +function _dtd_parse_element(s, pos) + name, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + if s[pos] == '(' + content, pos = _dtd_read_parens(s, pos) + if pos <= ncodeunits(s) && s[pos] in ('*', '+', '?') + content = string(content, s[pos]) + pos += 1 + end + else + content, pos = _dtd_read_name(s, pos) + end + pos = _dtd_skip_to_close(s, pos) + ElementDecl(String(name), String(content)), pos +end - elseif nodetype === ProcessingInstruction - print(io, "<?", tag) - _print_attrs(io, x) - print(io, "?>") +# Parse `<!ATTLIST element name type default ...>` — emits one AttDecl per attribute. 
+function _dtd_parse_attlist(s, pos) + element, pos = _dtd_read_name(s, pos) + atts = AttDecl[] + while true + pos = _dtd_skip_ws(s, pos) + (pos > ncodeunits(s) || s[pos] == '>') && break - elseif nodetype === Comment - print(io, "<!--", value, "-->") + name, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) - elseif nodetype === CData - print(io, "<![CData[", value, "]]>") + # Attribute type + if s[pos] == '(' + atype, pos = _dtd_read_parens(s, pos) + else + atype, pos = _dtd_read_name(s, pos) + if atype == "NOTATION" + pos = _dtd_skip_ws(s, pos) + parens, pos = _dtd_read_parens(s, pos) + atype = string("NOTATION ", parens) + end + end + pos = _dtd_skip_ws(s, pos) - elseif nodetype === Document - foreach(children) do child - write(io, child, ctx; indentsize) - !ctx[end] && println(io) + # Default declaration + if s[pos] == '#' + pos += 1 + keyword, pos = _dtd_read_name(s, pos) + if keyword == "FIXED" + pos = _dtd_skip_ws(s, pos) + val, pos = _dtd_read_quoted(s, pos) + default = string("#FIXED \"", val, "\"") + else + default = string("#", keyword) + end + elseif s[pos] == '"' || s[pos] == '\'' + val, pos = _dtd_read_quoted(s, pos) + default = string("\"", val, "\"") + else + error("Expected default declaration at position $pos in DTD") end + push!(atts, AttDecl(String(element), String(name), String(atype), default)) + end + pos <= ncodeunits(s) && s[pos] == '>' && (pos += 1) + atts, pos +end + +# Parse `<!ENTITY [%] name "value">` or `<!ENTITY name SYSTEM/PUBLIC ...>`. `%` marks a +# parameter entity (referenced as `%name;` in DTDs only). 
+function _dtd_parse_entity(s, pos) + pos = _dtd_skip_ws(s, pos) + parameter = false + if pos <= ncodeunits(s) && s[pos] == '%' + parameter = true + pos += 1 + end + name, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + + value = nothing + external_id = nothing + if s[pos] == '"' || s[pos] == '\'' + v, pos = _dtd_read_quoted(s, pos) + value = String(v) + else + keyword, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + if keyword == "SYSTEM" + uri, pos = _dtd_read_quoted(s, pos) + external_id = string("SYSTEM \"", uri, "\"") + elseif keyword == "PUBLIC" + pubid, pos = _dtd_read_quoted(s, pos) + pos = _dtd_skip_ws(s, pos) + uri, pos = _dtd_read_quoted(s, pos) + external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"") + else + error("Expected SYSTEM, PUBLIC, or quoted value in ENTITY declaration") + end + end + pos = _dtd_skip_to_close(s, pos) + EntityDecl(String(name), value, external_id, parameter), pos +end +# Parse `<!NOTATION name SYSTEM "uri">` / `<!NOTATION name PUBLIC "pubid" ["uri"]>`. 
+function _dtd_parse_notation(s, pos) + name, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + keyword, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + if keyword == "SYSTEM" + uri, pos = _dtd_read_quoted(s, pos) + external_id = string("SYSTEM \"", uri, "\"") + elseif keyword == "PUBLIC" + pubid, pos = _dtd_read_quoted(s, pos) + pos = _dtd_skip_ws(s, pos) + if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'') + uri, pos = _dtd_read_quoted(s, pos) + external_id = string("PUBLIC \"", pubid, "\" \"", uri, "\"") + else + external_id = string("PUBLIC \"", pubid, "\"") + end else - error("Unreachable case reached during XML.write") + error("Expected SYSTEM or PUBLIC in NOTATION declaration") end + pos = _dtd_skip_to_close(s, pos) + NotationDecl(String(name), external_id), pos +end + +""" + parse_dtd(value::AbstractString) -> ParsedDTD + parse_dtd(node::Node) -> ParsedDTD +Parse a DTD value string (from a `DTD` node) into structured declarations. +""" +function parse_dtd(value::AbstractString) + s = String(value) + pos = 1 + + root, pos = _dtd_read_name(s, pos) + pos = _dtd_skip_ws(s, pos) + + # External ID + system_id = nothing + public_id = nothing + if pos <= ncodeunits(s) && _dtd_is_name_char(s[pos]) + keyword, kpos = _dtd_read_name(s, pos) + if keyword == "SYSTEM" + pos = kpos + uri, pos = _dtd_read_quoted(s, pos) + system_id = String(uri) + elseif keyword == "PUBLIC" + pos = kpos + pubid, pos = _dtd_read_quoted(s, pos) + public_id = String(pubid) + pos = _dtd_skip_ws(s, pos) + if pos <= ncodeunits(s) && (s[pos] == '"' || s[pos] == '\'') + uri, pos = _dtd_read_quoted(s, pos) + system_id = String(uri) + end + end + end + + elements = ElementDecl[] + attributes = AttDecl[] + entities = EntityDecl[] + notations = NotationDecl[] + + # Internal subset + pos = _dtd_skip_ws(s, pos) + if pos <= ncodeunits(s) && s[pos] == '[' + pos += 1 + while pos <= ncodeunits(s) + pos = _dtd_skip_ws(s, pos) + pos > ncodeunits(s) && break + s[pos] == ']' && 
break + + rest = SubString(s, pos) + if startswith(rest, "<!--") + i = findnext("-->", s, pos + 4) + isnothing(i) && error("Unterminated comment in DTD") + pos = last(i) + 1 + elseif startswith(rest, "<?") + i = findnext("?>", s, pos + 2) + isnothing(i) && error("Unterminated PI in DTD") + pos = last(i) + 1 + elseif startswith(rest, "<!ELEMENT") + elem, pos = _dtd_parse_element(s, pos + 9) + push!(elements, elem) + elseif startswith(rest, "<!ATTLIST") + atts, pos = _dtd_parse_attlist(s, pos + 9) + append!(attributes, atts) + elseif startswith(rest, "<!ENTITY") + ent, pos = _dtd_parse_entity(s, pos + 8) + push!(entities, ent) + elseif startswith(rest, "<!NOTATION") + not, pos = _dtd_parse_notation(s, pos + 10) + push!(notations, not) + elseif s[pos] == '%' + i = findnext(';', s, pos + 1) + isnothing(i) && error("Unterminated parameter entity reference in DTD") + pos = i + 1 + else + pos += 1 + end + end + end + + ParsedDTD(String(root), system_id, public_id, elements, attributes, entities, notations) +end + +function parse_dtd(node::Node) + node.nodetype === DTD || error("parse_dtd requires a DTD node.") + parse_dtd(node.value) +end + +#-----------------------------------------------------------------------------# deprecations +Base.@deprecate_binding simplevalue simple_value false + +# Removed types — informative errors +struct Raw + Raw(args...; kw...) = error(""" + `XML.Raw` has been removed in XML.jl v0.4. + Use `parse(str, Node)` or `read(filename, Node)` instead. + The streaming Raw/LazyNode API has been replaced by a token-based parser. + See `?XML.Node` for the new API.""") +end + +# Removed functions — informative errors +const _REMOVED_LAZYNODE_MSG = """ + This function was part of the LazyNode API, which has been removed in XML.jl v0.4. + Use `parse(str, Node)` to get a full DOM tree and navigate with `children`, `tag`, + `attributes`, `value`, and integer indexing (e.g. `node[1]`).""" + +for f in (:next, :prev) + msg = "`XML.$f` has been removed. 
$_REMOVED_LAZYNODE_MSG" + @eval function $f(o::Node) + Base.depwarn($msg, $(QuoteNode(f))) + error($msg) + end +end + +# 1-arg parent/depth were part of LazyNode API; 2-arg versions are defined above +const _PARENT_1ARG_MSG = "`XML.parent(node)` (single-argument) has been removed. $_REMOVED_LAZYNODE_MSG\n Use `parent(child, root)` instead to search from a known root node." +function Base.parent(o::Node) + Base.depwarn(_PARENT_1ARG_MSG, :parent) + error(_PARENT_1ARG_MSG) +end + +const _DEPTH_1ARG_MSG = "`XML.depth(node)` (single-argument) has been removed. $_REMOVED_LAZYNODE_MSG\n Use `depth(child, root)` instead to search from a known root node." +function depth(o::Node) + Base.depwarn(_DEPTH_1ARG_MSG, :depth) + error(_DEPTH_1ARG_MSG) +end + +function nodes_equal(a, b) + msg = """`XML.nodes_equal` has been removed in XML.jl v0.4. Use `==` instead: + a == b""" + Base.depwarn(msg, :nodes_equal) + error(msg) +end + +function escape!(o::Node, warn::Bool=true) + msg = """`XML.escape!` has been removed in XML.jl v0.4. + Text is now escaped automatically during `XML.write`.""" + Base.depwarn(msg, :escape!) + error(msg) +end + +function unescape!(o::Node, warn::Bool=true) + msg = """`XML.unescape!` has been removed in XML.jl v0.4. + Text is now unescaped automatically during `parse`.""" + Base.depwarn(msg, :unescape!) 
+ error(msg) end end # module XML diff --git a/src/XMLTokenizer.jl b/src/XMLTokenizer.jl new file mode 100644 index 0000000..c84f881 --- /dev/null +++ b/src/XMLTokenizer.jl @@ -0,0 +1,543 @@ +module XMLTokenizer + +#-----------------------------------------------------------------------# TokenKinds +baremodule TokenKinds + import Base: @enum + + @enum Kind::UInt8 begin + # Character data + TEXT # text content between markup + + # Element tags + OPEN_TAG # <name + CLOSE_TAG # </name + TAG_CLOSE # > + SELF_CLOSE # /> + ATTR_NAME # attribute name + ATTR_VALUE # "value" or 'value' (with quotes in raw) + + # CDATA sections + CDATA_OPEN # <![CDATA[ + CDATA_CONTENT # raw text content + CDATA_CLOSE # ]]> + + # Comments + COMMENT_OPEN # <!-- + COMMENT_CONTENT # comment text + COMMENT_CLOSE # --> + + # Processing instructions + PI_OPEN # <?target (includes target name) + PI_CONTENT # PI body text + PI_CLOSE # ?> + + # XML declaration (<?xml ...?>) + XML_DECL_OPEN # <?xml + XML_DECL_CLOSE # ?> + # (reuses ATTR_NAME / ATTR_VALUE for pseudo-attributes) + + # DOCTYPE + DOCTYPE_OPEN # <!DOCTYPE (or other <! declarations) + DOCTYPE_CONTENT # declaration body + DOCTYPE_CLOSE # > + end +end + +#-----------------------------------------------------------------------# Token +# `has_entities` records whether the raw bytes contain a `&`. It is set by the readers for +# `TEXT` and `ATTR_VALUE` (where entity references can appear) and stays `false` for every +# other token kind. The downstream parser uses it to skip `unescape`'s redundant byte scan +# when no entities are present. +# +# Field order matters: `has_entities` lives in the alignment padding that would otherwise +# sit between the 1-byte `kind` and the 24-byte `raw`. This keeps `sizeof(Token{String})` +# at 32 bytes instead of 40, which matters because tokens are allocated by the million +# during parse. 
+struct Token{S <: AbstractString} + kind::TokenKinds.Kind + has_entities::Bool + raw::SubString{S} +end + +# Backwards-compatible constructor for the many internal call sites that emit non-entity +# tokens (markup, names, close tokens, etc.). +@inline Token(kind::TokenKinds.Kind, raw::SubString{S}) where {S} = Token{S}(kind, false, raw) + +function Base.show(io::IO, t::Token) + print(io, t.kind, ": ", repr(String(t.raw))) +end + +#-----------------------------------------------------------------------# Tokenizer mode +@enum Mode::UInt8 begin + M_DEFAULT # normal content mode + M_TAG # inside open tag, reading attributes + M_TAG_VALUE # expecting quoted attribute value + M_CLOSE_TAG # inside close tag, expecting > + M_XML_DECL # inside <?xml, reading pseudo-attributes + M_XML_DECL_VALUE # expecting quoted attr value in xml decl + M_COMMENT # after <!--, reading content + M_CDATA # after <![CDATA[, reading content + M_PI # after <?target, reading content + M_DOCTYPE # after <!DOCTYPE, reading content +end + +#-----------------------------------------------------------------------# TokenizerState (immutable, SROA-friendly) +struct TokenizerState{S <: AbstractString} + pos::Int + mode::Mode + pending::Token{S} # buffered token for constructs that emit two tokens at once (e.g. content + close) +end + +# Create an empty token (no pending token buffered) +@inline no_token(s::AbstractString) = Token(TokenKinds.TEXT, @inbounds SubString(s, 1, 0)) +# Check whether the state has a buffered pending token +@inline has_pending(st::TokenizerState) = !isempty(st.pending.raw) + + +#-----------------------------------------------------------------------# Tokenizer (immutable iterator) +""" + tokenize(xml::AbstractString) -> Tokenizer + +Return a lazy iterator of `Token`s over the XML string `xml`. 
+""" +struct Tokenizer{S <: AbstractString} + data::S + start::Int +end + +tokenize(xml::AbstractString) = Tokenizer(xml, 1) +tokenize(xml::AbstractString, pos::Int) = StatefulTokenizer(Tokenizer(xml, pos)) + +# Lightweight mutable holder that drives the immutable `Tokenizer`'s iterate protocol with +# a single state field — avoids the `Union{VS,Nothing}` field and per-iteration tuple +# storage that `Iterators.Stateful` carries. +mutable struct StatefulTokenizer{S <: AbstractString} + const t::Tokenizer{S} + state::TokenizerState{S} + done::Bool +end + +StatefulTokenizer(t::Tokenizer{S}) where {S <: AbstractString} = + StatefulTokenizer{S}(t, TokenizerState(t.start, M_DEFAULT, no_token(t.data)), false) + +Base.IteratorSize(::Type{<:StatefulTokenizer}) = Base.SizeUnknown() +Base.eltype(::Type{StatefulTokenizer{S}}) where {S} = Token{S} + +@inline function Base.iterate(st::StatefulTokenizer, _ = nothing) + st.done && return nothing + r = iterate(st.t, st.state) + if r === nothing + st.done = true + return nothing + end + st.state = r[2] + (r[1], nothing) +end + +function Base.show(io::IO, t::Tokenizer) + n = ncodeunits(t.data) + print(io, "Tokenizer(") + t.start > 1 && print(io, t.start, "/") + print(io, Base.format_bytes(n), ")") +end + +Base.IteratorSize(::Type{<:Tokenizer}) = Base.SizeUnknown() +Base.eltype(::Type{Tokenizer{S}}) where {S} = Token{S} + +function Base.iterate(t::Tokenizer, st::TokenizerState=TokenizerState(t.start, M_DEFAULT, no_token(t.data))) + (; data) = t + (; pending, pos, mode) = st + + if has_pending(st) + return (pending, TokenizerState(pos, mode, no_token(data))) + end + iseof(data, pos) && return nothing + + if mode == M_DEFAULT + peek(data, pos) == UInt8('<') ? 
read_markup(data, pos) : read_text(data, pos) + elseif mode == M_TAG || mode == M_XML_DECL + read_in_tag(data, pos, mode) + elseif mode == M_TAG_VALUE || mode == M_XML_DECL_VALUE + read_attr_value(data, pos, mode) + elseif mode == M_CLOSE_TAG + read_close_tag_end(data, pos) + elseif mode == M_COMMENT + read_comment_body(data, pos) + elseif mode == M_CDATA + read_cdata_body(data, pos) + elseif mode == M_PI + read_pi_body(data, pos) + else # M_DOCTYPE + read_doctype_body(data, pos) + end +end + +#-----------------------------------------------------------------------# Internal helpers +# Check if pos is past the end of data +@inline iseof(data::AbstractString, pos::Int)::Bool = pos > ncodeunits(data) +# Read the byte at pos without bounds checking +@inline peek(data::AbstractString, pos::Int)::UInt8 = @inbounds codeunit(data, pos) +# Check if pos + offset is within bounds +@inline canpeek(data::AbstractString, pos::Int, offset::Int)::Bool = pos + offset <= ncodeunits(data) + +# Lookup table for XML name bytes (letter, digit, _, -, ., :) +const NAME_BYTE_TABLE = let t = falses(256) + for r in (UInt8('a'):UInt8('z'), UInt8('A'):UInt8('Z'), UInt8('0'):UInt8('9')) + for b in r; t[b + 1] = true; end + end + for b in (UInt8('_'), UInt8('-'), UInt8('.'), UInt8(':')); t[b + 1] = true; end + NTuple{256,Bool}(t) +end +@inline is_name_byte(b::UInt8)::Bool = @inbounds NAME_BYTE_TABLE[b + 1] + +# Check if byte is XML whitespace (space, tab, newline, carriage return) +@inline function is_whitespace(b::UInt8)::Bool + b == UInt8(' ') || b == UInt8('\t') || b == UInt8('\n') || b == UInt8('\r') +end + +# Advance pos past any whitespace bytes +@inline function skip_whitespace(data::AbstractString, pos::Int)::Int + @inbounds while !iseof(data, pos) && is_whitespace(peek(data, pos)) + pos += 1 + end + pos +end + +# Advance pos past a quoted string (single or double quotes) +function skip_quoted(data::AbstractString, pos::Int)::Int + q = @inbounds peek(data, pos) + pos += 1 + @inbounds 
while !iseof(data, pos) + peek(data, pos) == q && return pos + 1 + pos += 1 + end + error("Unterminated quoted string") +end + +# Throw a tokenizer error with position context (noinline to keep error paths out of hot code) +@noinline err(msg::AbstractString, pos::Int) = throw(ArgumentError("XML tokenizer error at position $pos: $msg")) + +#-----------------------------------------------------------------------# Text and markup +# Read text content up to the next '<'. Uses `findnext` (memchr-backed for `String`) to +# find the end-of-text delimiter, then scans for `&` only within the text region — a full +# document `findnext('&', ...)` would be O(doc_size) per text token and degrade to +# O(doc_size²) on entity-free documents. +function read_text(data::AbstractString, pos::Int) + start = pos + n = ncodeunits(data) + lt_idx = findnext('<', data, pos) + end_pos = isnothing(lt_idx) ? n + 1 : lt_idx + raw = @inbounds SubString(data, start, prevind(data, end_pos)) + has_amp = occursin('&', raw) + tok = Token{typeof(data)}(TokenKinds.TEXT, has_amp, raw) + (tok, TokenizerState(end_pos, M_DEFAULT, no_token(data))) +end + +# Dispatch on the character after '<' to the appropriate reader +function read_markup(data::AbstractString, pos::Int) + start = pos + pos += 1 # skip '<' + iseof(data, pos) && err("unexpected end of input after '<'", start) + + b = peek(data, pos) + if b == UInt8('!') + read_bang(data, pos + 1, start) + elseif b == UInt8('?') + read_pi_start(data, pos + 1, start) + elseif b == UInt8('/') + read_close_tag_start(data, pos + 1, start) + else + read_open_tag_start(data, pos, start) + end +end + +#-----------------------------------------------------------------------# <! dispatch +# Handle '<!' 
— comment, CDATA, or DOCTYPE +function read_bang(data::AbstractString, pos::Int, start::Int) + # Comment: <!-- + if !iseof(data, pos) && peek(data, pos) == UInt8('-') + pos += 1 + (!iseof(data, pos) && peek(data, pos) == UInt8('-')) || err("expected '<!--'", start) + pos += 1 + tok = Token(TokenKinds.COMMENT_OPEN, @inbounds SubString(data, start, pos - 1)) + return (tok, TokenizerState(pos, M_COMMENT, no_token(data))) + end + + # CDATA: <![CDATA[ + if !iseof(data, pos) && peek(data, pos) == UInt8('[') + pos += 1 + for expected in (UInt8('C'), UInt8('D'), UInt8('A'), UInt8('T'), UInt8('A'), UInt8('[')) + iseof(data, pos) && err("unterminated CDATA", start) + peek(data, pos) == expected || err("invalid CDATA section", start) + pos += 1 + end + tok = Token(TokenKinds.CDATA_OPEN, @inbounds SubString(data, start, pos - 1)) + return (tok, TokenizerState(pos, M_CDATA, no_token(data))) + end + + # <!DOCTYPE ...> or other <! declaration + @inbounds while !iseof(data, pos) && is_name_byte(peek(data, pos)) + pos += 1 + end + tok = Token(TokenKinds.DOCTYPE_OPEN, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, M_DOCTYPE, no_token(data))) +end + +#-----------------------------------------------------------------------# <? (PI / XML declaration) +# Handle '<?' 
— XML declaration or processing instruction +function read_pi_start(data::AbstractString, pos::Int, start::Int) + name_start = pos + @inbounds while !iseof(data, pos) && is_name_byte(peek(data, pos)) + pos += 1 + end + + is_xml = (pos - name_start == 3) && + codeunit(data, name_start) == UInt8('x') && + codeunit(data, name_start + 1) == UInt8('m') && + codeunit(data, name_start + 2) == UInt8('l') + + if is_xml + tok = Token(TokenKinds.XML_DECL_OPEN, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, M_XML_DECL, no_token(data))) + else + tok = Token(TokenKinds.PI_OPEN, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, M_PI, no_token(data))) + end +end + +#-----------------------------------------------------------------------# Tags +# Read '<name' and enter tag-attribute mode +function read_open_tag_start(data::AbstractString, pos::Int, start::Int) + @inbounds while !iseof(data, pos) && is_name_byte(peek(data, pos)) + pos += 1 + end + tok = Token(TokenKinds.OPEN_TAG, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, M_TAG, no_token(data))) +end + +# Read '</name' and enter close-tag mode +function read_close_tag_start(data::AbstractString, pos::Int, start::Int) + @inbounds while !iseof(data, pos) && is_name_byte(peek(data, pos)) + pos += 1 + end + tok = Token(TokenKinds.CLOSE_TAG, @inbounds SubString(data, start, pos - 1)) + (tok, TokenizerState(pos, M_CLOSE_TAG, no_token(data))) +end + +# Consume the '>' that closes a '</name>' tag +function read_close_tag_end(data::AbstractString, pos::Int) + pos = skip_whitespace(data, pos) + iseof(data, pos) && err("unterminated close tag", pos) + peek(data, pos) == UInt8('>') || err("expected '>'", pos) + tok = Token(TokenKinds.TAG_CLOSE, @inbounds SubString(data, pos, pos)) + (tok, TokenizerState(pos + 1, M_DEFAULT, no_token(data))) +end + +#-----------------------------------------------------------------------# Attributes (shared by M_TAG and M_XML_DECL) +# Read 
the next attribute name or tag-close delimiter (>, />, ?>) +function read_in_tag(data::AbstractString, pos::Int, mode::Mode) + pos = skip_whitespace(data, pos) + iseof(data, pos) && err("unterminated tag", pos) + + b = peek(data, pos) + is_decl = (mode == M_XML_DECL) + + # Check for end delimiters + if is_decl + if b == UInt8('?') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('>') + tok = Token(TokenKinds.XML_DECL_CLOSE, @inbounds SubString(data, pos, pos + 1)) + return (tok, TokenizerState(pos + 2, M_DEFAULT, no_token(data))) + end + else + if b == UInt8('>') + tok = Token(TokenKinds.TAG_CLOSE, @inbounds SubString(data, pos, pos)) + return (tok, TokenizerState(pos + 1, M_DEFAULT, no_token(data))) + end + if b == UInt8('/') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('>') + tok = Token(TokenKinds.SELF_CLOSE, @inbounds SubString(data, pos, pos + 1)) + return (tok, TokenizerState(pos + 2, M_DEFAULT, no_token(data))) + end + end + + # Attribute name + name_start = pos + @inbounds while !iseof(data, pos) && is_name_byte(peek(data, pos)) + pos += 1 + end + name_end = pos - 1 + name_start > name_end && err("expected attribute name or tag close", pos) + + # Consume '=' and surrounding whitespace (not part of any token) + pos = skip_whitespace(data, pos) + (!iseof(data, pos) && peek(data, pos) == UInt8('=')) || err("expected '=' after attribute name", pos) + pos += 1 + pos = skip_whitespace(data, pos) + + next_state = is_decl ? M_XML_DECL_VALUE : M_TAG_VALUE + tok = Token(TokenKinds.ATTR_NAME, @inbounds SubString(data, name_start, name_end)) + (tok, TokenizerState(pos, next_state, no_token(data))) +end + +# Read a quoted attribute value (including the quotes). Same shape as `read_text`: use +# `findnext` for the closing quote (memchr-backed for `String`), then a bounded `occursin` +# over the value range for entity detection so we never scan past the quote. 
+function read_attr_value(data::AbstractString, pos::Int, mode::Mode) + iseof(data, pos) && err("expected attribute value", pos) + + q = peek(data, pos) + (q == UInt8('"') || q == UInt8('\'')) || err("expected quoted attribute value", pos) + + start = pos + pos += 1 # skip opening quote + quote_char = Char(q) + close_idx = findnext(quote_char, data, pos) + isnothing(close_idx) && err("unterminated attribute value", start) + # Value range is [pos, close_idx - 1]; entity check is bounded to this view. + inner = @inbounds SubString(data, pos, prevind(data, close_idx)) + has_amp = occursin('&', inner) + pos = close_idx + 1 # one past the closing quote (always ASCII) + + next_state = (mode == M_XML_DECL_VALUE) ? M_XML_DECL : M_TAG + raw = @inbounds SubString(data, start, pos - 1) + tok = Token{typeof(data)}(TokenKinds.ATTR_VALUE, has_amp, raw) + (tok, TokenizerState(pos, next_state, no_token(data))) +end + +#-----------------------------------------------------------------------# Content bodies (comment, CDATA, PI, DOCTYPE) +# Scan for '-->' and emit comment content + close tokens +function read_comment_body(data::AbstractString, pos::Int) + start = pos + @inbounds while !iseof(data, pos) + if peek(data, pos) == UInt8('-') && + canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('-') && + canpeek(data, pos, 2) && peek(data, pos + 2) == UInt8('>') + content_end = prevind(data, pos) + close_start = pos + pos += 3 + pending = Token(TokenKinds.COMMENT_CLOSE, SubString(data, close_start, pos - 1)) + tok = Token(TokenKinds.COMMENT_CONTENT, SubString(data, start, content_end)) + return (tok, TokenizerState(pos, M_DEFAULT, pending)) + end + pos += 1 + end + err("unterminated comment", start) +end + +# Scan for ']]>' and emit CDATA content + close tokens +function read_cdata_body(data::AbstractString, pos::Int) + start = pos + @inbounds while !iseof(data, pos) + if peek(data, pos) == UInt8(']') && + canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8(']') && + canpeek(data, 
pos, 2) && peek(data, pos + 2) == UInt8('>') + content_end = prevind(data, pos) + close_start = pos + pos += 3 + pending = Token(TokenKinds.CDATA_CLOSE, SubString(data, close_start, pos - 1)) + tok = Token(TokenKinds.CDATA_CONTENT, SubString(data, start, content_end)) + return (tok, TokenizerState(pos, M_DEFAULT, pending)) + end + pos += 1 + end + err("unterminated CDATA section", start) +end + +# Scan for '?>' and emit PI content + close tokens +function read_pi_body(data::AbstractString, pos::Int) + start = pos + @inbounds while !iseof(data, pos) + if peek(data, pos) == UInt8('?') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('>') + content_end = prevind(data, pos) + close_start = pos + pos += 2 + pending = Token(TokenKinds.PI_CLOSE, SubString(data, close_start, pos - 1)) + tok = Token(TokenKinds.PI_CONTENT, SubString(data, start, content_end)) + return (tok, TokenizerState(pos, M_DEFAULT, pending)) + end + pos += 1 + end + err("unterminated processing instruction", start) +end + +# Scan DOCTYPE body, handling nested brackets, quotes, and comments +function read_doctype_body(data::AbstractString, pos::Int) + start = pos + depth = 0 + @inbounds while !iseof(data, pos) + b = peek(data, pos) + if b == UInt8('-') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('-') && + pos >= 3 && + codeunit(data, pos - 1) == UInt8('!') && + codeunit(data, pos - 2) == UInt8('<') + # Inside a <!-- comment: skip until --> + pos += 2 # skip "--" + while !iseof(data, pos) + if peek(data, pos) == UInt8('-') && canpeek(data, pos, 1) && peek(data, pos + 1) == UInt8('-') && + canpeek(data, pos, 2) && peek(data, pos + 2) == UInt8('>') + pos += 3 # skip "-->" + break + end + pos += 1 + end + elseif b == UInt8('"') || b == UInt8('\'') + pos = skip_quoted(data, pos) + elseif b == UInt8('[') + depth += 1 + pos += 1 + elseif b == UInt8(']') + depth -= 1 + pos += 1 + elseif b == UInt8('>') && depth == 0 + content_end = prevind(data, pos) + close_start = pos + pos += 1 + pending 
= Token(TokenKinds.DOCTYPE_CLOSE, @inbounds SubString(data, close_start, pos - 1)) + tok = Token(TokenKinds.DOCTYPE_CONTENT, @inbounds SubString(data, start, content_end)) + return (tok, TokenizerState(pos, M_DEFAULT, pending)) + else + pos += 1 + end + end + err("unterminated DOCTYPE", start) +end + +#-----------------------------------------------------------------------# Utility functions + +""" + tag_name(token::Token) -> SubString{String} + +Extract the element name from an `OPEN_TAG` or `CLOSE_TAG` token. +""" +function tag_name(token::Token) + if token.kind == TokenKinds.OPEN_TAG + @inbounds SubString(token.raw, 2, ncodeunits(token.raw)) # skip '<' + elseif token.kind == TokenKinds.CLOSE_TAG + @inbounds SubString(token.raw, 3, ncodeunits(token.raw)) # skip '</' + else + throw(ArgumentError("tag_name requires OPEN_TAG or CLOSE_TAG, got $(token.kind)")) + end +end + +""" + attr_value(token::Token) -> SubString{String} + +Strip the surrounding quotes from an `ATTR_VALUE` token. +""" +function attr_value(token::Token) + token.kind == TokenKinds.ATTR_VALUE || + throw(ArgumentError("attr_value requires ATTR_VALUE, got $(token.kind)")) + @inbounds SubString(token.raw, 2, prevind(token.raw, lastindex(token.raw))) +end + +""" + pi_target(token::Token) -> SubString{String} + +Extract the target name from a `PI_OPEN` or `XML_DECL_OPEN` token. +""" +function pi_target(token::Token) + (token.kind == TokenKinds.PI_OPEN || token.kind == TokenKinds.XML_DECL_OPEN) || + throw(ArgumentError("pi_target requires PI_OPEN or XML_DECL_OPEN, got $(token.kind)")) + @inbounds SubString(token.raw, 3, ncodeunits(token.raw)) # skip '<?' 
+end + +end # module XMLTokenizer diff --git a/src/dtd.jl b/src/dtd.jl deleted file mode 100644 index 58299f0..0000000 --- a/src/dtd.jl +++ /dev/null @@ -1,141 +0,0 @@ -# This is all a work in progress - -#-----------------------------------------------------------------------------# position_after -function position_after(needle::Vector{UInt8}, haystack::Vector{UInt8}, i) - x = findnext(needle, haystack, i) - isnothing(x) ? nothing : x[end] + 1 -end - -position_after(needle::String, haystack::Vector{UInt8}, i) = position_after(Vector{UInt8}(needle), haystack, i) - - -#-----------------------------------------------------------------------------# DeclaredElement -struct DeclaredElement - name::String - content::String # "ANY", "EMPTY", or "(children...)" - function DeclaredElement(name, content) - content in ("ANY", "EMPTY") || (content[1] == '(' && content[end] == ')') || - error("DeclaredElement `content` must be 'ANY', 'EMPTY', or '(children...)'. Got $content.") - new(name, content) - end -end -Base.show(io::IO, o::DeclaredElement) = print(io, "<!ELEMENT ", o.name, " ", o.content, ">") - -function get_declared_elements(data::Vector{UInt8}) - i = position_after("<!ELEMENT", data, 1) - out = DeclaredElement[] - while !isnothing(i) - name, i = get_name(data, i + 1) - i = findnext(!isspace, data, i) - if data[i] == UInt8('(') - j = findnext(==(UInt8(')')), data, i + 1) - content = String(data[i:j]) - else - content, i = get_name(data, i) - end - push!(out, DeclaredElement(name, content)) - i = position_after("<!ELEMENT", data, i) - end - return out -end - -#-----------------------------------------------------------------------------# DeclaredAttribute -struct DeclaredAttribute - element_name::String - attribute_name::String - attribute_type::String - attribute_value::String -end -Base.show(io::IO, o::DeclaredAttribute) = print(io, "<!ATTLIST ", o.element_name, " ", o.attribute_name, " ", o.attribute_type, " ", o.attribute_value, ">") - - -function 
get_declared_attributes(data) - i = position_after("<!ATTLIST", data, 1) - out = DeclaredAttribute[] - while !isnothing(i) - element_name, i = get_name(data, i) - attribute_name, i = get_name(data, i) - i = findnext(!isspace, data, i) - attribute_type = if data[i] == UInt('(') - j = findnext(==(UInt8(')')), data, i) - String(data[i:j]) - i = j + 1 - else - nm, i = get_name(data, i) - nm - end - i = findnext(!isspace, data, i) - is_hash = data[i] == UInt8('#') - val, i = get_name(data, i) - attribute_value = is_hash ? '#' * val : val - push!(out, DeclaredAttribute(element_name, attribute_name, attribute_type, attribute_value)) - i = position_after("<!ATTLIST", data, i) - end - return out -end - -#-----------------------------------------------------------------------------# DeclaredEntity -struct DeclaredEntity - name::String - external::Bool - value::String -end -function Base.show(io::IO, o::DeclaredEntity) - print(io, "<!ENTITY ", o.name, " ", o.external ? "SYSTEM" : "", repr(o.value), ">") -end - -function get_declared_entities(data) - i = position_after("<!ENTITY", data, 1) - out = DeclaredEntity[] - while !isnothing(i) - name, i = get_name(data, i) - value, i = get_name(data, i) - external = value == "SYSTEM" - if external - value, i = get_name(data, i) - end - push!(out, DeclaredEntity(name, external, value)) - i = position_after("<!ENTITY", data, i) - end - return out -end - -#-----------------------------------------------------------------------------# DTDBody -struct DTDBody - elements::Vector{DeclaredElement} - attributes::Vector{DeclaredAttribute} - entities::Vector{DeclaredEntity} -end - -function Base.show(io::IO, o::DTDBody) - printstyled(io, "DTDBody\n", color=:light_cyan) - printstyled(io, " DeclaredElements (", length(o.elements), ")\n", color=:light_green) - foreach(x -> println(io, " ", x), o.elements) - printstyled(io, " DeclaredAttributes (", length(o.attributes), ")\n", color=:light_green) - foreach(x -> println(io, " ", x), o.attributes) - 
printstyled(io, " DeclaredEntities (", length(o.entities), ")\n", color=:light_green) - foreach(x -> println(io, " ", x), o.entities) -end - - -function DTDBody(data::Vector{UInt8}, file = false) - file && @goto isfile - i = position_after("<!DOCTYPE", data, 1) - root, i = get_name(data, i) - - i = findnext(==(UInt8('[')), data, i) - isnothing(i) && return DTDBody(root, [], [], []) - - @label isfile - elements = get_declared_elements(data) - attributes = get_declared_attributes(data) - entities = get_declared_entities(data) - return DTDBody(root, elements, attributes, entities) -end - - -Base.read(filename::String, ::Type{DTDBody}) = DTDBody(read(filename), true) -Base.read(io::IO, ::Type{DTDBody}) = DTDBody(read(io), true) - -Base.parse(s::AbstractString, ::Type{DTDBody}) = DTDBody(Vector{UInt8}(s)) -Base.parse(::Type{DTDBody}, s::AbstractString) = parse(s, DTDBody) diff --git a/src/lazynode.jl b/src/lazynode.jl new file mode 100644 index 0000000..185b53d --- /dev/null +++ b/src/lazynode.jl @@ -0,0 +1,548 @@ +#-----------------------------------------------------------------------------# LazyNode +""" + LazyNode + +A lightweight, read-only view into an XML document that navigates the token stream on demand +instead of building a full tree in memory. + + doc = parse(xml_string, LazyNode) + doc = read("file.xml", LazyNode) + +Supports the same read-only interface as `Node`: [`nodetype`](@ref), [`tag`](@ref), +[`attributes`](@ref), [`value`](@ref), [`children`](@ref), plus integer and string indexing. + +Accessor methods (`tag`, `value`, `keys`, `attributes`) return `SubString{String}` views +into the original document rather than allocated `String`s, so reading a large document +through `LazyNode` does not duplicate its text data. 
+""" +struct LazyNode{S <: AbstractString} + data::S + token::Token{S} + nodetype::NodeType +end + +function LazyNode(data::S, nt::NodeType) where {S <: AbstractString} + LazyNode{S}(data, Token(TokenKinds.TEXT, SubString(data, 1, 0)), nt) +end + +nodetype(n::LazyNode) = n.nodetype + +_lazy_pos(n::LazyNode) = n.token.raw.offset + 1 +_lazy_tokenizer(n::LazyNode) = tokenize(n.data, _lazy_pos(n)) + +# Entity-decode a TEXT/ATTR_VALUE token only when the tokenizer actually saw a `&`. When +# `has_entities` is false the raw `SubString{String}` view is returned with no allocation +# and no byte scan — the dominant case for spreadsheet-style data. `_decode_attr` strips +# the surrounding quotes first; the flag is read from the token, not the stripped view. +@inline _decode(tok::Token) = tok.has_entities ? unescape(tok.raw) : tok.raw +@inline _decode_attr(tok::Token) = tok.has_entities ? unescape(attr_value(tok)) : attr_value(tok) + +#-----------------------------------------------------------------------------# tag / value +function tag(n::LazyNode) + nt = n.nodetype + if nt === Element + return tag_name(n.token) + elseif nt === ProcessingInstruction + return pi_target(n.token) + end + nothing +end + +function value(n::LazyNode) + nt = n.nodetype + if nt === Text + return _decode(n.token) + elseif nt === Comment + iter = _lazy_tokenizer(n) + iterate(iter) # COMMENT_OPEN + return iterate(iter)[1].raw + elseif nt === CData + iter = _lazy_tokenizer(n) + iterate(iter) # CDATA_OPEN + return iterate(iter)[1].raw + elseif nt === DTD + iter = _lazy_tokenizer(n) + iterate(iter) # DOCTYPE_OPEN + return lstrip(iterate(iter)[1].raw) + elseif nt === ProcessingInstruction + iter = _lazy_tokenizer(n) + iterate(iter) # PI_OPEN + result = iterate(iter) + result === nothing && return nothing + result[1].kind === TokenKinds.PI_CONTENT || return nothing + content = strip(result[1].raw) + return isempty(content) ? 
nothing : content + end + nothing +end + +#-----------------------------------------------------------------------------# attributes +# Promote a `String` returned from `unescape` to a SubString so the homogeneous +# `Attributes{SubString{String}}` parameterization works. The String was already +# allocated for entity decoding; the SubString wrapper is just a view on top. +@inline _as_substring(s::SubString{String}) = s +@inline _as_substring(s::String) = SubString(s, 1, lastindex(s)) + +function attributes(n::LazyNode) + n.nodetype in (Element, Declaration) || return nothing + iter = _lazy_tokenizer(n) + iterate(iter) # skip OPEN_TAG or XML_DECL_OPEN + attrs = Pair{SubString{String}, SubString{String}}[] + for tok in iter + tok.kind === TokenKinds.ATTR_NAME || break + name = tok.raw + result = iterate(iter) + result === nothing && break + push!(attrs, name => _as_substring(_decode_attr(result[1]))) + end + isempty(attrs) ? nothing : Attributes(attrs) +end + +""" + get(n::LazyNode, key::AbstractString, default) + +Return the value of attribute `key` on `n`, or `default` if absent. Walks the token stream +once — no `Attributes` allocation — so this is the recommended way to read a single +attribute from a `LazyNode`. Use [`eachattribute`](@ref) to stream all attribute pairs +without allocating, or [`attributes`](@ref) for the materialized dict. 
+""" +function Base.get(n::LazyNode, key::AbstractString, default) + n.nodetype in (Element, Declaration) || return default + iter = _lazy_tokenizer(n) + iterate(iter) # skip OPEN_TAG or XML_DECL_OPEN + for tok in iter + tok.kind === TokenKinds.ATTR_NAME || return default + if tok.raw == key + result = iterate(iter) + result === nothing && return default + return _decode_attr(result[1]) + else + iterate(iter) # skip value + end + end + default +end + +#-----------------------------------------------------------------------------# eachattribute +struct LazyAttrIterator{I} + iter::I + done::Base.RefValue{Bool} +end + +Base.IteratorSize(::Type{<:LazyAttrIterator}) = Base.SizeUnknown() +Base.eltype(::Type{<:LazyAttrIterator}) = Pair{SubString{String}, Union{SubString{String}, String}} + +""" + eachattribute(n::LazyNode) + +Lazy iterator yielding `name => value` pairs for the attributes of `n` (an `Element` or +`Declaration`). Does not allocate an [`Attributes`](@ref) dict or intermediate vector; +suitable for hot paths that only need to scan attributes. + +For a single attribute by name, prefer `get(n, key, default)` — it short-circuits as soon +as the match is found. 
+""" +function eachattribute(n::LazyNode) + iter = _lazy_tokenizer(n) + is_attrs = n.nodetype === Element || n.nodetype === Declaration + is_attrs && iterate(iter) # skip OPEN_TAG / XML_DECL_OPEN + LazyAttrIterator{typeof(iter)}(iter, Ref(!is_attrs)) +end + +function Base.iterate(it::LazyAttrIterator, _ = nothing) + it.done[] && return nothing + r = iterate(it.iter) + isnothing(r) && (it.done[] = true; return nothing) + tok = r[1] + if tok.kind !== TokenKinds.ATTR_NAME + it.done[] = true + return nothing + end + name = tok.raw + r = iterate(it.iter) + if isnothing(r) + it.done[] = true + return nothing + end + val = _decode_attr(r[1]) + ((name => val), nothing) +end + +function Base.getindex(n::LazyNode, key::AbstractString) + val = get(n, key, _MISSING_ATTR) + val === _MISSING_ATTR && throw(KeyError(key)) + val +end + +function Base.haskey(n::LazyNode, key::AbstractString) + get(n, key, _MISSING_ATTR) !== _MISSING_ATTR +end + +function Base.keys(n::LazyNode) + n.nodetype in (Element, Declaration) || return () + iter = _lazy_tokenizer(n) + iterate(iter) + result = SubString{String}[] + for tok in iter + tok.kind === TokenKinds.ATTR_NAME || break + push!(result, tok.raw) + iterate(iter) # skip value + end + result +end + +#-----------------------------------------------------------------------------# children +function children(n::LazyNode{S}) where {S} + nt = n.nodetype + (nt === Document || nt === Element) || return () + children!(LazyNode{S}[], n) +end + +""" + children!(buf::Vector{LazyNode{S}}, n::LazyNode{S}) -> buf + +Collect children of `n` into `buf` (cleared first) and return it. Lets callers reuse a +single buffer across many nodes — useful when streaming through siblings (e.g. XLSX row +iteration) to avoid one `Vector` allocation per node. 
+""" +function children!(buf::Vector{LazyNode{S}}, n::LazyNode{S}) where {S} + empty!(buf) + nt = n.nodetype + if nt === Document + return _lazy_collect_children!(buf, n.data, _lazy_tokenizer(n)) + elseif nt !== Element + return buf + end + iter = _lazy_tokenizer(n) + for tok in iter + tok.kind === TokenKinds.SELF_CLOSE && return buf + tok.kind === TokenKinds.TAG_CLOSE && break + end + _lazy_collect_children!(buf, n.data, iter) +end + +function _lazy_collect_children!(result::Vector{LazyNode{S}}, data::S, iter) where {S <: AbstractString} + for tok in iter + k = tok.kind + if k === TokenKinds.TEXT + push!(result, LazyNode(data, tok, Text)) + elseif k === TokenKinds.OPEN_TAG + push!(result, LazyNode(data, tok, Element)) + _lazy_skip_element!(iter) + elseif k === TokenKinds.COMMENT_OPEN + push!(result, LazyNode(data, tok, Comment)) + _lazy_skip_until!(iter, TokenKinds.COMMENT_CLOSE) + elseif k === TokenKinds.CDATA_OPEN + push!(result, LazyNode(data, tok, CData)) + _lazy_skip_until!(iter, TokenKinds.CDATA_CLOSE) + elseif k === TokenKinds.PI_OPEN + push!(result, LazyNode(data, tok, ProcessingInstruction)) + _lazy_skip_until!(iter, TokenKinds.PI_CLOSE) + elseif k === TokenKinds.XML_DECL_OPEN + push!(result, LazyNode(data, tok, Declaration)) + _lazy_skip_until!(iter, TokenKinds.XML_DECL_CLOSE) + elseif k === TokenKinds.DOCTYPE_OPEN + push!(result, LazyNode(data, tok, DTD)) + _lazy_skip_until!(iter, TokenKinds.DOCTYPE_CLOSE) + elseif k === TokenKinds.CLOSE_TAG + break + end + end + result +end + +function _lazy_skip_element!(iter) + depth = 1 + for tok in iter + k = tok.kind + if k === TokenKinds.OPEN_TAG + depth += 1 + elseif k === TokenKinds.SELF_CLOSE + depth -= 1 + depth == 0 && return + elseif k === TokenKinds.CLOSE_TAG + depth -= 1 + if depth == 0 + iterate(iter) # consume trailing TAG_CLOSE + return + end + end + end +end + +function _lazy_skip_until!(iter, target::TokenKinds.Kind) + for tok in iter + tok.kind === target && return + end +end + +_token_end(tok) = 
tok.raw.offset + tok.raw.ncodeunits + +function _scan_to_close(iter, close_kind::TokenKinds.Kind) + for tok in iter + tok.kind === close_kind && return _token_end(tok) + end + error("Could not find closing token") +end + +#-----------------------------------------------------------------------------# sourcetext +""" + sourcetext(n::LazyNode) -> SubString{String} + +Return the original source text of the node as a `SubString`, with no parsing, escaping, +or reformatting. This is the zero-copy counterpart of [`write`](@ref) for lazy nodes. +""" +function sourcetext(n::LazyNode) + nt = n.nodetype + start = _lazy_pos(n) + if nt === Element + iter = _lazy_tokenizer(n) + for tok in iter + tok.kind === TokenKinds.SELF_CLOSE && return SubString(n.data, start, _token_end(tok)) + tok.kind === TokenKinds.TAG_CLOSE && break + end + depth = 1 + for tok in iter + k = tok.kind + if k === TokenKinds.OPEN_TAG + depth += 1 + elseif k === TokenKinds.SELF_CLOSE + depth -= 1 + elseif k === TokenKinds.CLOSE_TAG + depth -= 1 + if depth == 0 + result = iterate(iter) + result === nothing && error("Could not find closing '>'") + return SubString(n.data, start, _token_end(result[1])) + end + end + end + error("Could not find closing tag") + elseif nt === Comment + return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.COMMENT_CLOSE)) + elseif nt === CData + return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.CDATA_CLOSE)) + elseif nt === ProcessingInstruction + return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.PI_CLOSE)) + elseif nt === Declaration + return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.XML_DECL_CLOSE)) + elseif nt === DTD + return SubString(n.data, start, _scan_to_close(_lazy_tokenizer(n), TokenKinds.DOCTYPE_CLOSE)) + elseif nt === Text + return n.token.raw + elseif nt === Document + return SubString(n.data) + end +end + 
+#-----------------------------------------------------------------------------# write +""" + write(n::LazyNode; normalize::Bool=false, indentsize::Int=2) -> String + write(io::IO, n::LazyNode; normalize::Bool=false, indentsize::Int=2) + write(filename::AbstractString, n::LazyNode; normalize::Bool=false, indentsize::Int=2) + +Serialize a `LazyNode`. With `normalize=false` (the default) the result is the node's +original source bytes (zero-copy via [`sourcetext`](@ref)) — fast, but any source-side +whitespace between tags is preserved verbatim. + +With `normalize=true` the node is parsed into a `Node` tree and re-serialized, which +collapses incidental source whitespace and pretty-prints with `indentsize`-space +indentation. +""" +function write(n::LazyNode; normalize::Bool=false, indentsize::Int=2) + normalize ? write(parse(String(sourcetext(n)), Node); indentsize) : String(sourcetext(n)) +end + +function write(io::IO, n::LazyNode; normalize::Bool=false, indentsize::Int=2) + if normalize + write(io, parse(String(sourcetext(n)), Node); indentsize) + else + Base.write(io, sourcetext(n)) + end +end + +function write(filename::AbstractString, n::LazyNode; normalize::Bool=false, indentsize::Int=2) + open(io -> write(io, n; normalize, indentsize), filename, "w") +end + +#-----------------------------------------------------------------------------# eachchildnode +struct LazyChildIterator{S <: AbstractString, I} + data::S + iter::I + done::Base.RefValue{Bool} +end + +Base.IteratorSize(::Type{<:LazyChildIterator}) = Base.SizeUnknown() +Base.eltype(::Type{LazyChildIterator{S,I}}) where {S,I} = LazyNode{S} + +""" + eachchildnode(n::LazyNode) + +Return a lazy iterator over the children of `n`, yielding one [`LazyNode`](@ref) at a time +without collecting them all into a vector. + +See also [`children`](@ref), which returns a `Vector{LazyNode}`. 
+""" +function eachchildnode(n::LazyNode{S}) where {S} + nt = n.nodetype + iter = _lazy_tokenizer(n) + if nt === Document + return LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(false)) + elseif nt === Element + for tok in iter + if tok.kind === TokenKinds.SELF_CLOSE + return LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(true)) + elseif tok.kind === TokenKinds.TAG_CLOSE + return LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(false)) + end + end + end + LazyChildIterator{S, typeof(iter)}(n.data, iter, Ref(true)) +end + +function Base.iterate(ci::LazyChildIterator, _ = nothing) + ci.done[] && return nothing + for tok in ci.iter + k = tok.kind + if k === TokenKinds.TEXT + return (LazyNode(ci.data, tok, Text), nothing) + elseif k === TokenKinds.OPEN_TAG + node = LazyNode(ci.data, tok, Element) + _lazy_skip_element!(ci.iter) + return (node, nothing) + elseif k === TokenKinds.COMMENT_OPEN + node = LazyNode(ci.data, tok, Comment) + _lazy_skip_until!(ci.iter, TokenKinds.COMMENT_CLOSE) + return (node, nothing) + elseif k === TokenKinds.CDATA_OPEN + node = LazyNode(ci.data, tok, CData) + _lazy_skip_until!(ci.iter, TokenKinds.CDATA_CLOSE) + return (node, nothing) + elseif k === TokenKinds.PI_OPEN + node = LazyNode(ci.data, tok, ProcessingInstruction) + _lazy_skip_until!(ci.iter, TokenKinds.PI_CLOSE) + return (node, nothing) + elseif k === TokenKinds.XML_DECL_OPEN + node = LazyNode(ci.data, tok, Declaration) + _lazy_skip_until!(ci.iter, TokenKinds.XML_DECL_CLOSE) + return (node, nothing) + elseif k === TokenKinds.DOCTYPE_OPEN + node = LazyNode(ci.data, tok, DTD) + _lazy_skip_until!(ci.iter, TokenKinds.DOCTYPE_CLOSE) + return (node, nothing) + elseif k === TokenKinds.CLOSE_TAG || k === TokenKinds.TAG_CLOSE + ci.done[] = true + return nothing + end + end + ci.done[] = true + return nothing +end + +#-----------------------------------------------------------------------------# is_simple / simple_value +function is_simple(n::LazyNode) + n.nodetype === Element || 
return false + attrs = attributes(n) + (!isnothing(attrs) && !isempty(attrs)) && return false + ch = children(n) + length(ch) == 1 && ch[1].nodetype in (Text, CData) +end + +function simple_value(n::LazyNode) + n.nodetype === Element || error("`simple_value` is only defined for simple nodes.") + attrs = attributes(n) + (!isnothing(attrs) && !isempty(attrs)) && error("`simple_value` is only defined for simple nodes.") + ch = children(n) + length(ch) == 1 && ch[1].nodetype in (Text, CData) || error("`simple_value` is only defined for simple nodes.") + value(ch[1]) +end + +# Single-pass combined predicate+accessor: returns the simple text/CData value, or +# `nothing` if `n` is not a simple element. Avoids the double tokenization of +# `is_simple(n) ? simple_value(n) : ...`. +function is_simple_value(n::LazyNode) + n.nodetype === Element || return nothing + iter = _lazy_tokenizer(n) + iterate(iter) # skip OPEN_TAG + found_close = false + for tok in iter + k = tok.kind + k === TokenKinds.TAG_CLOSE && (found_close = true; break) + return nothing # attributes (ATTR_NAME), self-close, or anything else => not simple + end + found_close || return nothing + result = iterate(iter) + isnothing(result) && return nothing + tok = result[1] + k = tok.kind + if k === TokenKinds.TEXT + nxt = iterate(iter) + (isnothing(nxt) || nxt[1].kind !== TokenKinds.CLOSE_TAG) && return nothing + return _decode(tok) + elseif k === TokenKinds.CDATA_OPEN + r = iterate(iter) + (isnothing(r) || r[1].kind !== TokenKinds.CDATA_CONTENT) && return nothing + content = r[1].raw + r = iterate(iter) + (isnothing(r) || r[1].kind !== TokenKinds.CDATA_CLOSE) && return nothing + r = iterate(iter) + (isnothing(r) || r[1].kind !== TokenKinds.CLOSE_TAG) && return nothing + return content + end + nothing +end + +#-----------------------------------------------------------------------------# indexing +Base.getindex(n::LazyNode, i::Integer) = children(n)[i] +Base.getindex(n::LazyNode, ::Colon) = children(n) 
+Base.lastindex(n::LazyNode) = lastindex(children(n)) +Base.only(n::LazyNode) = only(children(n)) +Base.length(n::LazyNode) = length(children(n)) + +#-----------------------------------------------------------------------------# parse / read +Base.parse(::Type{LazyNode}, xml::AbstractString) = parse(xml, LazyNode) +Base.parse(xml::AbstractString, ::Type{LazyNode}) = LazyNode(String(xml), Document) + +Base.read(filename::AbstractString, ::Type{LazyNode}) = parse(read(filename, String), LazyNode) +Base.read(io::IO, ::Type{LazyNode}) = parse(read(io, String), LazyNode) + +#-----------------------------------------------------------------------------# show +function Base.show(io::IO, n::LazyNode) + nt = n.nodetype + print(io, "Lazy ", nt) + if nt === Text + print(io, ' ', repr(value(n))) + elseif nt === Element + print(io, " <", tag(n)) + attrs = attributes(n) + if !isnothing(attrs) + for (k, v) in attrs + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, '>') + elseif nt === DTD + print(io, " <!DOCTYPE ", value(n), '>') + elseif nt === Declaration + print(io, " <?xml") + attrs = attributes(n) + if !isnothing(attrs) + for (k, v) in attrs + print(io, ' ', k, '=', '"', v, '"') + end + end + print(io, "?>") + elseif nt === ProcessingInstruction + print(io, " <?", tag(n)) + v = value(n) + !isnothing(v) && print(io, ' ', v) + print(io, "?>") + elseif nt === Comment + print(io, " <!--", value(n), "-->") + elseif nt === CData + print(io, " <![CDATA[", value(n), "]]>") + elseif nt === Document + n_ch = length(children(n)) + n_ch > 0 && print(io, n_ch == 1 ? " (1 child)" : " ($n_ch children)") + end +end diff --git a/src/raw.jl b/src/raw.jl deleted file mode 100644 index 29d0a10..0000000 --- a/src/raw.jl +++ /dev/null @@ -1,568 +0,0 @@ -#-----------------------------------------------------------------------------# RawType -""" - RawType: - - RawText # text - - RawComment # <!-- ... --> - - RawCData # <![CData[...]]> - - RawDeclaration # <?xml attributes... 
?> - - RawProcessingInstruction # <?NAME attributes... ?> - - RawDTD # <!DOCTYPE ...> - - RawElementOpen # <NAME attributes... > - - RawElementClose # </NAME> - - RawElementSelfClosed # <NAME attributes... /> - - RawDocument # Something to initialize with (not really used) -""" -@enum(RawType, RawDocument, RawText, RawComment, RawCData, RawProcessingInstruction, - RawDeclaration, RawDTD, RawElementOpen, RawElementClose, RawElementSelfClosed) - -@inline nodetype(x::RawType) = - x === RawElementOpen ? Element : - x === RawElementClose ? Element : - x === RawElementSelfClosed ? Element : - x === RawText ? Text : - x === RawComment ? Comment : - x === RawCData ? CData : - x === RawDeclaration ? Declaration : - x === RawDTD ? DTD : - x === RawProcessingInstruction ? ProcessingInstruction : - x === RawDocument ? Document : - nothing - -#-----------------------------------------------------------------------------# Raw -""" - Raw(filename::String) - -Create an iterator over raw chunks of data in an XML file. Each chunk of data represents one of: - - - RawDocument # Only used to initialize the iterator state. - - RawText # text - - RawComment # <!-- ... --> - - RawCData # <![CData[...]]> - - RawDeclaration # <?xml attributes... ?> - - RawProcessingInstruction # <?NAME attributes... ?> - - RawDTD # <!DOCTYPE ...> - - RawElementOpen # <NAME attributes... > - - RawElementClose # </NAME> - - RawElementSelfClosed # <NAME attributes... /> - -Useful functions: - - - view(o::Raw) --> view of the Vector{UInt8} chunk. - - String(o::Raw) --> String of the chunk. - - next(o::Raw) --> Raw of the next chunk (or `nothing`). - - prev(o::Raw) --> Raw of the previous chunk (or `nothing`). - - tag(o::Raw) --> String of the tag name (or `nothing`). - - attributes(o::Raw) --> OrderedDict{String, String} of the attributes (or `nothing`). - - value(o::Raw) --> String of the value (or `nothing`). - - children(o::Raw) --> Vector{Raw} of the children (or `nothing`). 
- - parent(o::Raw) --> Raw of the parent (or `nothing`) - - depth(o::Raw) --> Int of the depth of the node in the XML DOM. -""" -struct Raw - type::RawType - depth::Int - pos::Int - len::Int - data::Vector{UInt8} - ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context) - has_xml_space::Bool # Whether data contains `xml:space` attribute at least once -end -function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false]) - needle = Vector{UInt8}("xml:space") - has_xml_space = findfirst(needle, data) !== nothing - return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space) -end -function Raw(data::Vector{UInt8}, has_xml_space::Bool, ctx::Vector{Bool}=Bool[false]) - return Raw(RawDocument, 0, 0, 0, data, ctx, has_xml_space) -end - -const _RAW_INDEX = WeakKeyDict{Vector{UInt8}, Any}() - -struct _TokRec - type::RawType - depth::Int - pos::Int - len::Int - ctx::Vector{Bool} -end - -mutable struct _Index - recs::Vector{_TokRec} - last_raw::Raw - built_end::Int -end - -Base.read(filename::String, ::Type{Raw}) = isfile(filename) ? 
- Raw(Mmap.mmap(filename)) : - error("File \"$filename\" does not exist.") - -Base.read(io::IO, ::Type{Raw}) = Raw(read(io)) - -Base.parse(x::AbstractString, ::Type{Raw}) = Raw(Vector{UInt8}(x)) - -# Mostly for debugging -Base.peek(o::Raw, n::Int) = String(view(o.data[o.pos+o.len+1:min(end, o.pos + o.len + n + 1)])) - -function Base.show(io::IO, o::Raw) - print(io, o.type, ':', o.depth, " (pos=", o.pos, ", len=", o.len, ")") - o.len > 0 && printstyled(io, ": ", String(o); color=:light_green) -end -function Base.:(==)(a::Raw, b::Raw) - a.type == b.type && a.depth == b.depth && a.pos == b.pos && a.len == b.len && a.data === b.data && a.ctx == b.ctx && a.has_xml_space == b.has_xml_space -end - -Base.view(o::Raw) = view(o.data, o.pos:o.pos+o.len) -Base.String(o::Raw) = String(view(o)) - -Base.IteratorSize(::Type{Raw}) = Base.SizeUnknown() -Base.eltype(::Type{Raw}) = Raw - -function Base.iterate(o::Raw, state=o) - n = next(state) - return isnothing(n) ? nothing : (n, n) -end - -is_node(o::Raw) = o.type !== RawElementClose -xml_nodes(o::Raw) = Iterators.Filter(is_node, o) - -#-----------------------------------------------------------------------------# get_name -is_name_start_char(x::UInt8) = x in UInt8('A'):UInt8('Z') || x in UInt8('a'):UInt8('z') || x == UInt8('_') -is_name_char(x::UInt8) = is_name_start_char(x) || x in UInt8('0'):UInt8('9') || x == UInt8('-') || x == UInt8('.') || x == UInt8(':') - -name_start(data, i) = findnext(is_name_start_char, data, i) -name_stop(data, i) = findnext(!is_name_char, data, i) - 1 - -function get_name(data, i) - i = name_start(data, i) - j = name_stop(data, i) - @views String(data[i:j]), j + 1 -end - -#-----------------------------------------------------------------------------# get_attributes -# starting at position i, return attributes up until the next '>' or '?' 
(DTD) -function get_attributes(data, i, j) - i = name_start(data, i) - (isnothing(j) || isnothing(i) || i > j) && return nothing - out = OrderedDict{String,String}() - while !isnothing(i) && i < j - key, i = get_name(data, i) - # get quotechar the value is wrapped in (either ' or ") - i = findnext(x -> x === UInt8('"') || x === UInt8('''), data, i + 1) - quotechar = data[i] - i2 = findnext(==(quotechar), data, i + 1) - @views value = String(data[i+1:i2-1]) - out[key] = value - i = name_start(data, i2) - end - return out -end - -# ----------------------------------------------------------------------------# Utilities supporting prev -function _get_or_init_index(o::Raw) - idx = get(_RAW_INDEX, o.data, nothing) - if idx === nothing - start = Raw(o.data) # fresh RawDocument - _RAW_INDEX[o.data] = _Index(_TokRec[], start, 0) - idx = _RAW_INDEX[o.data] - end - return idx -end -function _ensure_index_upto!(o::Raw, target_pos::Int) - idx = _get_or_init_index(o) - r = idx.last_raw - while true - n = next(r) - if n === nothing - idx.built_end = typemax(Int) - idx.last_raw = r - return idx - end - push!(idx.recs, _TokRec(n.type, n.depth, n.pos, n.len, copy(n.ctx))) - endpos = n.pos + n.len - idx.built_end = endpos - idx.last_raw = n - r = n - if endpos >= target_pos - return idx - end - end -end -function _find_prev_token(recs::Vector{_TokRec}, p::Int) - lo, hi = 1, length(recs) - ans = 0 - while lo <= hi - mid = (lo + hi) >>> 1 - endpos = recs[mid].pos + recs[mid].len - if endpos < p + 1 - ans = mid - lo = mid + 1 - else - hi = mid - 1 - end - end - return ans == 0 ? 
nothing : recs[ans] -end - -#-----------------------------------------------------------------------------# update xml:space context -# check attributes for xml:space and update ctx if necessary -function get_ctx(o) - att = attributes(o) - if !isnothing(att) && haskey(att, "xml:space") - if att["xml:space"] == "preserve" - return true - elseif att["xml:space"] == "default" - return false - else - error("Invalid value for xml:space attribute: $(att["xml:space"]). Must be 'preserve' or 'default'.") - end - end - return nothing -end -function update_ctx!(ctx, o) - new_ctx = get_ctx(o) - if new_ctx !== nothing - ctx[end] = new_ctx - end - return nothing -end - -#-----------------------------------------------------------------------------# interface -""" - nodetype(node) --> XML.NodeType - -Return the `XML.NodeType` of the node. -""" -nodetype(o::Raw) = nodetype(o.type) - -""" - tag(node) --> String or Nothing - -Return the tag name of `Element` and `PROCESSING_INSTRUCTION` nodes. -""" -function tag(o::Raw) - o.type ∉ [RawElementOpen, RawElementClose, RawElementSelfClosed, RawProcessingInstruction] && return nothing - return get_name(o.data, o.pos + 1)[1] -end - -""" - attributes(node) --> OrderedDict{String, String} or Nothing - -Return the attributes of `Element`, `Declaration`, or `ProcessingInstruction` nodes. -""" -function attributes(o::Raw) - if o.type === RawElementOpen || o.type === RawElementSelfClosed || o.type === RawProcessingInstruction - i = o.pos - i = name_start(o.data, i) - i = name_stop(o.data, i) - get_attributes(o.data, i + 1, o.pos + o.len) - elseif o.type === RawDeclaration - get_attributes(o.data, o.pos + 6, o.pos + o.len) - else - nothing - end -end - -""" - value(node) --> String or Nothing - -Return the value of `Text`, `CData`, `Comment`, or `DTD` nodes. 
-""" -function value(o::Raw) - if o.type === RawText - String(o) - elseif o.type === RawCData - String(view(o.data, o.pos+length("<![CData["):o.pos+o.len-3)) - elseif o.type === RawComment - String(view(o.data, o.pos+length("<!--"):o.pos+o.len-3)) - elseif o.type === RawDTD - String(view(o.data, o.pos+length("<!DOCTYPE "):o.pos+o.len-1)) - else - nothing - end -end - -""" - children(node) --> Vector{typeof(node)} - -Return the children the node. Will only be nonempty for `Element` and `Document` nodes. -""" -function children(o::Raw) - if o.type === RawElementOpen || o.type === RawDocument - depth = o.depth - out = Raw[] - for item in xml_nodes(o) - if item.depth == depth + 1 - push!(out, item) - end - item.depth == depth && break - o.type === RawDocument && item.depth == 2 && break # break if we've seen the doc root - end - out - else - Raw[] - end -end - -""" - depth(node) --> Int - -Return the depth of the node. Will be `0` for `Document` nodes. Not defined for `XML.Node`. -""" -function depth(o::Raw) - o.depth -end - -""" - parent(node) --> typeof(node), Nothing - -Return the parent of the node. Will be `nothing` for `Document` nodes. Not defined for `XML.Node`. -""" -function parent(o::Raw) - depth = o.depth - depth === 0 && return nothing - p = prev(o) - while p.depth >= depth - p = prev(p) - end - return p -end - -#-----------------------------------------------------------------------------# next Raw -# isspace(x::UInt8) = Base.isspace(Char(x)) - -# XML whitespace per XML 1.0/1.1 production S: -# S ::= (#x20 | #x9 | #xD | #xA)+ -@inline xml_isspace(b::UInt8)::Bool = (b == 0x20) | (b == 0x09) | (b == 0x0A) | (b == 0x0D) - -""" - next(node) --> typeof(node) or Nothing - -Return the next node in the document during depth-first traversal. Depth-first is the order you -would visit nodes by reading top-down through an XML file. Not defined for `XML.Node`. 
-""" -function next(o::Raw) - if o.has_xml_space # using xml:space context at least once in data - return next_xml_space(o) - else # not using xml:space context at all (same as v0.3.5) - return next_no_xml_space(o) - end -end - -function next_xml_space(o::Raw) - i = o.pos + o.len + 1 - depth = o.depth - data = o.data - type = o.type - has_xml_space = o.has_xml_space - ctx = copy(o.ctx) - last_type = type - k = findnext(!xml_isspace, data, i) - if isnothing(k) - return nothing - end - if last_type === RawElementOpen || last_type === RawDocument - depth += 1 - push!(ctx, ctx[end]) # inherit the xml:space context from parent - last_type === RawElementOpen && update_ctx!(ctx, o) # check attributes for xml:space and update if necessary - end - i = ctx[end] ? i : k - b = i > 1 ? Char(o.data[i-1]) : Char('<') - c = Char(o.data[i]) - d = Char(o.data[k+1]) - if c !== '<' || ctx[end] && c === '<' && b === ' ' && last_type === RawElementOpen && d === '/' - type = RawText - j = findnext(==(UInt8('<')), data, i) - 1 - j = ctx[end] ? j : findprev(!xml_isspace, data, j) # preserving whitespace if needed - if last_type === RawElementClose || last_type === RawElementSelfClosed|| last_type === RawDocument - # Maybe drop pure-whitespace inter-element text nodes? - # (e.g. whitespace between a closing and an opening tag which would otherwise make an orphan text node) - #if all(xml_isspace, @view data[i:j]) && depth > 1 - # return next(Raw(type, depth, j, 0, data, ctx, has_xml_space)) - #end - end - else - i = k - j = k + 1 - if c === '<' - c2 = Char(o.data[i+1]) - if c2 === '!' 
- c3 = Char(o.data[i+2]) - if c3 === '-' - type = RawComment - j = findnext(Vector{UInt8}("-->"), data, i)[end] - elseif c3 === '[' - type = RawCData - j = findnext(Vector{UInt8}("]]>"), data, i)[end] - elseif c3 === 'D' || c3 == 'd' - type = RawDTD - j = findnext(==(UInt8('>')), data, i) - while sum(==(UInt8('>')), @view data[k:j]) != sum(==(UInt8('<')), @view data[i:j]) - j = findnext(==(UInt8('>')), data, j + 1) - end - end - elseif c2 === '?' - if get_name(data, i + 2)[1] == "xml" - type = RawDeclaration - else - type = RawProcessingInstruction - end - j = findnext(Vector{UInt8}("?>"), data, i)[end] - elseif c2 === '/' - type = RawElementClose - depth -= 1 - pop!(ctx) # revert to parent xml:space context - j = findnext(==(UInt8('>')), data, i) - else - j = findnext(==(UInt8('>')), data, i) - if data[j-1] === UInt8('/') - type = RawElementSelfClosed - else - type = RawElementOpen - end - end - end - end - return Raw(type, depth, i, j - i, data, ctx, has_xml_space) -end - -function next_no_xml_space(o::Raw) # same as v0.3.5 - i = o.pos + o.len + 1 - depth = o.depth - data = o.data - type = o.type - has_xml_space = o.has_xml_space - ctx = [false] - i = findnext(!xml_isspace, data, i) - if isnothing(i) - return nothing - end - if type === RawElementOpen || type === RawDocument - depth += 1 - end - c = Char(o.data[i]) - d = Char(o.data[i+1]) - if c !== '<' - type = RawText - j = findnext(==(UInt8('<')), data, i) - 1 - j = findprev(!xml_isspace, data, j) # "rstrip" - elseif c === '<' - c2 = Char(o.data[i+1]) - if c2 === '!' - c3 = Char(o.data[i+2]) - if c3 === '-' - type = RawComment - j = findnext(Vector{UInt8}("-->"), data, i)[end] - elseif c3 === '[' - type = RawCData - j = findnext(Vector{UInt8}("]]>"), data, i)[end] - elseif c3 === 'D' || c3 == 'd' - type = RawDTD - j = findnext(==(UInt8('>')), data, i) - while sum(==(UInt8('>')), @view data[i:j]) != sum(==(UInt8('<')), @view data[i:j]) - j = findnext(==(UInt8('>')), data, j + 1) - end - end - elseif c2 === '?' 
- if get_name(data, i + 2)[1] == "xml" - type = RawDeclaration - else - type = RawProcessingInstruction - end - j = findnext(Vector{UInt8}("?>"), data, i)[end] - elseif c2 === '/' - type = RawElementClose - depth -= 1 - j = findnext(==(UInt8('>')), data, i) - else - j = findnext(==(UInt8('>')), data, i) - if data[j-1] === UInt8('/') - type = RawElementSelfClosed - else - type = RawElementOpen - end - end - end - return Raw(type, depth, i, j - i, data, ctx, has_xml_space) -end - -#-----------------------------------------------------------------------------# prev Raw -""" - prev(node) --> typeof(node), Nothing, or Missing (only for XML.Node) - -Return the previous node in the document during depth-first traversal. Not defined for `XML.Node`. -""" -function prev(o::Raw) - if o.has_xml_space # using xml:space context at least once in data - return prev_xml_space(o) - else # not using xml:space context at all (same as v0.3.5) - return prev_no_xml_space(o) - end -end - -function prev_xml_space(o::Raw) - o.type === RawDocument && return nothing - - idx = _ensure_index_upto!(o, o.pos - 1) - rec = _find_prev_token(idx.recs, o.pos - 1) - if rec === nothing - return Raw(o.data, o.has_xml_space, copy(o.ctx)) - end - return Raw(rec.type, rec.depth, rec.pos, rec.len, o.data, copy(rec.ctx), o.has_xml_space) -end -function prev_no_xml_space(o::Raw) # same as v0.3.5 - depth = o.depth - data = o.data - type = o.type - has_xml_space = o.has_xml_space - ctx = has_xml_space ? 
copy(o.ctx) : [false] - type === RawDocument && return nothing - j = o.pos - 1 - j = findprev(!xml_isspace, data, j) - if isnothing(j) - return Raw(data, has_xml_space, ctx) # RawDocument - end - c = Char(o.data[j]) - next_type = type - if c !== '>' # text - type = RawText - i = findprev(==(UInt8('>')), data, j) + 1 - i = findnext(!xml_isspace, data, i) # "lstrip" - elseif c === '>' - c2 = Char(o.data[j-1]) - if c2 === '-' - type = RawComment - i = findprev(Vector{UInt8}("<--"), data, j)[1] - elseif c2 === ']' - type = RawCData - i = findprev(Vector{UInt8}("<![CData["), data, j)[1] - elseif c2 === '?' - i = findprev(Vector{UInt8}("<?"), data, j)[1] - if get_name(data, i + 2)[1] == "xml" - type = RawDeclaration - else - type = RawProcessingInstruction - end - else - i = findprev(==(UInt8('<')), data, j) - char = Char(data[i+1]) - if char === '/' - type = RawElementClose - elseif char === '!' - type = DTD - elseif isletter(char) || char === '_' - type = Char(o.data[j-2]) === '/' ? RawElementSelfClosed : RawElementOpen - else - error("Should be unreachable. Unexpected data: <$char ... $c3$c2$c1>.") - end - end - else - error("Unreachable reached in XML.prev") - end - if type !== RawElementOpen && next_type === RawElementClose - depth += 1 - elseif type === RawElementOpen && next_type !== RawElementClose - depth -= 1 - end - return Raw(type, depth, i, j - i, data, ctx, has_xml_space) -end - diff --git a/src/xpath.jl b/src/xpath.jl new file mode 100644 index 0000000..87da263 --- /dev/null +++ b/src/xpath.jl @@ -0,0 +1,345 @@ +#-----------------------------------------------------------------------------# XPath +# A subset of XPath 1.0 for querying XML.Node trees. +# +# Supported syntax: +# / root (absolute path) +# tag child element by name +# * any child element +# // descendant-or-self (recursive) +# . current node +# .. 
parent node +# [n] positional predicate (1-based) +# [@attr] has-attribute predicate +# [@attr='v'] attribute-value predicate +# text() text node children +# node() all node children +# @attr attribute value (returns strings) + +#-----------------------------------------------------------------------------# Token types + +""" + XPathTokenKind + +Discriminator for the kinds of tokens produced by [`_xpath_tokenize`](@ref). + +| Variant | Source syntax | +|--------------------|--------------------------| +| `XPATH_ROOT` | `/` (path separator) | +| `XPATH_DESCENDANT` | `//` | +| `XPATH_NAME` | element tag name | +| `XPATH_WILDCARD` | `*` | +| `XPATH_DOT` | `.` (self) | +| `XPATH_DOTDOT` | `..` (parent) | +| `XPATH_TEXT_FN` | `text()` | +| `XPATH_NODE_FN` | `node()` | +| `XPATH_PREDICATE` | `[...]` body | +| `XPATH_ATTRIBUTE` | `@attr` (result position) | +""" +@enum XPathTokenKind::UInt8 begin + XPATH_ROOT # / + XPATH_DESCENDANT # // + XPATH_NAME # tag name + XPATH_WILDCARD # * + XPATH_DOT # . + XPATH_DOTDOT # .. + XPATH_TEXT_FN # text() + XPATH_NODE_FN # node() + XPATH_PREDICATE # [...] + XPATH_ATTRIBUTE # @attr (in result position) +end + +""" + XPathToken + +A single token from a parsed XPath expression: a [`XPathTokenKind`](@ref) tag together with +the relevant textual payload (tag name, predicate body, attribute name, etc.). Tokens with +no payload (`XPATH_ROOT`, `XPATH_WILDCARD`, …) carry the literal source character(s) for +debuggability. +""" +struct XPathToken + kind::XPathTokenKind + value::String +end + +#-----------------------------------------------------------------------------# Tokenizer + +# Lex an XPath expression into a flat token stream. Whitespace is discarded; unterminated +# predicates / function calls and unrecognised characters raise an error. Tokens preserve +# source order and are consumed left-to-right by `xpath`. 
+function _xpath_tokenize(expr::AbstractString) + tokens = XPathToken[] + s = String(expr) + i = 1 + n = ncodeunits(s) + + while i <= n + c = s[i] + + if c == '/' + if i < n && s[i+1] == '/' + push!(tokens, XPathToken(XPATH_DESCENDANT, "//")) + i += 2 + else + push!(tokens, XPathToken(XPATH_ROOT, "/")) + i += 1 + end + + elseif c == '.' + if i < n && s[i+1] == '.' + push!(tokens, XPathToken(XPATH_DOTDOT, "..")) + i += 2 + else + push!(tokens, XPathToken(XPATH_DOT, ".")) + i += 1 + end + + elseif c == '*' + push!(tokens, XPathToken(XPATH_WILDCARD, "*")) + i += 1 + + elseif c == '[' + j = findnext(']', s, i + 1) + isnothing(j) && error("Unterminated predicate in XPath: $(repr(s))") + push!(tokens, XPathToken(XPATH_PREDICATE, SubString(s, i + 1, j - 1))) + i = j + 1 + + elseif c == '@' + j = i + 1 + while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j])) + j += 1 + end + j == i + 1 && error("Empty attribute name after @ in XPath: $(repr(s))") + push!(tokens, XPathToken(XPATH_ATTRIBUTE, SubString(s, i + 1, j - 1))) + i = j + + elseif isletter(c) || c == '_' + j = i + 1 + while j <= n && (isletter(s[j]) || s[j] == '-' || s[j] == '_' || s[j] == ':' || isdigit(s[j]) || s[j] == '.') + j += 1 + end + name = SubString(s, i, j - 1) + # Check for function calls: text(), node() + if j <= n && s[j] == '(' + j2 = findnext(')', s, j + 1) + isnothing(j2) && error("Unterminated function call in XPath: $(repr(s))") + if name == "text" + push!(tokens, XPathToken(XPATH_TEXT_FN, "text()")) + elseif name == "node" + push!(tokens, XPathToken(XPATH_NODE_FN, "node()")) + else + error("Unknown XPath function: $name()") + end + i = j2 + 1 + else + push!(tokens, XPathToken(XPATH_NAME, String(name))) + i = j + end + + elseif isspace(c) + i += 1 + + else + error("Unexpected character '$(c)' in XPath: $(repr(s))") + end + end + tokens +end + +#-----------------------------------------------------------------------------# Predicate evaluation + +const 
_RE_ATTR_PRED = r"^@([A-Za-z_:][\w.\-:]*)$" +const _RE_ATTR_VAL_PRED = r"^@([A-Za-z_:][\w.\-:]*)\s*=\s*['\"]([^'\"]*)['\"]$" + +# Filter `nodes` by the body of a `[...]` predicate. Supports positional indices `[n]` +# (1-based; out-of-range yields empty), `[last()]`, `[@attr]` (has-attribute), and +# `[@attr='value']` / `[@attr="value"]` (attribute equals literal). Anything else errors. +# `root` is accepted for symmetry with `_xpath_step` but is unused by current predicates. +function _eval_predicate(predicate::AbstractString, nodes::Vector{Node{S}}, root::Node{S}) where S + s = strip(predicate) + + # Positional: [n] + pos = tryparse(Int, s) + if !isnothing(pos) + 1 <= pos <= length(nodes) || return Node{S}[] + return [nodes[pos]] + end + + # last() + if s == "last()" + isempty(nodes) && return Node{S}[] + return [nodes[end]] + end + + # [@attr] — has attribute + m = match(_RE_ATTR_PRED, s) + if !isnothing(m) + attr_name = m.captures[1] + return filter(n -> n.nodetype === Element && haskey(n, attr_name), nodes) + end + + # [@attr='value'] or [@attr="value"] + m = match(_RE_ATTR_VAL_PRED, s) + if !isnothing(m) + attr_name = m.captures[1] + attr_val = m.captures[2] + return filter(n -> n.nodetype === Element && get(n, attr_name, nothing) == attr_val, nodes) + end + + error("Unsupported XPath predicate: [$predicate]") +end + +#-----------------------------------------------------------------------------# Step evaluation + +# Apply a single non-predicate, non-descendant step to the current context and return the +# new context. Handles XPATH_NAME, XPATH_WILDCARD, XPATH_DOT, XPATH_DOTDOT, XPATH_TEXT_FN, +# XPATH_NODE_FN. XPATH_DESCENDANT is intentionally not handled here — the main evaluator +# expands `//` to descendant-or-self before the next step. `root` is used by `..` to avoid +# walking past the document root. 
+function _xpath_step(nodes::Vector{Node{S}}, token::XPathToken, root::Node{S}) where S + result = Node{S}[] + k = token.kind + + if k === XPATH_NAME + for n in nodes + for c in children(n) + c.nodetype === Element && c.tag == token.value && push!(result, c) + end + end + + elseif k === XPATH_WILDCARD + for n in nodes + for c in children(n) + c.nodetype === Element && push!(result, c) + end + end + + elseif k === XPATH_DOT + append!(result, nodes) + + elseif k === XPATH_DOTDOT + for n in nodes + n === root && continue + p = _find_parent(n, root) + isnothing(p) || push!(result, p) + end + + elseif k === XPATH_TEXT_FN + for n in nodes + for c in children(n) + c.nodetype === Text && push!(result, c) + end + end + + elseif k === XPATH_NODE_FN + for n in nodes + append!(result, children(n)) + end + + elseif k === XPATH_DESCENDANT + # Handled by caller — collects all descendants before next step + error("XPATH_DESCENDANT should be handled by the evaluator, not _xpath_step") + end + + result +end + +# Append every descendant of `node` (children, grandchildren, ...) to `out` in document +# order. Does not include `node` itself. +function _descendants!(out::Vector{Node{S}}, node::Node{S}) where S + for c in children(node) + push!(out, c) + _descendants!(out, c) + end +end + +# Implements XPath's descendant-or-self axis: for each input node, emit the node itself +# followed by all of its descendants in document order. +function _descendants(nodes::Vector{Node{S}}) where S + result = Node{S}[] + for n in nodes + push!(result, n) # descendant-or-self includes self + _descendants!(result, n) + end + result +end + +#-----------------------------------------------------------------------------# Main evaluator + +""" + xpath(node::Node, expr::AbstractString) -> Vector{Node} + +Evaluate an XPath expression against a `Node` tree and return matching nodes. 
+ +Supports a practical subset of XPath 1.0: +- Absolute (`/root/child`) and relative (`child/sub`) paths +- Recursive descent (`//tag`) +- Wildcards (`*`), self (`.`), parent (`..`) +- Positional predicates (`[1]`, `[last()]`) +- Attribute predicates (`[@attr]`, `[@attr='value']`) +- `text()` and `node()` functions +- Attribute selection (`@attr`) — returns `Text` nodes containing attribute values + +# Examples +```julia +doc = parse("<root><a x='1'/><a x='2'/><b/></root>", Node) +xpath(doc, "/root/a") # both <a> elements +xpath(doc, "/root/a[1]") # first <a> +xpath(doc, "//a[@x='2']") # <a x="2"/> +xpath(doc, "/root/b/@x") # attribute value as Text node (empty here) +``` +""" +function xpath(node::Node{S}, expr::AbstractString) where S + tokens = _xpath_tokenize(expr) + isempty(tokens) && return Node{S}[] + + # Determine root for .. navigation + root = node.nodetype === Document ? node : node + + i = 1 + # Start context + if tokens[1].kind === XPATH_ROOT + # Absolute path — start from the document or its root element + if node.nodetype === Document + current = Node{S}[node] + else + current = Node{S}[node] + end + i = 2 + else + current = Node{S}[node] + end + + while i <= length(tokens) + tok = tokens[i] + + if tok.kind === XPATH_PREDICATE + current = _eval_predicate(tok.value, current, root) + i += 1 + + elseif tok.kind === XPATH_DESCENDANT + current = _descendants(current) + # // must be followed by a step + i += 1 + + elseif tok.kind === XPATH_ROOT + # / as separator between steps — skip + i += 1 + + elseif tok.kind === XPATH_ATTRIBUTE + # @attr in result position — return attribute values as Text nodes + result = Node{S}[] + for n in current + v = get(n, tok.value, nothing) + !isnothing(v) && push!(result, Node{S}(Text, nothing, nothing, v, nothing)) + end + current = result + i += 1 + + else + current = _xpath_step(current, tok, root) + i += 1 + end + end + + current +end diff --git a/test/Project.toml b/test/Project.toml index d4883bd..c1703f7 100644 --- 
a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/data/complex_dtd.xml b/test/data/complex_dtd.xml new file mode 100644 index 0000000..cb69747 --- /dev/null +++ b/test/data/complex_dtd.xml @@ -0,0 +1,105 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!DOCTYPE test [ +<!-- ===== Bookstore DTD (complex demo) ===== --> + +<!-- Reusable parameter entities --> +<!ENTITY % text "(#PCDATA | em | code | xref | br)*"> +<!ENTITY % block "p | ul | ol | figure | table"> + +<!-- Notations (used by unparsed entities) --> +<!NOTATION jpeg SYSTEM "image/jpeg"> +<!NOTATION png SYSTEM "image/png"> + +<!-- Unparsed external entities (binary media) --> +<!ENTITY cover1 SYSTEM "covers/b123.jpg" NDATA jpeg> +<!ENTITY cover2 SYSTEM "covers/b456.png" NDATA png> + +<!ELEMENT catalog (metadata?, (book | magazine)+)> +<!ATTLIST catalog + tier CDATA #FIXED "retail" + xml:lang CDATA #IMPLIED> + +<!ELEMENT metadata (publisher?, contact?)> +<!ELEMENT publisher %text;> +<!ELEMENT contact (email, phone?)> +<!ELEMENT email (#PCDATA)> +<!ELEMENT phone (#PCDATA)> + +<!ELEMENT book (title, subtitle?, authors, pubinfo, description?, section*, reviews?, related?)> +<!ATTLIST book + id ID #REQUIRED + isbn CDATA #IMPLIED + format (hardcover | paperback | ebook) "paperback" + inStock (yes | no) #REQUIRED + xml:space (default | preserve) "default"> + +<!ELEMENT magazine (title, issue, article+)> +<!ATTLIST magazine id ID #REQUIRED> +<!ELEMENT issue (#PCDATA)> + +<!ELEMENT title %text;> +<!ELEMENT subtitle %text;> + +<!ELEMENT authors (author+)> +<!ELEMENT author (name, affiliation?)> +<!ATTLIST author id ID #IMPLIED> +<!ELEMENT name %text;> +<!ELEMENT affiliation %text;> + +<!ELEMENT pubinfo (publisher, year, price?, pages?)> +<!ELEMENT year (#PCDATA)> +<!ELEMENT price 
(#PCDATA)> +<!ELEMENT pages (#PCDATA)> + +<!ELEMENT description (%block;)*> + +<!ELEMENT section (title, (%block;)*, section*)> +<!ATTLIST section id ID #IMPLIED> + +<!ELEMENT p %text;> +<!ELEMENT ul (li+)> +<!ELEMENT ol (li+)> +<!ELEMENT li %text;> + +<!ELEMENT figure (caption?, media)> +<!ATTLIST figure + entity ENTITY #IMPLIED <!-- refers to cover1/cover2 --> + notation NOTATION (jpeg | png) #IMPLIED> +<!ELEMENT caption %text;> +<!ELEMENT media EMPTY> +<!ATTLIST media + src CDATA #REQUIRED + alt CDATA #IMPLIED> + +<!ELEMENT table (thead?, tbody, tfoot?)> +<!ELEMENT thead (tr+)> +<!ELEMENT tbody (tr+)> +<!ELEMENT tfoot (tr+)> +<!ELEMENT tr (th | td)+> +<!ELEMENT th %text;> +<!ELEMENT td %text;> + +<!ELEMENT reviews (review+)> +<!ELEMENT review (rating, p+)> +<!ATTLIST review by IDREF #REQUIRED + date CDATA #IMPLIED> +<!ELEMENT rating EMPTY> +<!ATTLIST rating value (poor | avg | good | excellent) #REQUIRED> + +<!ELEMENT related (seealso*)> +<!ELEMENT seealso EMPTY> +<!ATTLIST seealso ref IDREF #REQUIRED> + +<!ELEMENT article (title, authorrefs, p+)> +<!ELEMENT authorrefs (authorref+)> +<!ELEMENT authorref EMPTY> +<!ATTLIST authorref ref IDREF #REQUIRED> + +<!-- Conditional section controlled by a parameter entity --> +<!ENTITY % longdocs "INCLUDE"> +<![%longdocs;[ + <!ELEMENT appendix (title, p+)> + <!ATTLIST appendix id ID #REQUIRED> +]]> +]> diff --git a/test/data/preserve.xml b/test/data/preserve.xml new file mode 100644 index 0000000..e77add1 --- /dev/null +++ b/test/data/preserve.xml @@ -0,0 +1,5 @@ +<?xml version="1.0" encoding="UTF-8"?> +<root xml:space="preserve"> + This node has preserved space + with <child xml:space="default"> default </child> children. 
+</root> diff --git a/test/runtests.jl b/test/runtests.jl index 89978eb..4ab562c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,646 +1,3410 @@ using XML -using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text, escape, unescape, OrderedDict, h -using Downloads: download +using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text +using XML: escape, unescape, h, parse_dtd +using XML: ParsedDTD, ElementDecl, AttDecl, EntityDecl, NotationDecl using Test -import AbstractTrees - -AbstractTrees.children(x::Node) = children(x) - -#-----------------------------------------------------------------------------# files -xml_xsd = joinpath("data", "xml.xsd") -kml_xsd = joinpath("data", "kml.xsd") -books_xml = joinpath("data", "books.xml") -example_kml = joinpath("data", "example.kml") -simple_dtd = joinpath("data", "simple_dtd.xml") - -all_files = [xml_xsd, kml_xsd, books_xml, example_kml, simple_dtd] - -#-----------------------------------------------------------------------------# h -@testset "h function" begin - @test h.tag == XML.Element("tag") - @test h.tag(id="id") == XML.Element("tag"; id="id") - @test h.tag(1, 2, a="a", b="b") == XML.Element("tag", 1, 2; a="a", b="b") -end - -#-----------------------------------------------------------------------------# escaping/unescaping -@testset "escaping/unescaping" begin - s = "This > string < has & some \" special ' characters" - @test escape(s) == "This > string < has & some " special ' characters" - @test escape(escape(s)) == escape(s) - @test s == unescape(escape(s)) - @test s == unescape(unescape(escape(s))) - - n = Element("tag", Text(s)) - @test XML.simple_value(n) == s - - XML.escape!(n) - @test XML.simple_value(n) == escape(s) - - XML.unescape!(n) - @test XML.simple_value(n) == s -end - -#-----------------------------------------------------------------------------# DTD -# @testset "DTDBody and friends" begin -# s = read(simple_dtd, String) 
-# data = read(simple_dtd) - -# dtd = XML.DTDBody(data) -# dtd2 = parse(s, XML.DTDBody) - -# @test length(dtd.elements) == length(dtd2.elements) == 0 -# @test length(dtd.attributes) == length(dtd2.attributes) == 0 -# @test length(dtd.entities) == length(dtd2.entities) == 3 - -# o = read("data/tv.dtd", XML.DTDBody) -# end - -#-----------------------------------------------------------------------------# Raw -@testset "Raw tag/attributes/value" begin - examples = [ - (xml = "<!DOCTYPE html>", - nodetype = DTD, - tag=nothing, - attributes=nothing, - value="html"), - (xml = "<?xml version=\"1.0\" key=\"value\"?>", - nodetype = Declaration, - tag=nothing, - attributes=Dict("version" => "1.0", "key" => "value"), - value=nothing), - (xml = "<tag _id=\"1\", x=\"abc\" />", - nodetype = Element, - tag="tag", - attributes=Dict("_id" => "1", "x" => "abc"), - value=nothing), - (xml = "<!-- comment -->", - nodetype = Comment, - tag=nothing, - attributes=nothing, - value=" comment "), - (xml = "<![CData[cdata test]]>", - nodetype = CData, - tag=nothing, - attributes=nothing, - value="cdata test"), - ] - for x in examples - # @info "Testing: $(x.xml)" - data = XML.next(XML.parse(x.xml, XML.Raw)) - @test XML.nodetype(data) == x.nodetype - @test XML.tag(data) == x.tag - @test XML.attributes(data) == x.attributes - @test XML.value(data) == x.value - end -end - -@testset "Raw with books.xml" begin - data = read(books_xml, XML.Raw) - doc = collect(data) - @test length(doc) > countlines(books_xml) - # Check that the first 5 lines are correct - first_5_lines = [ - XML.RawDeclaration => """<?xml version="1.0"?>""", - XML.RawElementOpen => "<catalog>", - XML.RawElementOpen => "<book id=\"bk101\">", - XML.RawElementOpen => "<author>", - XML.RawText => "Gambardella, Matthew" - ] - for (i, (typ, str)) in enumerate(first_5_lines) - dt = doc[i] - @test dt.type == typ - @test String(dt) == str - end - # Check that the last line is correct - @test doc[end].type == XML.RawElementClose - @test 
String(doc[end]) == "</catalog>" - - @testset "next and prev" begin - @test XML.prev(doc[1]) == data # can't use === here because prev returns a copy of ctx - @test prev(data) === nothing - @test XML.next(doc[end]) === nothing - - n = length(doc) - next_res = [doc[1]] - foreach(_ -> push!(next_res, XML.next(next_res[end])), 1:n-1) - - prev_res = [doc[end]] - foreach(_ -> pushfirst!(prev_res, XML.prev(prev_res[1])), 1:n-1) - - idx = findall(next_res .!= prev_res) - - for (a,b) in zip(next_res, prev_res) - @test a == b + +#==============================================================================# +# ESCAPE / UNESCAPE # +#==============================================================================# +@testset "escape / unescape" begin + @testset "all five predefined entities" begin + @test escape("&") == "&" + @test escape("<") == "<" + @test escape(">") == ">" + @test escape("'") == "'" + @test escape("\"") == """ + end + + @testset "unescape reverses escape" begin + @test unescape("&") == "&" + @test unescape("<") == "<" + @test unescape(">") == ">" + @test unescape("'") == "'" + @test unescape(""") == "\"" + end + + @testset "roundtrip on mixed strings" begin + s = "This > string < has & some \" special ' characters" + @test unescape(escape(s)) == s + end + + @testset "idempotent unescape" begin + s = "plain text with no entities" + @test unescape(s) == s + end + + @testset "multiple entities in one string" begin + @test escape("a < b & c > d") == "a < b & c > d" + @test unescape("a < b & c > d") == "a < b & c > d" + end + + @testset "empty string" begin + @test escape("") == "" + @test unescape("") == "" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.1: Well-Formed XML Documents # +#==============================================================================# +@testset "Spec 2.1: Well-Formed XML Documents" begin + # The spec's simplest example: + # <?xml version="1.0"?> + # 
<greeting>Hello, world!</greeting> + xml = """<?xml version="1.0"?><greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + @test nodetype(doc) == Document + @test length(doc) == 2 # Declaration + Element + @test nodetype(doc[1]) == Declaration + @test nodetype(doc[2]) == Element + @test tag(doc[2]) == "greeting" + @test simple_value(doc[2]) == "Hello, world!" +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.4: Character Data and Markup # +#==============================================================================# +@testset "Spec 2.4: Character Data and Markup" begin + @testset "text content between tags" begin + doc = parse("<root>Hello</root>", Node) + @test simple_value(doc[1]) == "Hello" + end + + @testset "entity references in text are unescaped" begin + doc = parse("<root>& < > ' "</root>", Node) + @test simple_value(doc[1]) == "& < > ' \"" + end + + @testset "mixed text and child elements" begin + doc = parse("<p>Hello <b>world</b>!</p>", Node) + root = doc[1] + @test length(root) == 3 + @test nodetype(root[1]) == Text + @test value(root[1]) == "Hello " + @test nodetype(root[2]) == Element + @test tag(root[2]) == "b" + @test simple_value(root[2]) == "world" + @test nodetype(root[3]) == Text + @test value(root[3]) == "!" 
+ end + + @testset "empty element has no text" begin + doc = parse("<empty/>", Node) + @test length(children(doc[1])) == 0 + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.5: Comments # +#==============================================================================# +@testset "Spec 2.5: Comments" begin + @testset "basic comment (spec example)" begin + # Spec example: <!-- declarations for <head> & <body> --> + doc = parse("<root><!-- declarations for <head> & <body> --></root>", Node) + c = doc[1][1] + @test nodetype(c) == Comment + @test value(c) == " declarations for <head> & <body> " + end + + @testset "empty comment" begin + doc = parse("<root><!----></root>", Node) + c = doc[1][1] + @test nodetype(c) == Comment + @test value(c) == "" + end + + @testset "comment before root element" begin + doc = parse("<!-- before --><root/>", Node) + @test nodetype(doc[1]) == Comment + @test value(doc[1]) == " before " + @test nodetype(doc[2]) == Element + end + + @testset "comment after root element" begin + doc = parse("<root/><!-- after -->", Node) + @test nodetype(doc[1]) == Element + @test nodetype(doc[2]) == Comment + end + + @testset "comment with markup-like content preserved verbatim" begin + doc = parse("<root><!-- <b>not</b> a tag --></root>", Node) + @test value(doc[1][1]) == " <b>not</b> a tag " + end + + @testset "multiple comments" begin + doc = parse("<root><!-- A --><!-- B --></root>", Node) + @test length(doc[1]) == 2 + @test value(doc[1][1]) == " A " + @test value(doc[1][2]) == " B " + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.6: Processing Instructions # +#==============================================================================# +@testset "Spec 2.6: Processing Instructions" begin + @testset "xml-stylesheet PI (spec example)" begin + doc = parse("""<?xml-stylesheet type="text/xsl" 
href="style.xsl"?><root/>""", Node) + pi = doc[1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "xml-stylesheet" + @test contains(value(pi), "type=\"text/xsl\"") + end + + @testset "PI with no content" begin + doc = parse("<?target?><root/>", Node) + pi = doc[1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "target" + @test value(pi) === nothing + end + + @testset "PI inside element" begin + doc = parse("<root><?mypi some data?></root>", Node) + pi = doc[1][1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "mypi" + @test value(pi) == "some data" + end + + @testset "PI after root element" begin + doc = parse("<root/><?post-process?>", Node) + @test nodetype(doc[2]) == ProcessingInstruction + @test tag(doc[2]) == "post-process" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.7: CDATA Sections # +#==============================================================================# +@testset "Spec 2.7: CDATA Sections" begin + @testset "CDATA preserves markup characters" begin + # Spec example + doc = parse("<root><![CDATA[<greeting>Hello, world!</greeting>]]></root>", Node) + cd = doc[1][1] + @test nodetype(cd) == CData + @test value(cd) == "<greeting>Hello, world!</greeting>" + end + + @testset "empty CDATA" begin + doc = parse("<root><![CDATA[]]></root>", Node) + cd = doc[1][1] + @test nodetype(cd) == CData + @test value(cd) == "" + end + + @testset "CDATA with ampersands and less-thans" begin + doc = parse("<root><![CDATA[a < b && c > d]]></root>", Node) + @test value(doc[1][1]) == "a < b && c > d" + end + + @testset "CDATA with special characters" begin + doc = parse("<root><![CDATA[line1\nline2\ttab]]></root>", Node) + @test value(doc[1][1]) == "line1\nline2\ttab" + end + + @testset "CDATA mixed with text" begin + doc = parse("<root>before<![CDATA[inside]]>after</root>", Node) + @test length(doc[1]) == 3 + @test nodetype(doc[1][1]) == Text + 
@test value(doc[1][1]) == "before" + @test nodetype(doc[1][2]) == CData + @test value(doc[1][2]) == "inside" + @test nodetype(doc[1][3]) == Text + @test value(doc[1][3]) == "after" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.8: Prolog and Document Type Declaration # +#==============================================================================# +@testset "Spec 2.8: Prolog and Document Type Declaration" begin + @testset "XML declaration - version only" begin + doc = parse("""<?xml version="1.0"?><root/>""", Node) + decl = doc[1] + @test nodetype(decl) == Declaration + @test decl["version"] == "1.0" + end + + @testset "XML declaration - version and encoding" begin + doc = parse("""<?xml version="1.0" encoding="UTF-8"?><root/>""", Node) + decl = doc[1] + @test decl["version"] == "1.0" + @test decl["encoding"] == "UTF-8" + end + + @testset "XML declaration - all three pseudo-attributes" begin + doc = parse("""<?xml version="1.0" encoding="UTF-8" standalone="yes"?><root/>""", Node) + decl = doc[1] + @test decl["version"] == "1.0" + @test decl["encoding"] == "UTF-8" + @test decl["standalone"] == "yes" + end + + @testset "XML declaration with single quotes" begin + doc = parse("<?xml version='1.0'?><root/>", Node) + @test doc[1]["version"] == "1.0" + end + + @testset "no XML declaration" begin + doc = parse("<root/>", Node) + @test length(doc) == 1 + @test nodetype(doc[1]) == Element + end + + @testset "DOCTYPE - SYSTEM" begin + # Spec example + doc = parse("""<!DOCTYPE greeting SYSTEM "hello.dtd"><greeting/>""", Node) + dtd = doc[1] + @test nodetype(dtd) == DTD + @test contains(value(dtd), "greeting") + @test contains(value(dtd), "SYSTEM") + @test contains(value(dtd), "hello.dtd") + end + + @testset "DOCTYPE - with internal subset" begin + xml = """<!DOCTYPE greeting [ + <!ELEMENT greeting (#PCDATA)> +]><greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + dtd = doc[1] + @test 
nodetype(dtd) == DTD + @test contains(value(dtd), "greeting") + @test contains(value(dtd), "<!ELEMENT") + end + + @testset "DOCTYPE with entities (spec-like)" begin + xml = """<!DOCTYPE note [ +<!ENTITY nbsp " "> +<!ENTITY writer "Writer: Donald Duck."> +<!ENTITY copyright "Copyright: W3Schools."> +]><note/>""" + doc = parse(xml, Node) + @test nodetype(doc[1]) == DTD + @test contains(value(doc[1]), "ENTITY") + end + + @testset "full prolog: declaration + DOCTYPE" begin + xml = """<?xml version="1.0"?><!DOCTYPE root SYSTEM "root.dtd"><root/>""" + doc = parse(xml, Node) + @test nodetype(doc[1]) == Declaration + @test nodetype(doc[2]) == DTD + @test nodetype(doc[3]) == Element + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.9: Standalone Document Declaration # +#==============================================================================# +@testset "Spec 2.9: Standalone Document Declaration" begin + doc = parse("""<?xml version="1.0" standalone="yes"?><root/>""", Node) + @test doc[1]["standalone"] == "yes" + + doc2 = parse("""<?xml version="1.0" standalone="no"?><root/>""", Node) + @test doc2[1]["standalone"] == "no" +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 2.10: White Space Handling # +#==============================================================================# +@testset "Spec 2.10: White Space Handling" begin + @testset "parser preserves all text content verbatim" begin + doc = parse("<root> hello </root>", Node) + @test simple_value(doc[1]) == " hello " + end + + @testset "parser preserves whitespace-only text" begin + doc = parse("<root> </root>", Node) + @test simple_value(doc[1]) == " " + end + + @testset "parser preserves inter-element whitespace as Text nodes" begin + xml = "<root><a>x</a>\n <b>y</b></root>" + doc = parse(xml, Node) + @test length(doc[1]) == 3 + @test value(doc[1][1][1]) == "x" + @test 
nodetype(doc[1][2]) == Text + @test value(doc[1][2]) == "\n " + @test value(doc[1][3][1]) == "y" + end + + @testset "xml:space attribute is preserved during parsing" begin + doc = parse("""<root xml:space="preserve"><child> text </child></root>""", Node) + @test doc[1]["xml:space"] == "preserve" + @test value(doc[1][1][1]) == " text " + end + + @testset "xml:space='preserve' affects write formatting" begin + # When xml:space="preserve", writer doesn't add indentation + el = Element("s", XML.Text(" pre "), Element("t"), XML.Text(" post "); var"xml:space"="preserve") + @test XML.write(el) == "<s xml:space=\"preserve\"> pre <t/> post </s>" + end + + @testset "write formats with indentation by default" begin + el = Element("root", Element("a"), Element("b")) + s = XML.write(el) + @test contains(s, " <a/>") # indented + @test contains(s, " <b/>") # indented + end + + @testset "Unicode non-breaking space is NOT XML whitespace" begin + nbsp = "\u00A0" + xml = "<root>$(nbsp) y $(nbsp)</root>" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "$(nbsp) y $(nbsp)" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 3.1: Start-Tags, End-Tags, Empty-Element Tags # +#==============================================================================# +@testset "Spec 3.1: Start-Tags, End-Tags, Empty-Element Tags" begin + @testset "element with attributes (spec example)" begin + # <termdef id="dt-dog" term="dog"> + doc = parse("""<termdef id="dt-dog" term="dog">A dog.</termdef>""", Node) + el = doc[1] + @test tag(el) == "termdef" + @test el["id"] == "dt-dog" + @test el["term"] == "dog" + @test value(el[1]) == "A dog." 
+ end + + @testset "self-closing tag (spec example)" begin + # <IMG align="left" src="http://www.w3.org/Icons/WWW/w3c_home"/> + doc = parse("""<IMG align="left" src="http://www.w3.org/Icons/WWW/w3c_home"/>""", Node) + el = doc[1] + @test tag(el) == "IMG" + @test el["align"] == "left" + @test el["src"] == "http://www.w3.org/Icons/WWW/w3c_home" + @test length(children(el)) == 0 + end + + @testset "simple self-closing tag" begin + doc = parse("<br/>", Node) + @test tag(doc[1]) == "br" + @test length(children(doc[1])) == 0 + end + + @testset "self-closing tag with space before />" begin + doc = parse("<br />", Node) + @test tag(doc[1]) == "br" + end + + @testset "empty element with start and end tag" begin + doc = parse("<empty></empty>", Node) + el = doc[1] + @test tag(el) == "empty" + @test isnothing(el.children) + end + + @testset "nested elements" begin + doc = parse("<a><b><c/></b></a>", Node) + @test tag(doc[1]) == "a" + @test tag(doc[1][1]) == "b" + @test tag(doc[1][1][1]) == "c" + end + + @testset "sibling elements" begin + doc = parse("<root><a/><b/><c/></root>", Node) + @test length(doc[1]) == 3 + @test tag(doc[1][1]) == "a" + @test tag(doc[1][2]) == "b" + @test tag(doc[1][3]) == "c" + end + + @testset "attributes with single quotes" begin + doc = parse("<x a='val'/>", Node) + @test doc[1]["a"] == "val" + end + + @testset "attributes with double quotes" begin + doc = parse("""<x a="val"/>""", Node) + @test doc[1]["a"] == "val" + end + + @testset "mixed quote styles in attributes" begin + doc = parse("""<x a="1" b='2'/>""", Node) + @test doc[1]["a"] == "1" + @test doc[1]["b"] == "2" + end + + @testset "attribute with > in value" begin + doc = parse("""<x a="1>2"/>""", Node) + @test doc[1]["a"] == "1>2" + end + + @testset "attribute with entity reference" begin + doc = parse("""<x a="a&b"/>""", Node) + @test doc[1]["a"] == "a&b" + end + + @testset "multiple attributes accessible via attributes()" begin + doc = parse("""<x first="1" second="2" third="3"/>""", 
Node) + attrs = attributes(doc[1]) + @test attrs isa Attributes + @test attrs["first"] == "1" + @test attrs["second"] == "2" + @test attrs["third"] == "3" + end + + @testset "whitespace around = in attributes" begin + doc = parse("""<x a = "1" />""", Node) + @test doc[1]["a"] == "1" + end +end + +#==============================================================================# +# XML 1.0 SPEC SECTION 4.1: Entity References # +#==============================================================================# +@testset "Spec 4.1: Character and Entity References" begin + @testset "predefined entity references in text" begin + doc = parse("<root><</root>", Node) + @test simple_value(doc[1]) == "<" + + doc = parse("<root>></root>", Node) + @test simple_value(doc[1]) == ">" + + doc = parse("<root>&</root>", Node) + @test simple_value(doc[1]) == "&" + + doc = parse("<root>'</root>", Node) + @test simple_value(doc[1]) == "'" + + doc = parse("<root>"</root>", Node) + @test simple_value(doc[1]) == "\"" + end + + @testset "predefined entities in attribute values" begin + doc = parse("""<x a="<>&'""/>""", Node) + @test doc[1]["a"] == "<>&'\"" + end + + @testset "multiple entity references in one text node" begin + doc = parse("<root><tag> & "value"</root>", Node) + @test simple_value(doc[1]) == "<tag> & \"value\"" + end +end + +#==============================================================================# +# NAMESPACES (Colon in Tag and Attribute Names) # +#==============================================================================# +@testset "Namespaces" begin + @testset "namespaced element" begin + doc = parse("""<ns:root xmlns:ns="http://example.com"><ns:child/></ns:root>""", Node) + @test tag(doc[1]) == "ns:root" + @test doc[1]["xmlns:ns"] == "http://example.com" + @test tag(doc[1][1]) == "ns:child" + end + + @testset "default namespace" begin + doc = parse("""<root xmlns="http://example.com"/>""", Node) + @test doc[1]["xmlns"] == "http://example.com" + end + + @testset 
"multiple namespace prefixes" begin
+        xml = """<root xmlns:a="http://a.com" xmlns:b="http://b.com"><a:x/><b:y/></root>"""
+        doc = parse(xml, Node)
+        # Prefixed tags are kept verbatim (no namespace expansion).
+        @test tag(doc[1][1]) == "a:x"
+        @test tag(doc[1][2]) == "b:y"
+    end
+end
+
+#==============================================================================#
+#                              NODE CONSTRUCTORS                               #
+#==============================================================================#
+# Direct construction of every node type via the exported constructors,
+# including the keyword-attribute forms and invalid-argument errors.
+@testset "Node Constructors" begin
+    @testset "Text" begin
+        t = Text("hello")
+        @test nodetype(t) == Text
+        @test value(t) == "hello"
+        # Text nodes carry no tag or attributes.
+        @test tag(t) === nothing
+        @test attributes(t) === nothing
+    end
+
+    @testset "Comment" begin
+        c = Comment(" a comment ")
+        @test nodetype(c) == Comment
+        # Surrounding whitespace in the comment body is preserved.
+        @test value(c) == " a comment "
+    end
+
+    @testset "CData" begin
+        cd = CData("raw <data>")
+        @test nodetype(cd) == CData
+        # CData content is stored raw, '<'/'>' included.
+        @test value(cd) == "raw <data>"
+    end
+
+    @testset "DTD" begin
+        d = DTD("html")
+        @test nodetype(d) == DTD
+        @test value(d) == "html"
+    end
+
+    @testset "Declaration" begin
+        decl = Declaration(; version="1.0", encoding="UTF-8")
+        @test nodetype(decl) == Declaration
+        @test decl["version"] == "1.0"
+        @test decl["encoding"] == "UTF-8"
+    end
+
+    @testset "Declaration with no attributes" begin
+        decl = Declaration()
+        @test nodetype(decl) == Declaration
+        @test attributes(decl) === nothing
+    end
+
+    @testset "ProcessingInstruction with content" begin
+        pi = ProcessingInstruction("target", "data here")
+        @test nodetype(pi) == ProcessingInstruction
+        # The PI target is stored as the tag; the data as the value.
+        @test tag(pi) == "target"
+        @test value(pi) == "data here"
+    end
+
+    @testset "ProcessingInstruction without content" begin
+        pi = ProcessingInstruction("target")
+        @test nodetype(pi) == ProcessingInstruction
+        @test tag(pi) == "target"
+        @test value(pi) === nothing
+    end
+
+    @testset "Element with tag only" begin
+        el = Element("div")
+        @test nodetype(el) == Element
+        @test tag(el) == "div"
+        @test length(children(el)) == 0
+    end
+
+    @testset "Element with children" begin
+        el = Element("div", Text("hello"), Element("span"))
+        @test length(el) == 2
+        @test nodetype(el[1]) == Text
+        @test nodetype(el[2]) == Element
+    end
+
+    @testset "Element with attributes" begin
+        el = Element("div"; class="main", id="content")
+        @test el["class"] == "main"
+        @test el["id"] == "content"
+    end
+
+    @testset "Element with children and attributes" begin
+        el = Element("a", "click here"; href="http://example.com")
+        @test tag(el) == "a"
+        @test el["href"] == "http://example.com"
+        @test value(el[1]) == "click here"
+    end
+
+    @testset "Element auto-converts non-Node children to Text" begin
+        # Non-Node positional args are stringified into Text children.
+        el = Element("p", 42)
+        @test nodetype(el[1]) == Text
+        @test value(el[1]) == "42"
+    end
+
+    @testset "Document" begin
+        doc = Document(
+            Declaration(; version="1.0"),
+            Element("root")
+        )
+        @test nodetype(doc) == Document
+        @test length(doc) == 2
+        @test nodetype(doc[1]) == Declaration
+        @test nodetype(doc[2]) == Element
+    end
+
+    @testset "Document with all node types" begin
+        doc = Document(
+            Declaration(; version="1.0"),
+            DTD("root"),
+            Comment("comment"),
+            ProcessingInstruction("pi", "data"),
+            Element("root", CData("cdata"), Text("text"))
+        )
+        @test map(nodetype, children(doc)) == [Declaration, DTD, Comment, ProcessingInstruction, Element]
+        @test length(doc[end]) == 2
+        @test nodetype(doc[end][1]) == CData
+        @test value(doc[end][1]) == "cdata"
+        @test nodetype(doc[end][2]) == Text
+        @test value(doc[end][2]) == "text"
+    end
+
+    @testset "invalid constructions" begin
+        @test_throws Exception Text("a", "b")                       # too many args
+        @test_throws Exception Comment("a"; x="1")                  # no attrs
+        @test_throws Exception CData("a"; x="1")                    # no attrs
+        @test_throws Exception DTD("a"; x="1")                      # no attrs
+        @test_throws Exception Element()                            # need tag
+        @test_throws Exception Declaration("bad")                   # no positional args
+        @test_throws Exception Document(; x="1")                    # no attrs
+        @test_throws Exception ProcessingInstruction()              # need target
+        @test_throws Exception ProcessingInstruction("a", "b", "c") # too many args
+    end
+end
+
+#==============================================================================#
+#                                h CONSTRUCTOR                                 #
+#==============================================================================#
+# The `h` builder: callable form h(tag, children...; attrs...) and the
+# property form h.tag(...) (tag taken from the property name).
+@testset "h constructor" begin
+    @testset "h(tag)" begin
+        el = h("div")
+        @test nodetype(el) == Element
+        @test tag(el) == "div"
+    end
+
+    @testset "h(tag, children...)" begin
+        el = h("div", "hello")
+        @test simple_value(el) == "hello"
+    end
+
+    @testset "h(tag; attrs...)" begin
+        el = h("div"; class="main")
+        @test el["class"] == "main"
+    end
+
+    @testset "h(tag, children...; attrs...)" begin
+        el = h("div", "hello"; class="main")
+        @test el["class"] == "main"
+        @test value(el[1]) == "hello"
+    end
+
+    @testset "h.tag syntax" begin
+        el = h.div("hello"; class="main")
+        @test tag(el) == "div"
+        @test el["class"] == "main"
+        @test value(el[1]) == "hello"
+    end
+
+    @testset "h.tag with no args" begin
+        el = h.br()
+        @test tag(el) == "br"
+        @test length(children(el)) == 0
+    end
+
+    @testset "h.tag with only attrs" begin
+        el = h.img(; src="image.png")
+        @test tag(el) == "img"
+        @test el["src"] == "image.png"
+    end
+
+    @testset "nested h constructors" begin
+        el = h.div(
+            h.h1("Title"),
+            h.p("Paragraph")
+        )
+        @test tag(el) == "div"
+        @test length(el) == 2
+        @test tag(el[1]) == "h1"
+        @test tag(el[2]) == "p"
+    end
+
+    @testset "h with symbol tag" begin
+        el = h(:div)
+        @test tag(el) == "div"
+    end
+end
+
+#==============================================================================#
+#                                NODE INTERFACE                                #
+#==============================================================================#
+# Accessor functions (nodetype/tag/attributes/value/children) and the
+# simple-value helpers on a small parsed document.
+@testset "Node Interface" begin
+    doc = parse("""<?xml version="1.0"?><root attr="val"><child>text</child></root>""", Node)
+
+    @testset "nodetype" begin
+        @test nodetype(doc) == Document
+        @test nodetype(doc[1]) == Declaration
+        @test nodetype(doc[2]) == Element
+    end
+
+    @testset "tag" begin
+        @test tag(doc) === nothing
+        @test tag(doc[2]) == "root"
+        @test tag(doc[2][1]) == "child"
+    end
+
+    @testset "attributes" begin
+        @test attributes(doc) === nothing
+        @test attributes(doc[2])["attr"] == "val"
+    end
+
+    @testset "value" begin
+        @test value(doc) === nothing
+        @test value(doc[2][1][1]) == "text"
+    end
+
+    @testset "children" begin
+        @test length(children(doc)) == 2
+        @test length(children(doc[2])) == 1
+    end
+
+    @testset "is_simple" begin
+        # "simple" == element whose only child is a single Text/CData node.
+        @test is_simple(doc[2][1]) == true
+        @test is_simple(doc[2]) == false
+    end
+
+    @testset "simple_value" begin
+        @test simple_value(doc[2][1]) == "text"
+        @test_throws ErrorException simple_value(doc[2])
+    end
+
+    @testset "simple_value for CData child" begin
+        el = Element("x", CData("data"))
+        @test is_simple(el)
+        @test simple_value(el) == "data"
+    end
+end
+
+#==============================================================================#
+#                                NODE INDEXING                                 #
+#==============================================================================#
+# Integer/colon/end indexing for children and string indexing for attributes.
+@testset "Node Indexing" begin
+    doc = parse("<root><a/><b/><c/></root>", Node)
+    root = doc[1]
+
+    @testset "integer indexing" begin
+        @test tag(root[1]) == "a"
+        @test tag(root[2]) == "b"
+        @test tag(root[3]) == "c"
+    end
+
+    @testset "colon indexing" begin
+        all = root[:]
+        @test length(all) == 3
+    end
+
+    @testset "lastindex" begin
+        @test tag(root[end]) == "c"
+    end
+
+    @testset "only" begin
+        single = parse("<root><only/></root>", Node)
+        @test tag(only(single[1])) == "only"
+    end
+
+    @testset "length" begin
+        @test length(root) == 3
+    end
+
+    @testset "attribute indexing" begin
+        el = parse("""<x a="1" b="2"/>""", Node)[1]
+        @test el["a"] == "1"
+        @test el["b"] == "2"
+        @test_throws KeyError el["nonexistent"]
+    end
+
+    @testset "haskey" begin
+        el = parse("""<x a="1"/>""", Node)[1]
+        @test haskey(el, "a") == true
+        @test haskey(el, "b") == false
+    end
+
+    @testset "keys" begin
+        # Attribute order is preserved as written in the document.
+        el = parse("""<x a="1" b="2"/>""", Node)[1]
+        @test collect(keys(el)) == ["a", "b"]
+    end
+
+    @testset "keys on element with no attributes" begin
+        el = parse("<x/>", Node)[1]
+        @test isempty(keys(el))
+    end
+end
+
+#==============================================================================#
+#                                NODE MUTATION                                 #
+#==============================================================================#
+# setindex!/push!/pushfirst! on children and attributes; non-Node values are
+# auto-converted to Text, matching the Element constructor behavior above.
+@testset "Node Mutation" begin
+    @testset "setindex! child" begin
+        el = Element("root", Element("old"))
+        el[1] = Element("new")
+        @test tag(el[1]) == "new"
+    end
+
+    @testset "setindex! child with auto-conversion" begin
+        el = Element("root", Text("old"))
+        el[1] = "new text"
+        @test value(el[1]) == "new text"
+    end
+
+    @testset "setindex! attribute" begin
+        el = Element("root"; a="1")
+        el["a"] = "2"
+        @test el["a"] == "2"
+    end
+
+    @testset "setindex! new attribute" begin
+        el = Element("root"; a="1")
+        el["b"] = "2"
+        @test el["b"] == "2"
+    end
+
+    @testset "push! child" begin
+        el = Element("root")
+        push!(el, Element("child"))
+        @test length(el) == 1
+        @test tag(el[1]) == "child"
+    end
+
+    @testset "push! with auto-conversion" begin
+        el = Element("root")
+        push!(el, "text")
+        @test nodetype(el[1]) == Text
+        @test value(el[1]) == "text"
+    end
+
+    @testset "pushfirst! child" begin
+        el = Element("root", Element("second"))
+        pushfirst!(el, Element("first"))
+        @test tag(el[1]) == "first"
+        @test tag(el[2]) == "second"
+    end
+
+    @testset "push! 
on non-container node errors" begin
+        t = Text("hello")
+        @test_throws ErrorException push!(t, "more")
+    end
+end
+
+#==============================================================================#
+#                                NODE EQUALITY                                 #
+#==============================================================================#
+# Structural equality: two nodes compare equal iff type, tag, attributes,
+# value, and children all match.
+@testset "Node Equality" begin
+    @testset "identical elements are equal" begin
+        a = Element("div", Text("hello"); class="main")
+        b = Element("div", Text("hello"); class="main")
+        @test a == b
+    end
+
+    @testset "different tag names are not equal" begin
+        @test Element("a") != Element("b")
+    end
+
+    @testset "different attributes are not equal" begin
+        @test Element("a"; x="1") != Element("a"; x="2")
+    end
+
+    @testset "different children are not equal" begin
+        @test Element("a", Text("x")) != Element("a", Text("y"))
+    end
+
+    @testset "different node types are not equal" begin
+        @test Text("x") != Comment("x")
+    end
+
+    @testset "empty attributes vs nothing" begin
+        a = Element("a")
+        b = Element("a")
+        @test a == b
+    end
+
+    @testset "parse equality" begin
+        xml = "<root><child>text</child></root>"
+        @test parse(xml, Node) == parse(xml, Node)
+    end
+end
+
+#==============================================================================#
+#                                 XML WRITING                                  #
+#==============================================================================#
+# XML.write serialization: self-closing elements, inlined single-text
+# children, indentation of multi-child elements, and entity escaping.
+# NOTE(review): the expected strings in the three escaping tests below were
+# corrupted by HTML-entity unescaping in transit (the testset names say
+# "escapes" but the expectations contained raw &, <, >, "). Restored to the
+# escaped forms &amp; &lt; &gt; &quot; per XML 1.0 §2.4 — confirm against the
+# writer's actual output.
+@testset "XML Writing" begin
+    @testset "write Text" begin
+        el = Element("p", "hello & goodbye")
+        # '&' in text content must be written as &amp;.
+        @test XML.write(el) == "<p>hello &amp; goodbye</p>"
+    end
+
+    @testset "write Element with attributes" begin
+        el = Element("div"; class="main", id="content")
+        s = XML.write(el)
+        @test contains(s, "<div")
+        @test contains(s, "class=\"main\"")
+        @test contains(s, "id=\"content\"")
+        @test contains(s, "/>")
+    end
+
+    @testset "write self-closing element" begin
+        @test XML.write(Element("br")) == "<br/>"
+    end
+
+    @testset "write element with single text child (inline)" begin
+        @test XML.write(Element("p", "hello")) == "<p>hello</p>"
+    end
+
+    @testset "write element with multiple children (indented)" begin
+        el = Element("div", Element("a"), Element("b"))
+        s = XML.write(el)
+        @test contains(s, "<div>")
+        # Single-space needles match any indent width >= 1.
+        @test contains(s, " <a/>")
+        @test contains(s, " <b/>")
+        @test contains(s, "</div>")
+    end
+
+    @testset "write Comment" begin
+        el = Element("root", Comment(" comment "))
+        @test contains(XML.write(el), "<!-- comment -->")
+    end
+
+    @testset "write CData" begin
+        # CData content is emitted raw — no escaping inside the section.
+        el = Element("root", CData("raw <data>"))
+        @test contains(XML.write(el), "<![CDATA[raw <data>]]>")
+    end
+
+    @testset "write ProcessingInstruction with content" begin
+        pi = ProcessingInstruction("target", "data")
+        @test XML.write(pi) == "<?target data?>"
+    end
+
+    @testset "write ProcessingInstruction without content" begin
+        pi = ProcessingInstruction("target")
+        @test XML.write(pi) == "<?target?>"
+    end
+
+    @testset "write Declaration" begin
+        decl = Declaration(; version="1.0", encoding="UTF-8")
+        s = XML.write(decl)
+        @test contains(s, "<?xml")
+        @test contains(s, "version=\"1.0\"")
+        @test contains(s, "encoding=\"UTF-8\"")
+        @test contains(s, "?>")
+    end
+
+    @testset "write DTD" begin
+        dtd = DTD("html")
+        @test XML.write(dtd) == "<!DOCTYPE html>"
+    end
+
+    @testset "write Document" begin
+        doc = Document(Declaration(; version="1.0"), Element("root"))
+        s = XML.write(doc)
+        @test startswith(s, "<?xml")
+        @test contains(s, "<root/>")
+    end
+
+    @testset "write escapes special characters in text" begin
+        el = Element("p", "a < b & c > d")
+        @test XML.write(el) == "<p>a &lt; b &amp; c &gt; d</p>"
+    end
+
+    @testset "write escapes special characters in attribute values" begin
+        el = Element("x"; a="a\"b")
+        # A literal '"' inside an attribute value must be written as &quot;.
+        @test contains(XML.write(el), "a=\"a&quot;b\"")
+    end
+
+    @testset "indentsize parameter" begin
+        el = Element("root", Element("child"))
+        s2 = XML.write(el; indentsize=2)
+        s4 = XML.write(el; indentsize=4)
+        # 2- vs 4-space needles distinguish the two indent widths
+        # (the original space runs were squashed in transit; reconstructed).
+        @test contains(s2, "  <child/>")
+        @test contains(s4, "    <child/>")
+    end
+
+    @testset "write xml:space='preserve' respects whitespace" begin
+        el 
= Element("root", Element("p", Text(" hello "); var"xml:space"="preserve"))
+        s = XML.write(el)
+        # The preserved element must not gain indentation around its text.
+        @test contains(s, "> hello </p>")
+    end
+end
+
+#==============================================================================#
+#                       WRITE TO FILE / READ FROM FILE                         #
+#==============================================================================#
+@testset "File I/O" begin
+    @testset "write and read back" begin
+        doc = Document(
+            Declaration(; version="1.0"),
+            Element("root", Element("child", "text"))
+        )
+        temp = tempname() * ".xml"
+        XML.write(temp, doc)
+        content = read(temp, String)
+        @test contains(content, "<?xml")
+        @test contains(content, "<root>")
+        @test contains(content, "<child>text</child>")
+        doc2 = read(temp, Node)
+        @test nodetype(doc2) == Document
+        # Find the root element
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        child = first(filter(x -> nodetype(x) == Element, children(root)))
+        @test tag(child) == "child"
+        @test simple_value(child) == "text"
+        rm(temp)
+    end
+
+    @testset "read from IO" begin
+        xml = """<?xml version="1.0"?><root>hello</root>"""
+        doc = read(IOBuffer(xml), Node)
+        @test nodetype(doc) == Document
+        root = first(filter(x -> nodetype(x) == Element, children(doc)))
+        @test simple_value(root) == "hello"
+    end
+end
+
+#==============================================================================#
+#                        PARSE → WRITE → PARSE ROUNDTRIP                       #
+#==============================================================================#
+# Semantic (not byte-for-byte) preservation: parse, re-serialize, re-parse,
+# then compare structure while ignoring writer-introduced whitespace Text.
+@testset "Roundtrip: parse → write preserves semantics" begin
+    @testset "declaration and root" begin
+        xml = """<?xml version="1.0"?><root/>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        decls = filter(x -> nodetype(x) == Declaration, children(doc2))
+        @test length(decls) == 1
+        @test decls[1]["version"] == "1.0"
+        els = filter(x -> nodetype(x) == Element, children(doc2))
+        @test length(els) == 1
+        @test tag(els[1]) == "root"
+    end
+
+    @testset "element with attributes and text" begin
+        xml = """<root><child attr="val">text</child></root>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        child = first(filter(x -> nodetype(x) == Element, children(root)))
+        @test tag(child) == "child"
+        @test child["attr"] == "val"
+        text_children = filter(x -> nodetype(x) == Text, children(child))
+        @test any(t -> value(t) == "text", text_children)
+    end
+
+    @testset "all special node types survive roundtrip" begin
+        xml = """<root><!-- comment --><![CDATA[data]]><?pi content?></root>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        types = map(nodetype, filter(x -> nodetype(x) != Text, children(root)))
+        @test Comment in types
+        @test CData in types
+        @test ProcessingInstruction in types
+    end
+
+    @testset "DOCTYPE survives roundtrip" begin
+        xml = """<!DOCTYPE html><html><body/></html>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        dtds = filter(x -> nodetype(x) == DTD, children(doc2))
+        @test length(dtds) == 1
+        @test value(dtds[1]) == "html"
+    end
+
+    @testset "namespace attributes survive roundtrip" begin
+        xml = """<root xmlns:ns="http://example.com"><ns:child/></root>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        @test root["xmlns:ns"] == "http://example.com"
+        child = first(filter(x -> nodetype(x) == Element, children(root)))
+        @test tag(child) == "ns:child"
+    end
+
+    @testset "mixed content survives roundtrip" begin
+        xml = """<p>Hello <b>world</b>!</p>"""
+        doc = parse(xml, Node)
+        s = XML.write(doc)
+        doc2 = parse(s, Node)
+        root = first(filter(x -> nodetype(x) == Element, children(doc2)))
+        # Drop whitespace-only Text nodes the writer may have introduced.
+        non_ws = filter(x -> !(nodetype(x) == Text && isempty(strip(value(x)))), children(root))
+        texts = [value(x) for x in non_ws if 
nodetype(x) == Text]
+        @test any(t -> contains(t, "Hello"), texts)
+        @test any(t -> contains(t, "!"), texts)
+        bolds = filter(x -> nodetype(x) == Element && tag(x) == "b", non_ws)
+        @test length(bolds) == 1
+        @test simple_value(bolds[1]) == "world"
+    end
+end
+
+# Roundtrip each bundled data file that exists on disk; only structural
+# properties (node type, element count) are compared, not exact bytes.
+@testset "Roundtrip: file-based semantic preservation" begin
+    all_files = filter(isfile, [
+        joinpath(@__DIR__, "data", "xml.xsd"),
+        joinpath(@__DIR__, "data", "kml.xsd"),
+        joinpath(@__DIR__, "data", "books.xml"),
+        # example.kml uses invalid <![CData[...]]> (lowercase), skip roundtrip
+        joinpath(@__DIR__, "data", "simple_dtd.xml"),
+        joinpath(@__DIR__, "data", "preserve.xml"),
+    ])
+
+    for path in all_files
+        node = read(path, Node)
+        temp = tempname() * ".xml"
+        XML.write(temp, node)
+        node2 = read(temp, Node)
+        # Verify structural properties are preserved
+        @test nodetype(node) == nodetype(node2)
+        # Count non-whitespace elements
+        count_elements(n) = sum(1 for c in children(n) if nodetype(c) == Element; init=0)
+        @test count_elements(node) == count_elements(node2)
+        rm(temp)
+    end
+end
+
+#==============================================================================#
+#                         PARSE Node{SubString{String}}                        #
+#==============================================================================#
+@testset "Parse with SubString{String}" begin
+    xml = """<?xml version="1.0"?><root attr="val"><child>text</child></root>"""
+    doc = parse(xml, Node{SubString{String}})
+    @test nodetype(doc) == Document
+    @test tag(doc[2]) == "root"
+    @test doc[2]["attr"] == "val"
+    # SubString values
+    @test value(doc[2][1][1]) isa SubString{String}
+end
+
+#==============================================================================#
+#                           COMPLEX DOCUMENT PARSING                           #
+#==============================================================================#
+# Real-world fixture files. Each testset bails out early (`return` — legal
+# because @testset wraps its body in a function) when the fixture is absent.
+@testset "Complex Document Parsing" begin
+    @testset "books.xml" begin
+        path = joinpath(@__DIR__, "data", "books.xml")
+        isfile(path) || return
+        doc = read(path, Node)
+        @test nodetype(doc) == Document
+
+        # Should have declaration + catalog
+        decl_nodes = filter(x -> nodetype(x) == Declaration, children(doc))
+        @test length(decl_nodes) == 1
+        @test decl_nodes[1]["version"] == "1.0"
+
+        el_nodes = filter(x -> nodetype(x) == Element, children(doc))
+        @test length(el_nodes) == 1
+        catalog = el_nodes[1]
+        @test tag(catalog) == "catalog"
+
+        # Catalog has 12 books
+        books = filter(x -> nodetype(x) == Element, children(catalog))
+        @test length(books) == 12
+
+        # First book
+        book1 = books[1]
+        @test book1["id"] == "bk101"
+
+        # Each book has: author, title, genre, price, publish_date, description
+        book_children = filter(x -> nodetype(x) == Element, children(book1))
+        book_tags = map(tag, book_children)
+        @test "author" in book_tags
+        @test "title" in book_tags
+        @test "genre" in book_tags
+        @test "price" in book_tags
+        @test "publish_date" in book_tags
+        @test "description" in book_tags
+
+        author = first(filter(x -> tag(x) == "author", book_children))
+        @test simple_value(author) == "Gambardella, Matthew"
+    end
+
+    @testset "simple_dtd.xml" begin
+        path = joinpath(@__DIR__, "data", "simple_dtd.xml")
+        isfile(path) || return
+        doc = read(path, Node)
+        @test nodetype(doc) == Document
+
+        dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc))
+        @test length(dtd_nodes) == 1
+        @test contains(value(dtd_nodes[1]), "ENTITY")
+    end
+
+    @testset "preserve.xml" begin
+        path = joinpath(@__DIR__, "data", "preserve.xml")
+        isfile(path) || return
+        doc = read(path, Node)
+        @test nodetype(doc) == Document
+
+        root = filter(x -> nodetype(x) == Element, children(doc))[1]
+        @test tag(root) == "root"
+        @test root["xml:space"] == "preserve"
+
+        child_els = filter(x -> nodetype(x) == Element, children(root))
+        @test length(child_els) == 1
+        @test tag(child_els[1]) == "child"
+        @test child_els[1]["xml:space"] == "default"
+    end
+
+    @testset "example.kml" begin
+        # example.kml uses invalid <![CData[...]]> (lowercase 'd') which is not valid XML
+        path = joinpath(@__DIR__, "data", "example.kml")
+        isfile(path) || return
+        @test_throws ArgumentError read(path, Node)
+    end
+
+    @testset "tv.dtd" begin
+        path = joinpath(@__DIR__, "data", "tv.dtd")
+        isfile(path) || return
+        dtd_text = read(path, String)
+        # The .dtd fixture is an internal subset only; wrap it in a DOCTYPE
+        # body so parse_dtd sees "<root> [ ... ]".
+        pd = parse_dtd("TVSCHEDULE [\n" * dtd_text * "\n]")
+        @test pd.root == "TVSCHEDULE"
+
+        @test length(pd.elements) == 10
+        elem_names = map(e -> e.name, pd.elements)
+        @test "TVSCHEDULE" in elem_names
+        @test "CHANNEL" in elem_names
+        @test "PROGRAMSLOT" in elem_names
+        @test "TITLE" in elem_names
+
+        @test length(pd.attributes) == 5
+        attr_elements = map(a -> a.element, pd.attributes)
+        @test "TVSCHEDULE" in attr_elements
+        @test "CHANNEL" in attr_elements
+        @test "TITLE" in attr_elements
+    end
+end
+
+#==============================================================================#
+#                           DTD PARSING (parse_dtd)                            #
+#==============================================================================#
+@testset "DTD Parsing (parse_dtd)" begin
+    @testset "simple DTD with entities" begin
+        path = joinpath(@__DIR__, "data", "simple_dtd.xml")
+        isfile(path) || return
+        doc = read(path, Node)
+        dtd_node = first(filter(x -> nodetype(x) == DTD, children(doc)))
+        pd = parse_dtd(dtd_node)
+        @test pd.root == "note"
+        @test length(pd.entities) == 3
+        @test pd.entities[1].name == "nbsp"
+        @test pd.entities[2].name == "writer"
+        @test pd.entities[3].name == "copyright"
+        @test pd.entities[2].value == "Writer: Donald Duck."
+    end
+
+    @testset "DTD with SYSTEM external ID" begin
+        pd = parse_dtd("""root SYSTEM "root.dtd\"""")
+        @test pd.root == "root"
+        @test pd.system_id == "root.dtd"
+        @test pd.public_id === nothing
+    end
+
+    @testset "DTD with PUBLIC external ID" begin
+        # PUBLIC takes two literals: public identifier, then system identifier.
+        pd = parse_dtd("""root PUBLIC "-//W3C//DTD XHTML 1.0//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"""")
+        @test pd.root == "root"
+        @test pd.public_id == "-//W3C//DTD XHTML 1.0//EN"
+        @test pd.system_id == "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
+    end
+
+    @testset "DTD with ELEMENT declarations" begin
+        pd = parse_dtd("""root [
+<!ELEMENT root (child)>
+<!ELEMENT child (#PCDATA)>
+<!ELEMENT empty EMPTY>
+<!ELEMENT any ANY>
+]""")
+        @test pd.root == "root"
+        @test length(pd.elements) == 4
+        # Content models are kept as raw strings, not parsed structures.
+        @test pd.elements[1].name == "root"
+        @test pd.elements[1].content == "(child)"
+        @test pd.elements[2].name == "child"
+        @test pd.elements[2].content == "(#PCDATA)"
+        @test pd.elements[3].name == "empty"
+        @test pd.elements[3].content == "EMPTY"
+        @test pd.elements[4].name == "any"
+        @test pd.elements[4].content == "ANY"
+    end
+
+    @testset "DTD with ATTLIST declarations (spec examples)" begin
+        pd = parse_dtd("""root [
+<!ATTLIST termdef id ID #REQUIRED name CDATA #IMPLIED>
+<!ATTLIST list type (bullets|ordered|glossary) "ordered">
+<!ATTLIST form method CDATA #FIXED "POST">
+]""")
+        # One ATTLIST may declare several attributes (termdef declares two).
+        @test length(pd.attributes) == 4
+        @test pd.attributes[1].element == "termdef"
+        @test pd.attributes[1].name == "id"
+        @test pd.attributes[1].type == "ID"
+        @test pd.attributes[1].default == "#REQUIRED"
+        @test pd.attributes[2].name == "name"
+        @test pd.attributes[2].type == "CDATA"
+        @test pd.attributes[2].default == "#IMPLIED"
+        @test pd.attributes[3].element == "list"
+        @test pd.attributes[3].name == "type"
+        # Defaults keep their surrounding quotes verbatim.
+        @test pd.attributes[3].default == "\"ordered\""
+        @test pd.attributes[4].element == "form"
+        @test pd.attributes[4].name == "method"
+        @test pd.attributes[4].default == "#FIXED \"POST\""
+    end
+
+    @testset "DTD with ENTITY declarations (spec examples)" begin
+        pd = parse_dtd("""root [
+<!ENTITY Pub-Status "This is a pre-release of the specification.">
+<!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">
+<!ENTITY open-hatch2 PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">
+<!ENTITY % YN '"Yes"'>
+]""")
+        @test length(pd.entities) == 4
+        @test pd.entities[1].name == "Pub-Status"
+        @test pd.entities[1].value == "This is a pre-release of the specification."
+        @test pd.entities[1].parameter == false
+
+        # External entities carry an external_id instead of a value.
+        @test pd.entities[2].name == "open-hatch"
+        @test pd.entities[2].value === nothing
+        @test contains(pd.entities[2].external_id, "SYSTEM")
+
+        @test pd.entities[3].name == "open-hatch2"
+        @test contains(pd.entities[3].external_id, "PUBLIC")
+
+        # '%' marks a parameter entity.
+        @test pd.entities[4].name == "YN"
+        @test pd.entities[4].parameter == true
+    end
+
+    @testset "DTD with NOTATION declarations (spec example)" begin
+        pd = parse_dtd("""root [
+<!NOTATION vrml PUBLIC "VRML 1.0">
+<!NOTATION jpeg SYSTEM "image/jpeg">
+]""")
+        @test length(pd.notations) == 2
+        @test pd.notations[1].name == "vrml"
+        @test contains(pd.notations[1].external_id, "PUBLIC")
+        @test pd.notations[2].name == "jpeg"
+        @test contains(pd.notations[2].external_id, "SYSTEM")
+    end
+
+    @testset "parse_dtd from Node" begin
+        dtd = DTD("root [<!ELEMENT root (#PCDATA)>]")
+        pd = parse_dtd(dtd)
+        @test pd.root == "root"
+        @test length(pd.elements) == 1
+    end
+
+    @testset "parse_dtd errors on non-DTD node" begin
+        @test_throws ErrorException parse_dtd(Element("x"))
+    end
+
+    @testset "complex DTD file (structure test)" begin
+        # complex_dtd.xml uses parameter entity references (%text;) which parse_dtd
+        # does not expand, so we just verify parsing the XML document itself works
+        path = joinpath(@__DIR__, "data", "complex_dtd.xml")
+        isfile(path) || return
+        doc = read(path, Node)
+        dtd_node = first(filter(x -> nodetype(x) == DTD, children(doc)))
+        @test nodetype(dtd_node) == DTD
+        @test contains(value(dtd_node), "test")
+        @test contains(value(dtd_node), "ELEMENT")
+        @test contains(value(dtd_node), "ATTLIST")
+        @test contains(value(dtd_node), "NOTATION")
+        @test contains(value(dtd_node), "ENTITY")
+    end
+end
+
+#==============================================================================#
+#             XML 1.0 SPEC: ELEMENT TYPE DECLARATIONS (Section 3.2)            #
+#==============================================================================#
+# Content-model forms from XML 1.0 §3.2: EMPTY, ANY, mixed, sequence, choice.
+@testset "Spec 3.2: Element Type Declarations" begin
+    @testset "EMPTY content model" begin
+        pd = parse_dtd("root [<!ELEMENT br EMPTY>]")
+        @test pd.elements[1].content == "EMPTY"
+    end
+
+    @testset "ANY content model" begin
+        pd = parse_dtd("root [<!ELEMENT container ANY>]")
+        @test pd.elements[1].content == "ANY"
+    end
+
+    @testset "#PCDATA content model" begin
+        pd = parse_dtd("root [<!ELEMENT text (#PCDATA)>]")
+        @test pd.elements[1].content == "(#PCDATA)"
+    end
+
+    @testset "mixed content model" begin
+        pd = parse_dtd("root [<!ELEMENT p (#PCDATA|emph)*>]")
+        @test pd.elements[1].content == "(#PCDATA|emph)*"
+    end
+
+    @testset "sequence content model" begin
+        pd = parse_dtd("root [<!ELEMENT spec (front, body, back?)>]")
+        @test pd.elements[1].content == "(front, body, back?)"
+    end
+
+    @testset "choice content model" begin
+        pd = parse_dtd("root [<!ELEMENT div1 (head, (p | list | note)*, div2*)>]")
+        @test pd.elements[1].content == "(head, (p | list | note)*, div2*)"
+    end
+end
+
+#==============================================================================#
+#            XML 1.0 SPEC: ATTRIBUTE-LIST DECLARATIONS (Section 3.3)           #
+#==============================================================================#
+@testset "Spec 3.3: Attribute-List Declarations" begin
+    @testset "ID attribute" begin
+        pd = parse_dtd("root [<!ATTLIST el id ID #REQUIRED>]")
+        @test pd.attributes[1].type == "ID"
+        @test pd.attributes[1].default == "#REQUIRED"
+    end
+
+    @testset "CDATA attribute with default" 
begin + pd = parse_dtd("""root [<!ATTLIST el name CDATA "default">]""") + @test pd.attributes[1].type == "CDATA" + @test pd.attributes[1].default == "\"default\"" + end + + @testset "enumerated attribute" begin + pd = parse_dtd("""root [<!ATTLIST list type (bullets|ordered|glossary) "ordered">]""") + @test contains(pd.attributes[1].type, "bullets") + @test pd.attributes[1].default == "\"ordered\"" + end + + @testset "#IMPLIED attribute" begin + pd = parse_dtd("root [<!ATTLIST el opt CDATA #IMPLIED>]") + @test pd.attributes[1].default == "#IMPLIED" + end + + @testset "#FIXED attribute" begin + pd = parse_dtd("""root [<!ATTLIST el method CDATA #FIXED "POST">]""") + @test pd.attributes[1].default == "#FIXED \"POST\"" + end + + @testset "NOTATION attribute type" begin + pd = parse_dtd("root [<!ATTLIST fig notation NOTATION (jpeg|png) #IMPLIED>]") + @test contains(pd.attributes[1].type, "NOTATION") + end + + @testset "multiple attributes in one ATTLIST" begin + pd = parse_dtd("""root [<!ATTLIST book + id ID #REQUIRED + isbn CDATA #IMPLIED + format (hardcover|paperback|ebook) "paperback">]""") + @test length(pd.attributes) == 3 + @test pd.attributes[1].name == "id" + @test pd.attributes[2].name == "isbn" + @test pd.attributes[3].name == "format" + end +end + +#==============================================================================# +# XML 1.0 SPEC: ENTITY DECLARATIONS (Section 4.2) # +#==============================================================================# +@testset "Spec 4.2: Entity Declarations" begin + @testset "internal general entity (spec example)" begin + pd = parse_dtd("""root [<!ENTITY Pub-Status "This is a pre-release of the specification.">]""") + @test pd.entities[1].name == "Pub-Status" + @test pd.entities[1].value == "This is a pre-release of the specification." 
+ @test pd.entities[1].external_id === nothing + @test pd.entities[1].parameter == false + end + + @testset "external entity with SYSTEM (spec example)" begin + pd = parse_dtd("""root [<!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml">]""") + @test pd.entities[1].name == "open-hatch" + @test pd.entities[1].value === nothing + @test contains(pd.entities[1].external_id, "SYSTEM") + @test contains(pd.entities[1].external_id, "http://www.textuality.com/boilerplate/OpenHatch.xml") + end + + @testset "external entity with PUBLIC (spec example)" begin + pd = parse_dtd("""root [<!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml">]""") + @test pd.entities[1].name == "open-hatch" + @test contains(pd.entities[1].external_id, "PUBLIC") + end + + @testset "parameter entity" begin + pd = parse_dtd("""root [<!ENTITY % YN '"Yes"'>]""") + @test pd.entities[1].name == "YN" + @test pd.entities[1].parameter == true + end +end + +#==============================================================================# +# XML 1.0 SPEC: NOTATION DECLARATIONS (Section 4.7) # +#==============================================================================# +@testset "Spec 4.7: Notation Declarations" begin + @testset "NOTATION with PUBLIC (spec example)" begin + pd = parse_dtd("""root [<!NOTATION vrml PUBLIC "VRML 1.0">]""") + @test pd.notations[1].name == "vrml" + @test contains(pd.notations[1].external_id, "PUBLIC") + @test contains(pd.notations[1].external_id, "VRML 1.0") + end + + @testset "NOTATION with SYSTEM" begin + pd = parse_dtd("""root [<!NOTATION jpeg SYSTEM "image/jpeg">]""") + @test pd.notations[1].name == "jpeg" + @test contains(pd.notations[1].external_id, "SYSTEM") + end +end + +#==============================================================================# +# ERROR HANDLING # +#==============================================================================# 
+@testset "Error Handling" begin + @testset "mismatched tags" begin + @test_throws ErrorException parse("<a></b>", Node) + end + + @testset "unclosed tag" begin + @test_throws ErrorException parse("<a><b></a>", Node) + end + + @testset "closing tag with no open tag" begin + @test_throws ErrorException parse("</a>", Node) + end + + @testset "unclosed root element" begin + @test_throws ErrorException parse("<root>", Node) + end + + @testset "unterminated comment" begin + @test_throws Exception parse("<root><!-- no end", Node) + end + + @testset "unterminated CDATA" begin + @test_throws Exception parse("<root><![CDATA[no end", Node) + end + + @testset "unterminated PI" begin + @test_throws Exception parse("<?pi no end", Node) + end + + @testset "unterminated attribute value" begin + @test_throws Exception parse("""<a b="no end""", Node) + end +end + +#==============================================================================# +# ILL-FORMED XML (must error) # +#==============================================================================# +@testset "Ill-Formed XML" begin + # ---- Tag structure ---- + @testset "mismatched close tag" begin + @test_throws Exception parse("<a></b>", Node) + end + + @testset "overlapping elements" begin + @test_throws Exception parse("<a><b></a></b>", Node) + end + + @testset "deeply mismatched nesting" begin + @test_throws Exception parse("<a><b><c></b></c></a>", Node) + end + + @testset "multiple unclosed tags" begin + @test_throws Exception parse("<a><b><c>", Node) + end + + @testset "close tag without open" begin + @test_throws Exception parse("</a>", Node) + end + + @testset "close tag after self-closing" begin + @test_throws Exception parse("<a/></a>", Node) + end + + @testset "nested close tag without open" begin + @test_throws Exception parse("<root></inner></root>", Node) + end + + # ---- Unterminated constructs ---- + @testset "unterminated open tag at EOF" begin + @test_throws Exception parse("<root><unclosed", Node) + end + 
+ @testset "unterminated attribute value (double quote)" begin + @test_throws Exception parse("""<a x="no end""", Node) + end + + @testset "unterminated attribute value (single quote)" begin + @test_throws Exception parse("<a x='no end", Node) + end + + @testset "unterminated comment" begin + @test_throws Exception parse("<!-- no end", Node) + end + + @testset "unterminated CDATA" begin + @test_throws Exception parse("<![CDATA[no end", Node) + end + + @testset "unterminated processing instruction" begin + @test_throws Exception parse("<?pi no end", Node) + end + + @testset "unterminated DOCTYPE" begin + @test_throws Exception parse("<!DOCTYPE x", Node) + end + + # ---- Attribute errors ---- + @testset "duplicate attribute on element" begin + @test_throws Exception parse("""<a x="1" x="2"/>""", Node) + end + + @testset "duplicate attribute (different values)" begin + @test_throws Exception parse("""<root attr="a" attr="b"></root>""", Node) + end + + @testset "duplicate attribute in declaration" begin + @test_throws Exception parse("""<?xml version="1.0" version="1.1"?><a/>""", Node) + end + + @testset "attribute without value" begin + @test_throws Exception parse("<a disabled/>", Node) + end + + @testset "attribute with unquoted value" begin + @test_throws Exception parse("<a x=hello/>", Node) + end + + # ---- Tokenizer-level errors ---- + @testset "lone <" begin + @test_throws Exception parse("<", Node) + end + + @testset "lone < in text content" begin + @test_throws Exception parse("<root>a < b</root>", Node) + end + + @testset "tag with space before name" begin + @test_throws Exception parse("< root/>", Node) + end +end + +#==============================================================================# +# UNICODE SUPPORT # +#==============================================================================# +@testset "Unicode Support" begin + @testset "Unicode in text content" begin + doc = parse("<root>caf\u00e9 \u00f1 \u65e5\u672c\u8a9e</root>", Node) + @test 
simple_value(doc[1]) == "caf\u00e9 \u00f1 \u65e5\u672c\u8a9e" + end + + @testset "Unicode in attribute values" begin + doc = parse("<root name=\"\u00fcber\"/>", Node) + @test doc[1]["name"] == "\u00fcber" + end + + @testset "Unicode in comments" begin + doc = parse("<root><!-- h\u00e9llo --></root>", Node) + @test value(doc[1][1]) == " h\u00e9llo " + end + + @testset "CJK characters" begin + doc = parse("<root>\u4e2d\u6587</root>", Node) + @test simple_value(doc[1]) == "\u4e2d\u6587" + end + + @testset "emoji in text" begin + doc = parse("<root>\U0001f600\U0001f680</root>", Node) + @test simple_value(doc[1]) == "\U0001f600\U0001f680" + end + + @testset "Cyrillic characters" begin + doc = parse("<root>\u041f\u0440\u0438\u0432\u0435\u0442</root>", Node) + @test simple_value(doc[1]) == "\u041f\u0440\u0438\u0432\u0435\u0442" + end + + @testset "Arabic characters" begin + doc = parse("<root>\u0645\u0631\u062d\u0628\u0627</root>", Node) + @test simple_value(doc[1]) == "\u0645\u0631\u062d\u0628\u0627" + end +end + +#==============================================================================# +# EDGE CASES # +#==============================================================================# +@testset "Edge Cases" begin + @testset "document with only whitespace around root" begin + doc = parse(" \n <root/>\n ", Node) + # Parser preserves whitespace as Text nodes + els = filter(x -> nodetype(x) == Element, children(doc)) + @test length(els) == 1 + @test tag(els[1]) == "root" + end + + @testset "deeply nested elements" begin + xml = "<a><b><c><d><e><f>deep</f></e></d></c></b></a>" + doc = parse(xml, Node) + @test simple_value(doc[1][1][1][1][1][1]) == "deep" + end + + @testset "many siblings" begin + items = join(["<item>$i</item>" for i in 1:100]) + xml = "<root>$items</root>" + doc = parse(xml, Node) + @test length(doc[1]) == 100 + @test simple_value(doc[1][1]) == "1" + @test simple_value(doc[1][100]) == "100" + end + + @testset "element with hyphens and dots in name" 
begin + doc = parse("<my-element.name/>", Node) + @test tag(doc[1]) == "my-element.name" + end + + @testset "element with underscore in name" begin + doc = parse("<_private/>", Node) + @test tag(doc[1]) == "_private" + end + + @testset "attribute with numeric value" begin + doc = parse("""<x count="42"/>""", Node) + @test doc[1]["count"] == "42" + end + + @testset "empty text content" begin + doc = parse("<root></root>", Node) + @test isnothing(doc[1].children) + end + + @testset "adjacent CDATA and text" begin + doc = parse("<root>text<![CDATA[cdata]]>more</root>", Node) + @test length(doc[1]) == 3 + @test value(doc[1][1]) == "text" + @test value(doc[1][2]) == "cdata" + @test value(doc[1][3]) == "more" + end + + @testset "multiple CDATA sections" begin + doc = parse("<root><![CDATA[a]]><![CDATA[b]]></root>", Node) + @test length(doc[1]) == 2 + @test value(doc[1][1]) == "a" + @test value(doc[1][2]) == "b" + end + + @testset "comment between elements" begin + doc = parse("<root><a/><!-- between --><b/></root>", Node) + @test length(doc[1]) == 3 + @test nodetype(doc[1][2]) == Comment + end + + @testset "PI between elements" begin + doc = parse("<root><a/><?pi data?><b/></root>", Node) + @test length(doc[1]) == 3 + @test nodetype(doc[1][2]) == ProcessingInstruction + end + + @testset "all node types in one document" begin + xml = """<?xml version="1.0"?> +<!DOCTYPE root SYSTEM "root.dtd"> +<!-- comment --> +<?pi data?> +<root> + text + <child attr="val"/> + <!-- inner comment --> + <![CDATA[cdata]]> + <?inner-pi inner data?> +</root>""" + doc = parse(xml, Node) + types = map(nodetype, children(doc)) + @test Declaration in types + @test DTD in types + @test Comment in types + @test ProcessingInstruction in types + @test Element in types + end + + @testset "very long attribute value" begin + long_val = repeat("a", 10000) + doc = parse("""<x attr="$(long_val)"/>""", Node) + @test doc[1]["attr"] == long_val + end + + @testset "very long text content" begin + long_text = 
repeat("hello ", 10000) + doc = parse("<root>$(long_text)</root>", Node) + @test simple_value(doc[1]) == long_text + end + + @testset "CDATA with ]] but not followed by >" begin + doc = parse("<root><![CDATA[a]]b]]></root>", Node) + @test value(doc[1][1]) == "a]]b" + end +end + +#==============================================================================# +# SPEC EXAMPLES: FULL DOCUMENTS # +#==============================================================================# +@testset "Full Spec-Like Documents" begin + @testset "spec section 2.1: minimal document" begin + xml = """<?xml version="1.0"?> +<greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + @test nodetype(doc) == Document + @test simple_value(doc[end]) == "Hello, world!" + end + + @testset "spec section 2.8: document with external DTD" begin + xml = """<?xml version="1.0"?> +<!DOCTYPE greeting SYSTEM "hello.dtd"> +<greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + # Filter out whitespace text nodes to check structure + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test length(typed) == 3 + @test nodetype(typed[1]) == Declaration + @test nodetype(typed[2]) == DTD + @test nodetype(typed[3]) == Element + end + + @testset "spec: document with internal subset" begin + xml = """<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE greeting [ + <!ELEMENT greeting (#PCDATA)> +]> +<greeting>Hello, world!</greeting>""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test typed[1]["encoding"] == "UTF-8" + @test nodetype(typed[2]) == DTD + pd = parse_dtd(typed[2]) + @test pd.root == "greeting" + @test length(pd.elements) == 1 + @test pd.elements[1].name == "greeting" + @test pd.elements[1].content == "(#PCDATA)" + @test simple_value(typed[3]) == "Hello, world!" + end + + @testset "typical HTML5-like doctype" begin + xml = """<!DOCTYPE html><html><head><title>Test

Content

""" + doc = parse(xml, Node) + @test nodetype(doc[1]) == DTD + @test value(doc[1]) == "html" + @test tag(doc[2]) == "html" + end + + @testset "SVG document" begin + xml = """ + + + Hello SVG +""" + doc = parse(xml, Node) + svg = doc[end] + @test tag(svg) == "svg" + @test svg["xmlns"] == "http://www.w3.org/2000/svg" + @test svg["width"] == "100" + + elements = filter(x -> nodetype(x) == Element, children(svg)) + @test length(elements) == 2 + @test tag(elements[1]) == "circle" + @test elements[1]["fill"] == "red" + @test tag(elements[2]) == "text" + @test value(elements[2][1]) == "Hello SVG" + end + + @testset "SOAP-like envelope" begin + xml = """ + + + + + IBM + + +""" + doc = parse(xml, Node) + env = doc[end] + @test tag(env) == "soap:Envelope" + elements = filter(x -> nodetype(x) == Element, children(env)) + @test tag(elements[1]) == "soap:Header" + @test tag(elements[2]) == "soap:Body" + end + + @testset "RSS-like feed" begin + xml = """ + + + Example Feed + http://example.com + An example RSS feed + + Item 1 + http://example.com/1 + + + Item 2 + http://example.com/2 + + +""" + doc = parse(xml, Node) + rss = doc[end] + @test tag(rss) == "rss" + @test rss["version"] == "2.0" + channel = first(filter(x -> nodetype(x) == Element, children(rss))) + @test tag(channel) == "channel" + items = filter(x -> nodetype(x) == Element && tag(x) == "item", children(channel)) + @test length(items) == 2 + end + + @testset "Atom-like feed" begin + xml = """ + + Example Feed + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. 
+ +""" + doc = parse(xml, Node) + feed = doc[end] + @test tag(feed) == "feed" + @test feed["xmlns"] == "http://www.w3.org/2005/Atom" + entries = filter(x -> nodetype(x) == Element && tag(x) == "entry", children(feed)) + @test length(entries) == 1 + end + + @testset "MathML-like document" begin + xml = """ + + + x + 2 + + + + 1 + +""" + doc = parse(xml, Node) + math = doc[1] + @test tag(math) == "math" + @test math["xmlns"] == "http://www.w3.org/1998/Math/MathML" + end + + @testset "document with processing instructions and comments mixed" begin + xml = """ + + + + + + + +""" + doc = parse(xml, Node) + types = map(nodetype, children(doc)) + @test count(==(Comment), types) == 2 + @test count(==(ProcessingInstruction), types) >= 1 + @test count(==(Element), types) == 1 + end +end + +#==============================================================================# +# SHOW / DISPLAY # +#==============================================================================# +@testset "Show (REPL display)" begin + @testset "show Text" begin + t = Text("hello") + s = sprint(show, t) + @test contains(s, "Text") + @test contains(s, "hello") + end + + @testset "show Element" begin + el = Element("div"; class="main") + s = sprint(show, el) + @test contains(s, "Element") + @test contains(s, "hello

" + end +end + +#==============================================================================# +# SHOW (text/xml MIME) ROUNDTRIP # +#==============================================================================# +@testset "text/xml MIME output" begin + doc = Document( + Declaration(; version="1.0"), + Element("root", Element("child", "text")) + ) + xml_str = sprint(show, MIME("text/xml"), doc) + @test contains(xml_str, "") + @test contains(xml_str, "text") + # Verify it's parseable + doc2 = parse(xml_str, Node) + @test nodetype(doc2) == Document + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + @test tag(root) == "root" + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test simple_value(child) == "text" +end + +#==============================================================================# +# CONSTRUCTION → WRITE → PARSE ROUNDTRIP # +#==============================================================================# +@testset "Construction → Write → Parse" begin + @testset "simple element: write then parse preserves semantics" begin + el = Element("greeting", "Hello, world!") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test simple_value(doc2[1]) == "Hello, world!" 
+ end + + @testset "element with attributes: write then parse preserves attributes" begin + el = Element("item"; id="1", class="active") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test doc2[1]["id"] == "1" + @test doc2[1]["class"] == "active" + end + + @testset "single-child text elements roundtrip" begin + doc = Document(Element("root", "text")) + xml = XML.write(doc) + doc2 = parse(xml, Node) + @test doc == doc2 + end + + @testset "self-closing elements roundtrip" begin + doc = Document(Element("root")) + xml = XML.write(doc) + doc2 = parse(xml, Node) + @test doc == doc2 + end + + @testset "all node types survive write → parse" begin + doc = Document( + Declaration(; version="1.0"), + Comment(" header "), + Element("root", + Element("child", "text"), + CData("raw "), + Comment(" inner "), + ProcessingInstruction("pi", "content") + ) + ) + xml = XML.write(doc) + doc2 = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc2)) + @test count(==(Declaration), map(nodetype, typed)) == 1 + @test count(==(Comment), map(nodetype, typed)) == 1 + @test count(==(Element), map(nodetype, typed)) == 1 + root = first(filter(x -> nodetype(x) == Element, typed)) + inner = filter(x -> nodetype(x) != Text, children(root)) + inner_types = map(nodetype, inner) + @test Element in inner_types + @test CData in inner_types + @test Comment in inner_types + @test ProcessingInstruction in inner_types + end + + @testset "special characters in text roundtrip" begin + el = Element("p", "a < b & c > d ' e \" f") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test simple_value(doc2[1]) == "a < b & c > d ' e \" f" + end + + @testset "special characters in attributes roundtrip" begin + el = Element("x"; data="a&bd'e\"f") + xml = XML.write(Document(el)) + doc2 = parse(xml, Node) + @test doc2[1]["data"] == "a&bd'e\"f" + end +end + +#==============================================================================# +# KML-LIKE DOCUMENT # 
+#==============================================================================# +@testset "KML-like Document" begin + xml = """ + + + KML Sample + + Simple placemark + Attached to the ground. + + -122.0822035,37.4220033612141,0 + + + +""" + doc = parse(xml, Node) + kml = doc[end] + @test tag(kml) == "kml" + @test kml["xmlns"] == "http://www.opengis.net/kml/2.2" + + document = first(filter(x -> nodetype(x) == Element, children(kml))) + @test tag(document) == "Document" + + name = first(filter(x -> nodetype(x) == Element && tag(x) == "name", children(document))) + @test simple_value(name) == "KML Sample" + + pm = first(filter(x -> nodetype(x) == Element && tag(x) == "Placemark", children(document))) + pm_name = first(filter(x -> nodetype(x) == Element && tag(x) == "name", children(pm))) + @test simple_value(pm_name) == "Simple placemark" +end + +#==============================================================================# +# XHTML-LIKE DOCUMENT # +#==============================================================================# +@testset "XHTML-like Document" begin + xml = """ + + + + XHTML Test + + + +

Hello World

+

This is a test of XHTML.

+
+ An image + +""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Declaration + @test nodetype(typed[2]) == DTD + @test contains(value(typed[2]), "PUBLIC") + + html = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(html) == "html" + @test html["xmlns"] == "http://www.w3.org/1999/xhtml" + + head_el = first(filter(x -> nodetype(x) == Element && tag(x) == "head", children(html))) + title_el = first(filter(x -> nodetype(x) == Element && tag(x) == "title", children(head_el))) + @test simple_value(title_el) == "XHTML Test" + + body_el = first(filter(x -> nodetype(x) == Element && tag(x) == "body", children(html))) + h1_el = first(filter(x -> nodetype(x) == Element && tag(x) == "h1", children(body_el))) + @test simple_value(h1_el) == "Hello World" + + # Verify write produces valid XML that can be re-parsed + xml2 = XML.write(doc) + doc2 = parse(xml2, Node) + @test nodetype(doc2) == Document +end + +#==============================================================================# +# PLIST-LIKE DOCUMENT # +#==============================================================================# +@testset "plist-like Document" begin + xml = """ + + + + CFBundleName + MyApp + CFBundleVersion + 1.0 + LSRequiresIPhoneOS + + +""" + doc = parse(xml, Node) + plist = doc[end] + @test tag(plist) == "plist" + @test plist["version"] == "1.0" + + dict = first(filter(x -> nodetype(x) == Element, children(plist))) + @test tag(dict) == "dict" + + elements = filter(x -> nodetype(x) == Element, children(dict)) + keys_found = [simple_value(e) for e in elements if tag(e) == "key"] + @test "CFBundleName" in keys_found + @test "CFBundleVersion" in keys_found +end + +#==============================================================================# +# MAVEN POM-LIKE DOCUMENT # +#==============================================================================# +@testset "Maven POM-like Document" begin + xml = """ + + 4.0.0 + 
com.example + my-app + 1.0-SNAPSHOT + + + junit + junit + 4.13.2 + test + + +""" + doc = parse(xml, Node) + project = doc[end] + @test tag(project) == "project" + + elements = filter(x -> nodetype(x) == Element, children(project)) + version = first(filter(x -> tag(x) == "version", elements)) + @test simple_value(version) == "1.0-SNAPSHOT" + + deps = first(filter(x -> tag(x) == "dependencies", elements)) + dep_list = filter(x -> nodetype(x) == Element, children(deps)) + @test length(dep_list) == 1 + @test tag(dep_list[1]) == "dependency" +end + +#==============================================================================# +# GITHUB ISSUES REGRESSION TESTS # +#==============================================================================# +@testset "GitHub Issues" begin + + #--- Issue #7: attribute order should not affect equality --- + @testset "#7: attribute-order-insensitive ==" begin + a = Element("x"; first="1", second="2") + b = Element("x"; second="2", first="1") + @test a == b + + # Same attrs same order still works + c = Element("x"; a="1", b="2") + d = Element("x"; a="1", b="2") + @test c == d + + # Different values are still not equal + @test Element("x"; a="1") != Element("x"; a="2") + + # Different attr names are not equal + @test Element("x"; a="1") != Element("x"; b="1") + + # Different number of attrs + @test Element("x"; a="1") != Element("x"; a="1", b="2") + + # Parsed elements with same attrs in different order + doc1 = parse("""""", Node) + doc2 = parse("""""", Node) + @test doc1[1] == doc2[1] + + # No attrs vs empty attrs (both are "no attributes") + @test Element("x") == Element("x") + end + + #--- Issue #17: numeric character references --- + @testset "#17: numeric character references (&#decimal; and &#xHex;)" begin + # Decimal character references + @test unescape("<") == "<" + @test unescape(">") == ">" + @test unescape("&") == "&" + @test unescape("'") == "'" + @test unescape(""") == "\"" + + # Hex character references (lowercase x) + 
@test unescape("<") == "<" + @test unescape("<") == "<" + @test unescape(">") == ">" + @test unescape("&") == "&" + @test unescape("'") == "'" + @test unescape(""") == "\"" + + # Uppercase X also works + @test unescape("A") == "A" + + # Unicode character references + @test unescape("A") == "A" + @test unescape("A") == "A" + @test unescape("é") == "\u00e9" # é + @test unescape("é") == "\u00e9" # é + @test unescape("中") == "\u4e2d" # 中 + @test unescape("😀") == "\U0001f600" # 😀 + + # Mixed with named entities + @test unescape("&<<") == "&<<" + @test unescape("<tag>") == "" + + # In parsed XML text + doc = parse("<hello>", Node) + @test simple_value(doc[1]) == "" + + # In parsed XML attributes + doc = parse("""""", Node) + @test doc[1]["a"] == "<>" + + # Non-breaking space + @test unescape(" ") == "\u00a0" + @test unescape(" ") == "\u00a0" + + # Invalid numeric reference preserved verbatim + @test unescape("&#xZZZ;") == "&#xZZZ;" + + # Named entity references that aren't predefined are preserved verbatim + @test unescape("&foo;") == "&foo;" + + # Ampersand without semicolon is preserved + @test unescape("a & b") == "a & b" + end + + #--- Issue #33: empty attributes consistency --- + @testset "#33: empty attributes [] vs nothing" begin + # Constructed elements have empty Vector for attrs + a = Element("x") + # Parsed elements with no attrs have nothing + b = parse("", Node)[1] + # They should compare equal via _eq / _attrs_eq + @test a == b + end + + #--- Issue #35: write → parse preserves structure --- + @testset "#35: write then parse preserves structure" begin + doc = Document( + Declaration(; version="1.0"), + Element("root", + Element("child", "text"), + Element("empty") + ) + ) + xml = XML.write(doc) + doc2 = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + child_elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(child_elements) == 2 + @test tag(child_elements[1]) == "child" + @test 
tag(child_elements[2]) == "empty" + end + + #--- Issue #50: Base.get with default --- + @testset "#50: Base.get(node, key, default)" begin + el = parse("""""", Node)[1] + + # Existing keys return their values + @test get(el, "a", "default") == "1" + @test get(el, "b", "default") == "2" + + # Non-existing key returns default + @test get(el, "c", "default") == "default" + @test get(el, "c", nothing) === nothing + + # Works on elements with no attributes + el2 = parse("", Node)[1] + @test get(el2, "a", "nope") == "nope" + + # Works on constructed elements + el3 = Element("x"; foo="bar") + @test get(el3, "foo", "default") == "bar" + @test get(el3, "baz", "default") == "default" + end + + #--- Issue #52: escape double-escapes (expected behavior) --- + @testset "#52: escape is not idempotent (by design)" begin + @test escape("&") == "&" + @test escape("&") == "&amp;" # double-escaping is correct + end + + #--- Issue #53: unescape works correctly --- + @testset "#53: unescape works correctly on parsed content" begin + doc = parse("&", Node) + @test simple_value(doc[1]) == "&" + doc = parse("<tag>", Node) + @test simple_value(doc[1]) == "" + end +end + +#==============================================================================# +# TREE NAVIGATION: parent, depth, siblings # +#==============================================================================# +@testset "Tree Navigation" begin + doc = parse("", Node) + root = doc[1] + a = root[1] + a1 = a[1] + a2 = a[2] + b = root[2] + c = root[3] + c1 = c[1] + c1a = c1[1] + + @testset "parent" begin + @test parent(root, doc) === doc + @test parent(a, doc) === root + @test parent(a1, doc) === a + @test parent(c1a, doc) === c1 + @test parent(b, root) === root + @test_throws ErrorException parent(doc, doc) # root has no parent + @test_throws ErrorException parent(Element("x"), doc) # not in tree + end + + @testset "depth" begin + @test depth(doc, doc) == 0 + @test depth(root, doc) == 1 + @test depth(a, doc) == 2 + @test 
depth(a1, doc) == 3 + @test depth(c1a, doc) == 4 + @test depth(b, root) == 1 + @test_throws ErrorException depth(Element("x"), doc) + end + + @testset "siblings" begin + @test siblings(a, doc) == [b, c] + @test siblings(b, doc) == [a, c] + @test siblings(a1, doc) == [a2] + @test siblings(a2, doc) == [a1] + @test isempty(siblings(c1, doc)) + @test_throws ErrorException siblings(doc, doc) # root has no parent + end + + @testset "1-arg parent/depth errors" begin + @test_throws ErrorException parent(a) + @test_throws ErrorException depth(a) + end +end + +#==============================================================================# +# DEPRECATIONS / REMOVED API # +#==============================================================================# +@testset "Deprecations and Removed API" begin + node = Element("test") + node2 = Element("other") + + @testset "XML.next errors" begin + @test_throws ErrorException XML.next(node) + end + + @testset "XML.prev errors" begin + @test_throws ErrorException XML.prev(node) + end + + @testset "XML.nodes_equal errors" begin + @test_throws ErrorException XML.nodes_equal(node, node2) + end + + @testset "XML.escape! errors" begin + @test_throws ErrorException XML.escape!(node) + @test_throws ErrorException XML.escape!(node, false) + end + + @testset "XML.unescape! 
errors" begin + @test_throws ErrorException XML.unescape!(node) + @test_throws ErrorException XML.unescape!(node, false) + end + + @testset "XML.Raw errors" begin + @test_throws ErrorException XML.Raw() + @test_throws ErrorException XML.Raw("arg") + end + + @testset "simplevalue binding redirects to simple_value" begin + el = Element("x", "val") + @test XML.simplevalue(el) == simple_value(el) + end +end + +#==============================================================================# +# XPATH # +#==============================================================================# +@testset "XPath" begin + doc = parse(""" + + Alice + Bob + Carol + + dark + """, Node) + + @testset "absolute path" begin + results = xpath(doc, "/root/users/user") + @test length(results) == 3 + @test all(n -> tag(n) == "user", results) + end + + @testset "single child" begin + results = xpath(doc, "/root/settings/theme") + @test length(results) == 1 + @test tag(results[1]) == "theme" + end + + @testset "positional predicate [n]" begin + results = xpath(doc, "/root/users/user[1]") + @test length(results) == 1 + @test results[1]["id"] == "1" + + results = xpath(doc, "/root/users/user[3]") + @test length(results) == 1 + @test results[1]["id"] == "3" + end + + @testset "[last()]" begin + results = xpath(doc, "/root/users/user[last()]") + @test length(results) == 1 + @test results[1]["id"] == "3" + end + + @testset "out of bounds predicate" begin + results = xpath(doc, "/root/users/user[99]") + @test isempty(results) + end + + @testset "has-attribute predicate [@attr]" begin + results = xpath(doc, "/root/users/user[@role]") + @test length(results) == 3 + end + + @testset "attribute-value predicate [@attr='v']" begin + results = xpath(doc, "/root/users/user[@role='admin']") + @test length(results) == 2 + ids = sort([n["id"] for n in results]) + @test ids == ["1", "3"] + end + + @testset "attribute-value with double quotes" begin + results = xpath(doc, """/root/users/user[@id="2"]""") + @test 
length(results) == 1 + @test results[1]["id"] == "2" + end + + @testset "descendant //" begin + results = xpath(doc, "//name") + @test length(results) == 3 + @test all(n -> tag(n) == "name", results) + end + + @testset "// with predicate" begin + results = xpath(doc, "//user[@role='admin']/name") + @test length(results) == 2 + end + + @testset "wildcard *" begin + results = xpath(doc, "/root/*") + @test length(results) == 2 + @test Set(tag.(results)) == Set(["users", "settings"]) + end + + @testset "text()" begin + results = xpath(doc, "/root/settings/theme/text()") + @test length(results) == 1 + @test value(results[1]) == "dark" + end + + @testset "node()" begin + results = xpath(doc, "/root/users/user[1]/node()") + @test length(results) >= 1 + end + + @testset "attribute selection @attr" begin + results = xpath(doc, "//user/@id") + @test length(results) == 3 + vals = sort([value(n) for n in results]) + @test vals == ["1", "2", "3"] + end + + @testset "self ." begin + results = xpath(doc, ".") + @test length(results) == 1 + @test results[1] === doc + end + + @testset "no match returns empty" begin + @test isempty(xpath(doc, "/root/nonexistent")) + @test isempty(xpath(doc, "//nonexistent")) + end + + @testset "empty expression" begin + @test isempty(xpath(doc, "")) + end + + @testset "deep // with path" begin + results = xpath(doc, "//theme/text()") + @test length(results) == 1 + @test value(results[1]) == "dark" + end + + @testset "error: unterminated predicate" begin + @test_throws ErrorException xpath(doc, "/root/user[1") + end + + @testset "error: unsupported predicate" begin + @test_throws ErrorException xpath(doc, "/root/user[position()>1]") + end + + @testset "self-closing elements" begin + doc2 = parse("", Node) + @test length(xpath(doc2, "/root/*")) == 3 + end + + @testset "relative path" begin + root = xpath(doc, "/root")[1] + results = xpath(root, "users/user") + @test length(results) == 3 + end + + @testset ".. 
parent navigation" begin + # /root/users/user[1]/.. goes back to + results = xpath(doc, "/root/users/user[1]/..") + @test length(results) == 1 + @test tag(results[1]) == "users" + end + + @testset ".. in mid-path" begin + # /root/users/.. should go back to root + results = xpath(doc, "/root/users/..") + @test length(results) == 1 + @test tag(results[1]) == "root" + end + + @testset "// mid-path" begin + # /root//name finds all elements anywhere under root + results = xpath(doc, "/root//name") + @test length(results) == 3 + @test all(n -> tag(n) == "name", results) + end + + @testset "// with wildcard //*" begin + doc2 = parse("", Node) + results = xpath(doc2, "//*") + tags = [tag(n) for n in results if nodetype(n) === Element] + @test "r" in tags + @test "a" in tags + @test "b" in tags + @test "c" in tags + end + + @testset "// with text()" begin + results = xpath(doc, "//text()") + @test length(results) >= 3 # at least Alice, Bob, Carol + vals = [value(n) for n in results] + @test "Alice" in vals + @test "Bob" in vals + @test "dark" in vals + end + + @testset "multiple // segments" begin + results = xpath(doc, "//users//name") + @test length(results) == 3 + @test all(n -> tag(n) == "name", results) + end + + @testset "chained predicates" begin + results = xpath(doc, "/root/users/user[@role='admin'][1]") + @test length(results) == 1 + @test results[1]["id"] == "1" + end + + @testset "@attr with no match" begin + results = xpath(doc, "//user/@nonexistent") + @test isempty(results) + end + + @testset "namespaced tag" begin + doc2 = parse("""val""", Node) + results = xpath(doc2, "/root/ns:item") + @test length(results) == 1 + @test tag(results[1]) == "ns:item" + end + + @testset "whitespace in expression" begin + results = xpath(doc, " / root / users / user ") + @test length(results) == 3 + end + + @testset "error: empty @" begin + @test_throws ErrorException xpath(doc, "/root/@") + end + + @testset "error: unknown function" begin + @test_throws ErrorException 
xpath(doc, "/root/foo()") + end + + @testset "error: unexpected character" begin + @test_throws ErrorException xpath(doc, "/root/!bad") + end + + @testset "deep nesting" begin + doc2 = parse("deep", Node) + results = xpath(doc2, "//e/text()") + @test length(results) == 1 + @test value(results[1]) == "deep" + end + + @testset "wildcard with predicate" begin + doc2 = parse("""""", Node) + results = xpath(doc2, "/r/*[@x]") + @test length(results) == 2 + end + + @testset "// from non-document node" begin + root = xpath(doc, "/root")[1] + results = xpath(root, "//name") + @test length(results) == 3 + end +end + +#==============================================================================# +# LAZYNODE # +#==============================================================================# +@testset "LazyNode" begin + @testset "parse and nodetype" begin + doc = parse("", LazyNode) + @test nodetype(doc) == Document + + doc2 = parse(LazyNode, "") + @test nodetype(doc2) == Document + end + + @testset "read from IO" begin + xml = """hello""" + doc = read(IOBuffer(xml), LazyNode) + @test nodetype(doc) == Document + end + + @testset "read from file" begin + path = joinpath(@__DIR__, "data", "books.xml") + isfile(path) || return + doc = read(path, LazyNode) + @test nodetype(doc) == Document + @test length(children(doc)) > 0 + end + + @testset "Document children" begin + xml = """""" + doc = parse(xml, LazyNode) + ch = children(doc) + @test length(ch) == 2 + @test nodetype(ch[1]) == Declaration + @test nodetype(ch[2]) == Element + end + + @testset "Document with all prolog node types" begin + xml = """""" + doc = parse(xml, LazyNode) + ch = children(doc) + types = map(nodetype, ch) + @test Declaration in types + @test DTD in types + @test Comment in types + @test ProcessingInstruction in types + @test Element in types + end + + @testset "Element tag" begin + doc = parse("", LazyNode) + @test tag(doc[1]) == "root" + end + + @testset "tag returns nothing for non-element/PI" begin + 
doc = parse("text", LazyNode) + text_node = children(doc[1])[1] + @test nodetype(text_node) == Text + @test tag(text_node) === nothing + end + + @testset "Element attributes" begin + doc = parse("""""", LazyNode) + attrs = attributes(doc[1]) + @test attrs isa Attributes + @test attrs["a"] == "1" + @test attrs["b"] == "2" + end + + @testset "Element with no attributes" begin + doc = parse("", LazyNode) + @test attributes(doc[1]) === nothing + end + + @testset "attributes returns nothing for non-element" begin + doc = parse("text", LazyNode) + @test attributes(children(doc[1])[1]) === nothing + end + + @testset "attributes unescape entity references" begin + doc = parse("""""", LazyNode) + @test doc[1]["a"] == "a&b" + end + + @testset "Declaration attributes" begin + doc = parse("""""", LazyNode) + decl = doc[1] + @test nodetype(decl) == Declaration + attrs = attributes(decl) + @test attrs["version"] == "1.0" + @test attrs["encoding"] == "UTF-8" + end + + @testset "get with default" begin + doc = parse("""""", LazyNode) + el = doc[1] + @test get(el, "a", "nope") == "1" + @test get(el, "b", "nope") == "nope" + end + + @testset "get on non-element returns default" begin + doc = parse("text", LazyNode) + text_node = children(doc[1])[1] + @test get(text_node, "a", "default") == "default" + end + + @testset "getindex with string key" begin + doc = parse("""""", LazyNode) + @test doc[1]["a"] == "1" + @test_throws KeyError doc[1]["nonexistent"] + end + + @testset "haskey" begin + doc = parse("""""", LazyNode) + @test haskey(doc[1], "a") == true + @test haskey(doc[1], "b") == false + end + + @testset "keys" begin + doc = parse("""""", LazyNode) + @test keys(doc[1]) == ["a", "b"] + end + + @testset "keys on element with no attributes" begin + doc = parse("", LazyNode) + @test isempty(keys(doc[1])) + end + + @testset "keys on non-element" begin + doc = parse("text", LazyNode) + @test keys(children(doc[1])[1]) == () + end + + @testset "Text value" begin + doc = parse("hello", 
LazyNode) + ch = children(doc[1]) + @test nodetype(ch[1]) == Text + @test value(ch[1]) == "hello" + end + + @testset "Text value unescapes entities" begin + doc = parse("& < >", LazyNode) + @test value(children(doc[1])[1]) == "& < >" + end + + @testset "has_entities short-circuit (zero-copy, correctness)" begin + # Entity-free Text: returns the raw SubString view, no allocation. + doc = parse("plain text no entities", LazyNode) + v = value(children(doc[1])[1]) + @test v isa SubString{String} + @test v == "plain text no entities" + @test (@allocated value(children(doc[1])[1])) ≥ 0 # smoke + + # Entity-bearing Text: still decodes byte-for-byte like unescape. + d2 = parse("a & b A A <", LazyNode) + tv = value(children(d2[1])[1]) + @test tv == unescape(SubString("a & b A A <")) + @test tv == "a & b A A <" + + # Entity-free attribute: zero-copy SubString view. + d3 = parse("""""", LazyNode) + c = d3[1] + @test get(c, "r", nothing) isa SubString{String} + @test get(c, "r", nothing) == "A1" + a = attributes(c) + @test a["s"] == "3" + @test a["s"] isa SubString{String} + pairs_collected = collect(eachattribute(c)) + @test pairs_collected == ["r" => "A1", "s" => "3", "t" => "n"] + @test all(p -> last(p) isa SubString{String}, pairs_collected) + + # Entity-bearing attribute: decoded. + d4 = parse("""""", LazyNode) + x = d4[1] + @test x["a"] == "x & y" + @test get(x, "b", nothing) == "plain" + @test get(x, "b", nothing) isa SubString{String} + @test attributes(x)["a"] == "x & y" + @test Dict(eachattribute(x)) == Dict("a" => "x & y", "b" => "plain") + + # CDATA carries markup characters verbatim — never entity-decoded. + d5 = parse("", LazyNode) + cd = children(d5[1])[1] + @test nodetype(cd) == CData + @test value(cd) == "a & b < c & d" + + # is_simple_value: entity-free returns view, entity-bearing decodes. 
+ s1 = parse("simple", LazyNode)[1] + @test XML.is_simple_value(s1) == "simple" + @test XML.is_simple_value(s1) isa SubString{String} + s2 = parse("a & b", LazyNode)[1] + @test XML.is_simple_value(s2) == "a & b" + end + + @testset "Comment value" begin + doc = parse("", LazyNode) + c = children(doc[1])[1] + @test nodetype(c) == Comment + @test value(c) == " a comment " + end + + @testset "CData value" begin + doc = parse("]]>", LazyNode) + cd = children(doc[1])[1] + @test nodetype(cd) == CData + @test value(cd) == "raw " + end + + @testset "DTD value" begin + doc = parse("""""", LazyNode) + dtd = doc[1] + @test nodetype(dtd) == DTD + @test contains(value(dtd), "greeting") + end + + @testset "ProcessingInstruction tag and value" begin + doc = parse("", LazyNode) + pi = doc[1] + @test nodetype(pi) == ProcessingInstruction + @test tag(pi) == "mypi" + @test value(pi) == "some data" + end + + @testset "ProcessingInstruction with no content" begin + doc = parse("", LazyNode) + pi = doc[1] + @test tag(pi) == "target" + @test value(pi) === nothing + end + + @testset "value returns nothing for Element/Document" begin + doc = parse("", LazyNode) + @test value(doc) === nothing + @test value(doc[1]) === nothing + end + + @testset "Element children" begin + doc = parse("", LazyNode) + root = doc[1] + @test length(children(root)) == 3 + @test tag(children(root)[1]) == "a" + @test tag(children(root)[2]) == "b" + @test tag(children(root)[3]) == "c" + end + + @testset "self-closing element has no children" begin + doc = parse("
", LazyNode) + br = children(doc[1])[1] + @test isempty(children(br)) + end + + @testset "non-element children returns empty tuple" begin + doc = parse("text", LazyNode) + text_node = children(doc[1])[1] + @test children(text_node) == () + end + + @testset "nested elements" begin + doc = parse("
deep", LazyNode) + @test tag(doc[1]) == "a" + @test tag(doc[1][1]) == "b" + @test tag(doc[1][1][1]) == "c" + @test simple_value(doc[1][1][1]) == "deep" + end + + @testset "mixed content children" begin + xml = "text" + doc = parse(xml, LazyNode) + ch = children(doc[1]) + types = map(nodetype, ch) + @test Text in types + @test Comment in types + @test CData in types + @test ProcessingInstruction in types + @test Element in types + end + + @testset "integer indexing" begin + doc = parse("", LazyNode) + @test tag(doc[1][1]) == "a" + @test tag(doc[1][2]) == "b" + @test tag(doc[1][3]) == "c" + end + + @testset "colon indexing" begin + doc = parse("", LazyNode) + all = doc[1][:] + @test length(all) == 2 + end + + @testset "lastindex" begin + doc = parse("", LazyNode) + @test tag(doc[1][end]) == "c" + end + + @testset "only" begin + doc = parse("", LazyNode) + @test tag(only(doc[1])) == "only" + end + + @testset "length" begin + doc = parse("", LazyNode) + @test length(doc[1]) == 3 + end + + @testset "is_simple" begin + doc = parse("text", LazyNode) + simple = children(doc[1])[1] + complex = children(doc[1])[2] + @test is_simple(simple) + @test !is_simple(complex) + end + + @testset "is_simple with attributes" begin + doc = parse("""text""", LazyNode) + @test !is_simple(children(doc[1])[1]) + end + + @testset "is_simple with CData child" begin + doc = parse("", LazyNode) + @test is_simple(children(doc[1])[1]) + end + + @testset "is_simple returns false for non-element" begin + doc = parse("text", LazyNode) + @test !is_simple(children(doc[1])[1]) + end + + @testset "simple_value" begin + doc = parse("hello", LazyNode) + @test simple_value(children(doc[1])[1]) == "hello" + end + + @testset "simple_value errors on non-simple" begin + doc = parse("", LazyNode) + @test_throws ErrorException simple_value(children(doc[1])[1]) + end + + @testset "simple_value errors on non-element" begin + doc = parse("text", LazyNode) + @test_throws ErrorException 
simple_value(children(doc[1])[1]) + end + + @testset "show Document" begin + doc = parse("", LazyNode) + s = sprint(show, doc) + @test contains(s, "Lazy") + @test contains(s, "Document") + @test contains(s, "1 child") + end + + @testset "show Document multiple children" begin + doc = parse("", LazyNode) + s = sprint(show, doc) + @test contains(s, "2 children") + end + + @testset "show Element" begin + doc = parse("""""", LazyNode) + s = sprint(show, doc[1]) + @test contains(s, "Lazy Element") + @test contains(s, "hello", LazyNode) + s = sprint(show, children(doc[1])[1]) + @test contains(s, "Lazy Text") + @test contains(s, "hello") + end + + @testset "show Comment" begin + doc = parse("", LazyNode) + s = sprint(show, children(doc[1])[1]) + @test contains(s, "Lazy Comment") + @test contains(s, " + + + text content + inner + + + + +""" + doc = parse(xml, LazyNode) + @test nodetype(doc) == Document + + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Declaration + @test nodetype(typed[2]) == DTD + @test nodetype(typed[3]) == Comment + @test nodetype(typed[4]) == ProcessingInstruction + @test nodetype(typed[5]) == Element + + root = typed[5] + @test tag(root) == "root" + @test root["attr"] == "val" + + inner = children(root) + inner_types = map(nodetype, inner) + @test Text in inner_types + @test Element in inner_types + @test CData in inner_types + @test Comment in inner_types + @test ProcessingInstruction in inner_types + + child_els = filter(x -> nodetype(x) == Element, inner) + @test length(child_els) == 2 + @test tag(child_els[1]) == "child" + @test simple_value(child_els[1]) == "inner" + @test tag(child_els[2]) == "empty" + end + + @testset "sourcetext" begin + @testset "self-closing element" begin + doc = parse("", LazyNode) + @test sourcetext(doc[1]) == "" end - lzxml = """ hello hello preserve """ - lz = XML.parse(XML.LazyNode, lzxml) - n=XML.next(lz) - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "" - 
n=XML.next(n) - text_content = XML.write(n) - @test text_content == "hello" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "hello" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " hello preserve " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "hello" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " hello preserve " - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " hello " - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " preserve " - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " preserve " - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "" - n=XML.next(n) - text_content = XML.write(n) - @test text_content == "" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == " preserve " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == " preserve " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == " hello " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == " hello preserve " - n=XML.next(n) - text_content = XML.write(n) - @test text_content == " hello " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == " hello preserve " - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "hello" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "hello" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "" - n=XML.prev(n) - text_content = XML.write(n) - @test text_content == "\n \n hello\n hello preserve \n \n \n" - end - - @testset "depth and parent" begin - @test XML.depth(data) == 0 - @test isnothing(XML.parent(data)) - @test XML.depth(doc[1]) == 1 - @test XML.parent(doc[1]) == data - @test XML.depth(doc[2]) == 1 - @test XML.depth(doc[3]) == 2 - @test XML.parent(doc[3]) == doc[2] - @test 
XML.depth(doc[end]) == 1 - @test XML.parent(doc[end]) == data - end - - @testset "tag/attributes/value" begin - x = doc[1] # - @test XML.tag(x) === nothing - @test XML.attributes(x) == Dict("version" => "1.0") - @test XML.value(x) === nothing - - x = XML.next(x) # - @test XML.tag(x) == "catalog" - @test XML.attributes(x) === nothing - @test XML.value(x) === nothing - - x = XML.next(x) # - @test XML.tag(x) == "book" - @test XML.attributes(x) == Dict("id" => "bk101") - @test XML.value(x) === nothing - - x = XML.next(x) # - @test XML.tag(x) == "author" - @test XML.attributes(x) === nothing - @test XML.value(x) === nothing - - x = XML.next(x) # Gambardella, Matthew - @test XML.tag(x) === nothing - @test XML.attributes(x) === nothing - @test XML.value(x) == "Gambardella, Matthew" - end -end - -#-----------------------------------------------------------------------------# Preserve whitespace -@testset "xml:space" begin - @testset "Basic xml:space functionality" begin - - # Test 1: xml:space="preserve" should preserve entirely empty whitespace - xml1 = """ """ - doc1 = parse(XML.Node, xml1) - text_content = XML.value(doc1[1][1][1]) - @test text_content == " " - - # Test 2: xml:space="preserve" should preserve leading and trailing whitespace - xml2 = """ leading and trailing spaces """ - doc2 = parse(XML.Node, xml2) - text_content = XML.value(doc2[1][1][1]) - @test text_content == " leading and trailing spaces " - - # Test 3: Entirely empty tags with and without xml:space="preserve" become self-closing - xml3 = """ """ - doc3 = XML.parse(XML.Node, xml3) - text_content = XML.write(doc3[1][1]) - @test text_content == "" # without xml:space="preserve", empty text becomes self-closing - text_content = XML.value(doc3[1][2][1]) - @test text_content == " " # with xml:space, whitespace is preserved - text_content = XML.write(doc3[1][3]) - @test text_content == "" # with xml:space="preserve", empty text becomes self-closing - - # Test 4: Without xml:space, whitespace should be 
normalized - xml4 = """ gets normalized """ - doc4 = XML.parse(XML.Node, xml4) - text_content = XML.value(doc4[1][1][1]) - @test text_content == "gets normalized" - - # Test 5: xml:space="default" should normalize even with preserve_xml_space=true - xml5 = """ gets normalized """ - doc5 = XML.parse(XML.Node, xml5) - text_content = XML.value(doc5[1][1][1]) - @test text_content == "gets normalized" - end - - @testset "xml:space inheritance" begin - # Test 6: Children inherit parent's xml:space="preserve" - xml6 = """ - parent text - child text - - """ - doc6 = XML.parse(XML.Node, xml6) - # Both parent and child should preserve whitespace - @test contains(XML.value(doc6[1][2][1]), "parent text \n") - @test XML.value(doc6[1][2][2][1]) == " child text " - - # Test 7: xml:space="default" overrides parent's "preserve" - xml7 = """ - normalized despite parent - """ - doc7 = XML.parse(XML.Node, xml7) - @test XML.value(doc7[1][2][1]) == "normalized despite parent" - end - - @testset "Nesting scenarios" begin - # Test 8: Multiple levels of xml:space changes - xml8 = """ - preserved - normalized - preserved again - - - """ - doc8 = XML.parse(XML.Node, xml8) - - # level1 should preserve (inherits from root) - level1_text = XML.value(doc8[1][2][1]) - @test level1_text == " preserved \n " - - # level2 should normalize (explicit xml:space="default") - level2_text = XML.value(doc8[1][2][2][1]) - @test level2_text == "normalized" - - # level3 should preserve (explicit xml:space="preserve") - level3_text = XML.value(doc8[1][2][2][2][1]) - @test level3_text == " preserved again " - - # Test 9: repeated multiple levels of xml:space changes - xml9 = """ - preserved - normalized - preserved again - - - preserved b - normalized b - preserved again b - - - """ - doc9 = XML.parse(XML.Node, xml9) - - # level1b should preserve (inherits from root) - level1b_text = XML.value(doc9[1][4][1]) - @test level1b_text == " preserved b \n " - - # level2 should normalize (explicit xml:space="default") - 
level2b_text = XML.value(doc9[1][4][2][1]) - @test level2b_text == "normalized b" - - # level3 should preserve (explicit xml:space="preserve") - level3b_text = XML.value(doc9[1][4][2][2][1]) - @test level3b_text == " preserved again b " - - # Test 10: futher repeated multiple levels of xml:space changes - xml10 = """ - normalized - normalized b - preserved - - - normalized c - preserved b - normalized again b - preserved c - - - - normalized d - """ - doc10 = XML.parse(XML.Node, xml10) - - # level1 should normalize (as root) - level1_text = XML.value(doc10[end][1][1]) - @test level1_text == "normalized" - - # level2 should normalize (as root and level1) - level2_text = XML.value(doc10[end][1][2][1]) - @test level2_text == "normalized b" - - # level3 should preserve (explicit xml:space="preserve") - level3_text = XML.value(doc10[end][1][2][2][1]) - @test level3_text == " preserved " - - # level1b should normalize (as root) - level1b_text = XML.value(doc10[end][2][1]) - @test level1b_text == "normalized c" - - # level2b should preserve (explicit xml:space="preserve") - level2b_text = XML.value(doc10[end][2][2][1]) - @test level2b_text == " preserved b \n " - - # level3 should normalize (explicit xml:space="default") - level3b_text = XML.value(doc10[end][2][2][2][1]) - @test level3b_text == "normalized again b" - - # level3c should preserve (inherited from level2b) - level3c_text = XML.value(doc10[end][2][2][4][1]) - @test level3c_text == " preserved c \n " - - # level1c should normalize (as root) - level1c_text = XML.value(doc10[end][3][1]) - @test level1c_text == "normalized d" - end - @testset "inter-element gap semantics" begin - # Default parent: gap between siblings should be dropped - s1 = """ x - y """ - d1 = XML.parse(XML.Node, s1) - @test length(d1[1]) == 2 - @test XML.value(d1[1][1][1]) == "x" - @test XML.value(d1[1][2][1]) == "y" - - # Preserve parent, default child ends: gap after default child dropped - s2 = """ -

keep

- norm - after default gap -
""" - d2 = XML.parse(XML.Node, s2) - @test length(d2[1]) == 7 - @test XML.value(d2[1][1]) == "\n " - @test XML.value(d2[1][2][1]) == " keep " - @test XML.value(d2[1][3]) == "\n " - @test XML.value(d2[1][4][1]) == "norm" - @test XML.value(d2[1][5]) == "\n " - @test XML.value(d2[1][6][1]) == " after default gap " - @test XML.value(d2[1][7]) == "\n" - end - @testset "XML whitespace vs Unicode whitespace" begin - nbsp = "\u00A0" - s = """ - x\t\n - $(nbsp) y $(nbsp) - $(nbsp) z $(nbsp) - """ - d = XML.parse(XML.Node, s) - @test XML.value(d[1][1][1]) == "x" - @test XML.value(d[1][2][1]) == "$(nbsp) y $(nbsp)" - @test XML.value(d[1][3][1]) == "$(nbsp) z $(nbsp)" - end - - @testset "CDATA/Comment/PI boundaries" begin - s = """ - pre post - pre post - - """ - d = XML.parse(XML.Node, s) - @test XML.value(d[1][1][1]) == "pre" - @test nodetype(d[1][1][2]) == XML.CData - @test XML.value(d[1][1][3]) == "post" - @test XML.value(d[1][2][1]) == " pre " - @test nodetype(d[1][2][2]) == XML.Comment - @test XML.value(d[1][2][3]) == " post " - @test nodetype(d[1][3]) == XML.ProcessingInstruction - end - - @testset "nested toggles and sibling sequences" begin - s = """ - a - b - c - - d - e - - """ - d = XML.parse(XML.Node, s) - @test XML.value(d[1][2][1]) == " a \n " - @test XML.value(d[1][2][2][1]) == "b" - @test XML.value(d[1][2][2][2][1]) == " c " - @test d[1][2][4].tag == "y2" - @test XML.value(d[1][2][4][1]) == "d" - @test d[1][2][6].tag == "w" - @test XML.value(d[1][2][6][1]) == " e " - end - - @testset "root/document boundaries" begin - s = "\n \n a \n \t " - d = XML.parse(XML.Node, s) - @test length(d) == 1 - @test XML.value(d[1][1]) == "a" - end - - @testset "entities expanding to whitespace" begin - chr1="\u0020" - chr2="\u000A" - chr3="\u00A0" - - s = """ - $(chr1) a $(chr2) - $(chr1) b $(chr2) - $(chr3)c$(chr3) - """ - d = XML.parse(XML.Node, s) - @test XML.value(d[1][1][1]) == "a" - @test XML.value(d[1][2][1]) == " b \n" - @test XML.value(d[1][3][1]) == "$(chr3)c$(chr3)" - 
end - - @testset "invalid values and placement" begin - s_bad = """ t """ - @test_throws ErrorException XML.parse(XML.Node, s_bad) - - s_pi = """ t """ - d = XML.parse(XML.Node, s_pi) - @test XML.value(d[end][1]) == "t" - - s_dup = """ t """ -# @test_throws ErrorException XML.parse(XML.Node, s_dup) - end - - @testset "prev()/next() symmetry" begin - xml = """ - a b c - d e f - i - """ - r = XML.parse(XML.LazyNode, xml).raw - toks=XML.Raw[] - while true - n = XML.next(r) - n === nothing && break - push!(toks, n) - r=n + @testset "element with attributes" begin + xml = """""" + doc = parse(xml, LazyNode) + @test sourcetext(doc[1]) == xml end - back = XML.Raw[] - r = toks[end] - while true - p = XML.prev(r) - p === nothing && break - push!(back, p) - r = p + + @testset "element with children" begin + xml = "text" + doc = parse(xml, LazyNode) + @test sourcetext(doc[1]) == xml + root = doc[1] + child = first(c for c in children(root) if nodetype(c) == Element) + @test sourcetext(child) == "text" end - @test reverse(back)[2:end] == toks[1:end-1] - end - - @testset "write/read roundtrip extremes" begin - xml = """ -

- - r - pre post -
""" - n = XML.parse(XML.Node, xml) - io = IOBuffer(); XML.write(io, n) - n2 = XML.parse(XML.Node, String(take!(io))) - @test n == n2 - @test XML.write(n2[1][1]) == "

" - @test XML.write(n2[1][2]) == "" - @test XML.value(n2[1][3][1]) == "r" - @test XML.write(n2[1][4]) == " pre post " - end - - @testset "self-closing/empty/whitespace-only children" begin - s = """ - - - - - x y - """ - d = XML.parse(XML.Node, s) - @test XML.write(d[1][1]) == "" - @test XML.write(d[1][2]) == "" - @test XML.value(d[1][3][1]) == " " - @test XML.value(d[1][5][1]) == "x" - @test XML.value(d[1][5][3]) == "y" - end - - @testset "allocation guard: small xml:space doc" begin - xml = " x y
" - f() = XML.parse(XML.Node, xml) - a = @allocated f() - @test a < 500_000 # tune for CI - end - -end - -#-----------------------------------------------------------------------------# roundtrip -@testset "read/write/read roundtrip" begin - for path in all_files - node = read(path, Node) - temp = tempname() * ".xml" - XML.write(temp, node) - node2 = read(temp, Node) - @test node == node2 - - #For debugging: - for (a,b) in zip(AbstractTrees.Leaves(node), AbstractTrees.Leaves(node2)) - if a != b - @info path - @info a - @info b - error() - end + + @testset "nested elements" begin + xml = "deep" + doc = parse(xml, LazyNode) + a = doc[1] + @test sourcetext(a) == xml + b = first(c for c in children(a) if nodetype(c) == Element) + @test sourcetext(b) == "deep" + end + + @testset "comment" begin + xml = "" + doc = parse(xml, LazyNode) + @test sourcetext(doc[1]) == "" + end + + @testset "cdata" begin + xml = "]]>" + doc = parse(xml, LazyNode) + cdata = first(c for c in children(doc[1]) if nodetype(c) == CData) + @test sourcetext(cdata) == "]]>" + end + + @testset "processing instruction" begin + xml = "" + doc = parse(xml, LazyNode) + @test sourcetext(doc[1]) == "" + end + + @testset "declaration" begin + xml = """""" + doc = parse(xml, LazyNode) + @test sourcetext(doc[1]) == """""" + end + + @testset "DTD" begin + xml = """""" + doc = parse(xml, LazyNode) + @test sourcetext(doc[1]) == """""" + end + + @testset "text node" begin + doc = parse("hello world", LazyNode) + txt = first(c for c in children(doc[1]) if nodetype(c) == Text) + @test sourcetext(txt) == "hello world" + end + + @testset "document" begin + xml = "hello" + doc = parse(xml, LazyNode) + @test sourcetext(doc) == xml + end + + @testset "mixed content" begin + xml = "

Hello world and more

" + doc = parse(xml, LazyNode) + @test sourcetext(doc[1]) == xml end end -end -#-----------------------------------------------------------------------------# Node writing -@testset "Node writing" begin - doc = Document( - DTD("root_tag"), - Declaration(version=1.0), - Comment("comment"), - ProcessingInstruction("xml-stylesheet", href="mystyle.css", type="text/css"), - Element("root_tag", CData("cdata"), Text("text")) - ) - @test map(nodetype, children(doc)) == [DTD,Declaration,Comment,ProcessingInstruction,Element] - @test length(children(doc[end])) == 2 - @test nodetype(doc[end][1]) == XML.CData - @test nodetype(doc[end][2]) == XML.Text - @test value(doc[end][1]) == "cdata" - @test value(doc[end][2]) == "text" - - #set/get index for attributes - o = doc[end] - @test isempty(keys(o)) - o["id"] = 1 - @test o["id"] == "1" - @test keys(o) == keys(Dict("id" => "1")) -end - -#-----------------------------------------------------------------------------# Issues -@testset "Issues" begin - # https://github.com/JuliaComputing/XML.jl/issues/12: DTD content was cut short - s = """ - - - - ]> - """ - - doc = parse(Node, s) - @test value(only(doc)) == s[11:end-2] # note [...] - - # https://github.com/JuliaComputing/XML.jl/issues/14 (Sorted Attributes) - kw = NamedTuple(OrderedDict(Symbol(k) => Int(k) for k in 'a':'z')) - xyz = XML.Element("point"; kw...) 
- @test collect(keys(attributes(xyz))) == string.(collect('a':'z')) + @testset "write(::LazyNode)" begin + @testset "write returns String" begin + xml = "text" + doc = parse(xml, LazyNode) + @test XML.write(doc[1]) == xml + @test XML.write(doc[1]) isa String + end + + @testset "write to IO" begin + xml = "text" + doc = parse(xml, LazyNode) + io = IOBuffer() + XML.write(io, doc[1]) + @test String(take!(io)) == xml + end + end + + @testset "eachchildnode" begin + @testset "matches children for element" begin + xml = "text" + doc = parse(xml, LazyNode) + root = doc[1] + eager = children(root) + lazy = collect(eachchildnode(root)) + @test length(eager) == length(lazy) + @test map(nodetype, eager) == map(nodetype, lazy) + @test map(tag, eager) == map(tag, lazy) + end + + @testset "self-closing element has no children" begin + doc = parse("", LazyNode) + @test isempty(collect(eachchildnode(doc[1]))) + end + + @testset "document children" begin + xml = """""" + doc = parse(xml, LazyNode) + eager = children(doc) + lazy = collect(eachchildnode(doc)) + @test length(eager) == length(lazy) + @test map(nodetype, eager) == map(nodetype, lazy) + end + + @testset "mixed content types" begin + xml = """text""" + doc = parse(xml, LazyNode) + root = doc[1] + types = [nodetype(c) for c in eachchildnode(root)] + @test Text in types + @test Comment in types + @test CData in types + @test ProcessingInstruction in types + @test Element in types + end + + @testset "sourcetext works on eachchildnode results" begin + xml = "helloworld" + doc = parse(xml, LazyNode) + root = doc[1] + results = [XML.write(c) for c in eachchildnode(root)] + @test results == ["hello", "world"] + end + + @testset "non-element/document returns empty" begin + xml = "" + doc = parse(xml, LazyNode) + comment = doc[1] + @test nodetype(comment) == Comment + @test isempty(collect(eachchildnode(comment))) + end + end end +include("test_abstracttrees_ext.jl") +include("test_pugixml.jl") +include("test_libexpat.jl") 
+include("test_w3c.jl") diff --git a/test/test_abstracttrees_ext.jl b/test/test_abstracttrees_ext.jl new file mode 100644 index 0000000..e30bc5c --- /dev/null +++ b/test/test_abstracttrees_ext.jl @@ -0,0 +1,89 @@ +import AbstractTrees + +@testset "AbstractTrees extension" begin + xml = """ + + + + + One + Alice + + + Two + + + """ + + @testset "extension is loaded" begin + @test Base.get_extension(XML, :XMLAbstractTreesExt) !== nothing + end + + @testset "children (Node)" begin + doc = parse(xml, Node) + @test AbstractTrees.children(doc) == XML.children(doc) + lib = first(filter(c -> nodetype(c) == Element, XML.children(doc))) + @test AbstractTrees.children(lib) == XML.children(lib) + + title = first(filter(c -> nodetype(c) == Element, XML.children(lib)))[1] + # `One` — title element's only child is a Text node with no children + @test isempty(AbstractTrees.children(title)) + end + + @testset "children (LazyNode)" begin + ldoc = parse(xml, LazyNode) + @test length(AbstractTrees.children(ldoc)) == length(XML.children(ldoc)) + lib = first(filter(c -> nodetype(c) == Element, XML.children(ldoc))) + @test length(AbstractTrees.children(lib)) == length(XML.children(lib)) + end + + @testset "nodevalue identity" begin + doc = parse(xml, Node) + @test AbstractTrees.nodevalue(doc) === doc + ldoc = parse(xml, LazyNode) + @test AbstractTrees.nodevalue(ldoc) === ldoc + end + + @testset "traits" begin + @test AbstractTrees.NodeType(Node) === AbstractTrees.HasNodeType() + @test AbstractTrees.NodeType(LazyNode) === AbstractTrees.HasNodeType() + @test AbstractTrees.nodetype(Node{String}) === Node{String} + @test AbstractTrees.ChildIndexing(Node) === AbstractTrees.IndexedChildren() + end + + @testset "PreOrderDFS visits every node" begin + doc = parse(xml, Node) + elements = [n for n in AbstractTrees.PreOrderDFS(doc) if nodetype(n) == Element] + @test map(tag, elements) == ["library", "book", "title", "author", "book", "title"] + + ldoc = parse(xml, LazyNode) + lelements = [n for n 
in AbstractTrees.PreOrderDFS(ldoc) if nodetype(n) == Element] + @test map(tag, lelements) == ["library", "book", "title", "author", "book", "title"] + end + + @testset "printnode labels" begin + @test sprint(AbstractTrees.printnode, Element("div", "hi"; class="main")) == "
" + @test sprint(AbstractTrees.printnode, Text("hello")) == "\"hello\"" + @test sprint(AbstractTrees.printnode, Comment("c")) == "" + @test sprint(AbstractTrees.printnode, CData("xyz")) == "" + @test sprint(AbstractTrees.printnode, DTD("note")) == "" + @test sprint(AbstractTrees.printnode, ProcessingInstruction("xml-stylesheet", "type=\"text/xsl\"")) == + "" + @test sprint(AbstractTrees.printnode, Declaration(version="1.0")) == "" + @test sprint(AbstractTrees.printnode, Document()) == "Document" + + ldoc = parse("hi", LazyNode) + a = ldoc[1] + @test sprint(AbstractTrees.printnode, a) == "" + end + + @testset "print_tree round-trips structure" begin + doc = parse("", Node) + out = sprint(AbstractTrees.print_tree, doc) + @test occursin("Document", out) + @test occursin("", out) + @test occursin("", out) + @test occursin("", out) + @test occursin("", out) + end +end diff --git a/test/test_libexpat.jl b/test/test_libexpat.jl new file mode 100644 index 0000000..9ac8955 --- /dev/null +++ b/test/test_libexpat.jl @@ -0,0 +1,389 @@ +# Test cases inspired by libexpat (https://github.com/libexpat/libexpat, MIT license) +# Translated from expat/tests/basic_tests.c + +using XML +using XML: Node, nodetype, Document, Element, Comment, CData, ProcessingInstruction, Text, Declaration, DTD +using XML: tag, value, children, attributes, simple_value +using Test + +@testset "libexpat-inspired" begin + + #==========================================================================# + # Character References # + #==========================================================================# + @testset "Decimal character references" begin + doc = parse("éè", Node) + @test simple_value(children(doc)[1]) == "éè" + end + + @testset "Hex character references" begin + doc = parse("éè", Node) + @test simple_value(children(doc)[1]) == "éè" + end + + @testset "Mixed char refs and text" begin + doc = parse("abcdef", Node) + @test simple_value(children(doc)[1]) == "abcdef" + end + + @testset "Large 
Unicode code points" begin + # CJK Unified Ideograph + doc = parse("世界", Node) + @test simple_value(children(doc)[1]) == "世界" + end + + #==========================================================================# + # UTF-8 Content # + #==========================================================================# + @testset "UTF-8 BOM" begin + bom = "\xef\xbb\xbf" + doc = parse(bom * "", Node) + @test nodetype(doc) == Document + end + + @testset "UTF-8 element content" begin + doc = parse("Ünïcödé", Node) + @test simple_value(children(doc)[1]) == "Ünïcödé" + end + + @testset "UTF-8 in attribute values" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "café" + end + + @testset "UTF-8 element names" begin + # XML.jl tokenizer does not yet support non-ASCII characters in element names + @test_broken try + parse("", Node) + true + catch + false + end + end + + @testset "Multi-byte UTF-8 sequences" begin + # 2-byte: ñ (U+00F1) + doc = parse("ñ", Node) + @test simple_value(children(doc)[1]) == "ñ" + + # 3-byte: 世 (U+4E16) + doc = parse("", Node) + @test simple_value(children(doc)[1]) == "世" + + # 4-byte: 𤭢 (U+24B62) + doc = parse("𤭢", Node) + @test simple_value(children(doc)[1]) == "𤭢" + end + + #==========================================================================# + # CDATA # + #==========================================================================# + @testset "Basic CDATA" begin + doc = parse("Hello!]]>", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata) == 1 + @test value(cdata[1]) == "Hello!" 
+ end + + @testset "CDATA with special characters" begin + doc = parse("\"']]>", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "&<>\"'" + end + + @testset "Multiple CDATA sections" begin + doc = parse("", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata) == 2 + @test value(cdata[1]) == "first" + @test value(cdata[2]) == "second" + end + + @testset "CDATA containing ]]" begin + # ]] without > is valid inside CDATA + doc = parse("", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "data]]with]]brackets" + end + + @testset "CDATA errors" begin + @test_throws Exception parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test length(decls) == 1 + @test decls[1]["version"] == "1.0" + end + + @testset "XML declaration with encoding" begin + doc = parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test decls[1]["encoding"] == "UTF-8" + end + + @testset "XML declaration with standalone" begin + doc = parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test decls[1]["standalone"] == "yes" + end + + @testset "Full XML declaration" begin + doc = parse("", Node) + decls = filter(x -> nodetype(x) == Declaration, children(doc)) + @test decls[1]["version"] == "1.0" + @test decls[1]["encoding"] == "UTF-8" + @test decls[1]["standalone"] == "no" + end + + #==========================================================================# + # Processing Instructions # + #==========================================================================# + @testset "Processing instructions" begin + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + + doc = parse("", Node) + root = children(doc)[1] + pis = filter(x -> nodetype(x) == 
ProcessingInstruction, children(root)) + @test length(pis) == 1 + end + + @testset "PI with no data" begin + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + end + + #==========================================================================# + # Comments # + #==========================================================================# + @testset "Comments in various positions" begin + # In prolog + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + + # Inside element + doc = parse("", Node) + root = children(doc)[1] + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 1 + + # After root element + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + end + + @testset "Comment with special content" begin + doc = parse("", Node) + root = children(doc)[1] + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test contains(value(comments[1]), "") + @test contains(value(comments[1]), "¬-entity;") + end + + #==========================================================================# + # DTD / DOCTYPE # + #==========================================================================# + @testset "DOCTYPE with internal subset" begin + xml = """ + +]> +text""" + doc = parse(xml, Node) + @test nodetype(doc) == Document + dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc)) + @test length(dtd_nodes) == 1 + root = filter(x -> nodetype(x) == Element, children(doc))[1] + @test tag(root) == "doc" + @test root["attr"] == "value" + text_nodes = filter(x -> nodetype(x) == Text, children(root)) + @test length(text_nodes) == 1 + @test value(text_nodes[1]) == "text" + end + + @testset "DOCTYPE with SYSTEM" begin + doc = parse("", Node) + dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc)) + @test length(dtd_nodes) == 1 + end 
+ + @testset "DOCTYPE with PUBLIC" begin + doc = parse("""""", Node) + dtd_nodes = filter(x -> nodetype(x) == DTD, children(doc)) + @test length(dtd_nodes) == 1 + end + + #==========================================================================# + # Entity Handling # + #==========================================================================# + @testset "Predefined entities" begin + doc = parse("<>&'"", Node) + @test simple_value(children(doc)[1]) == "<>&'\"" + end + + @testset "Entities in attribute values" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "" + end + + @testset "Mixed entities and text" begin + doc = parse("Hello & welcome <user>", Node) + @test simple_value(children(doc)[1]) == "Hello & welcome " + end + + #==========================================================================# + # Attribute Edge Cases # + #==========================================================================# + @testset "Empty attribute value" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "" + + doc = parse("""""", Node) + @test children(doc)[1]["attr"] == "" + end + + @testset "Attribute with entities" begin + doc = parse("", Node) + @test children(doc)[1]["attr"] == "a&b" + end + + @testset "Multiple attributes" begin + doc = parse("""""", Node) + el = children(doc)[1] + @test el["a"] == "1" + @test el["b"] == "2" + @test el["c"] == "3" + @test el["d"] == "4" + @test el["e"] == "5" + end + + @testset "Attribute error: duplicate" begin + @test_throws Exception parse("""""", Node) + end + + #==========================================================================# + # Nesting & Structure # + #==========================================================================# + @testset "Deeply nested elements" begin + xml = "deep" + doc = parse(xml, Node) + @test nodetype(doc) == Document + end + + @testset "Many sibling elements" begin + items = join(["$i" for i in 1:100]) + xml = "$items" + doc = parse(xml, Node) + root = 
children(doc)[1] + els = filter(x -> nodetype(x) == Element, children(root)) + @test length(els) == 100 + @test simple_value(els[1]) == "1" + @test simple_value(els[100]) == "100" + end + + @testset "Mismatched tags" begin + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + end + + @testset "Unclosed elements" begin + @test_throws Exception parse("", Node) + @test_throws Exception parse("text", Node) + end + + #==========================================================================# + # Line Endings # + #==========================================================================# + @testset "Various line endings in content" begin + # CR, LF, CRLF should all work + doc = parse("line1\nline2", Node) + @test nodetype(doc) == Document + + doc = parse("line1\rline2", Node) + @test nodetype(doc) == Document + + doc = parse("line1\r\nline2", Node) + @test nodetype(doc) == Document + end + + #==========================================================================# + # Empty Document Parts # + #==========================================================================# + @testset "Empty root element" begin + doc = parse("", Node) + root = children(doc)[1] + @test tag(root) == "doc" + @test isempty(filter(x -> nodetype(x) == Element, children(root))) + end + + @testset "Element with only whitespace" begin + doc = parse(" \n\t ", Node) + @test nodetype(doc) == Document + end + + @testset "Element with only comments" begin + doc = parse("", Node) + root = children(doc)[1] + els = filter(x -> nodetype(x) == Element, children(root)) + @test isempty(els) + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 2 + end + + #==========================================================================# + # Namespace-like Attributes # + #==========================================================================# + @testset "xmlns declarations" begin + doc = parse("""""", 
Node) + root = children(doc)[1] + @test root["xmlns"] == "http://example.com" + @test root["xmlns:ns"] == "http://example.com/ns" + els = filter(x -> nodetype(x) == Element, children(root)) + @test tag(els[1]) == "ns:child" + end + + @testset "Namespaced attributes" begin + doc = parse("""""", Node) + root = children(doc)[1] + @test root["xml:lang"] == "en" + @test root["xml:space"] == "preserve" + end + + #==========================================================================# + # Large Content # + #==========================================================================# + @testset "Long attribute value" begin + long_val = repeat("x", 10_000) + doc = parse("", Node) + @test children(doc)[1]["attr"] == long_val + end + + @testset "Long text content" begin + long_text = repeat("Hello World! ", 1000) + doc = parse("$long_text", Node) + @test simple_value(children(doc)[1]) == long_text + end + + @testset "Long CDATA" begin + long_cdata = repeat("data<>& ", 1000) + doc = parse("", Node) + root = children(doc)[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == long_cdata + end +end diff --git a/test/test_libxml2_testcases.jl b/test/test_libxml2_testcases.jl new file mode 100644 index 0000000..0b8a89a --- /dev/null +++ b/test/test_libxml2_testcases.jl @@ -0,0 +1,1578 @@ +# Test cases borrowed from the libxml2 test suite (https://github.com/GNOME/libxml2). +# +# libxml2 is Copyright (C) the GNOME Project and contributors, licensed under the MIT License. +# These test cases are adapted for the XML.jl Julia package. 
+# +# Categories mirror the libxml2 test/ directory structure: +# - CDATA handling +# - Comments +# - Processing instructions +# - Attributes (normalization, entities, quoting) +# - Namespaces +# - DTD / internal subset +# - Entity references (character refs, predefined, internal general) +# - Whitespace / blank handling +# - Well-formedness (boundaries, big names, mixed content) +# - Error cases (must fail to parse) + +using XML +using XML: Document, Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text +using XML: escape, unescape +using Test + +@testset "libxml2 test cases" begin + +#==============================================================================# +# CDATA SECTIONS # +# From: test/cdata, test/cdata2, test/adjacent-cdata.xml, # +# test/emptycdata.xml, test/cdata-*-byte-UTF-8.xml # +#==============================================================================# +@testset "CDATA" begin + @testset "cdata: basic CDATA with markup characters" begin + # libxml2 test/cdata + xml = """\nHello, world!]]>\n""" + doc = parse(xml, Node) + root = doc[1] + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) >= 1 + @test value(cdata_nodes[1]) == "Hello, world!" 
+ end + + @testset "cdata2: nested CDATA-like content" begin + # libxml2 test/cdata2 - tests ]]> escaping pattern + xml = """ + + ]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "collection" + end + + @testset "adjacent-cdata: three adjacent CDATA sections" begin + # libxml2 test/adjacent-cdata.xml + xml = "" + doc = parse(xml, Node) + root = doc[1] + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) == 3 + @test value(cdata_nodes[1]) == "abc" + @test value(cdata_nodes[2]) == "def" + @test value(cdata_nodes[3]) == "ghi" + end + + @testset "emptycdata: empty CDATA section in namespaced doc" begin + # libxml2 test/emptycdata.xml + xml = """ + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "html" + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) >= 1 + @test value(cdata_nodes[1]) == "" + end + + @testset "cdata-2-byte-UTF-8: two-byte chars across buffer boundary" begin + # libxml2 test/cdata-2-byte-UTF-8.xml - tests Č (U+010C, 2 bytes in UTF-8) + long_c = repeat("Č", 400) + xml = """\n\n

\n
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + p = first(filter(x -> nodetype(x) == Element, children(root))) + cdata = first(filter(x -> nodetype(x) == CData, children(p))) + @test value(cdata) == long_c + end + + @testset "cdata-3-byte-UTF-8: three-byte chars across buffer boundary" begin + # libxml2 test/cdata-3-byte-UTF-8.xml - tests 牛 (U+725B, 3 bytes in UTF-8) + long_cow = repeat("牛", 400) + xml = """\n\n

\n
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + p = first(filter(x -> nodetype(x) == Element, children(root))) + cdata = first(filter(x -> nodetype(x) == CData, children(p))) + @test value(cdata) == long_cow + end + + @testset "cdata-4-byte-UTF-8: four-byte chars across buffer boundary" begin + # libxml2 test/cdata-4-byte-UTF-8.xml - tests 🍦 (U+1F366, 4 bytes in UTF-8) + long_ice = repeat("🍦", 334) + xml = """\n\n

\n
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + p = first(filter(x -> nodetype(x) == Element, children(root))) + cdata = first(filter(x -> nodetype(x) == CData, children(p))) + @test value(cdata) == long_ice + end +end + +#==============================================================================# +# COMMENTS # +# From: test/comment.xml through test/comment6.xml, test/badcomment.xml # +#==============================================================================# +@testset "Comments" begin + @testset "comment: comments inside element" begin + # libxml2 test/comment.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 2 + @test contains(value(comments[1]), "document start") + @test contains(value(comments[2]), "document end") + end + + @testset "comment2: comments outside root element" begin + # libxml2 test/comment2.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + top_comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(top_comments) == 2 + @test contains(value(top_comments[1]), "document start") + @test contains(value(top_comments[2]), "document end") + end + + @testset "comment3: very long comment (buffer boundary test)" begin + # libxml2 test/comment3.xml - 150+ lines of repeated digits + lines = join([repeat("01234567890123456789012345678901234567890123456789", 1) for _ in 1:150], "\n") + comment_text = " test of very very long comments and buffer limits\n" * lines * "\n" + xml = """\n\n""" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) >= 1 + @test length(value(comments[1])) > 7000 + end + + @testset "comment5: hyphens and line breaks in comments" begin + # libxml2 test/comment5.xml + xml = """ + +""" + doc = parse(xml, Node) + comments = filter(x -> 
nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + @test contains(value(comments[1]), "hyphen") + @test contains(value(comments[1]), "- - -") + end + + @testset "comment6: comment before DOCTYPE" begin + # libxml2 test/comment6.xml + xml = """ + +]> +
""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Comment + @test nodetype(typed[2]) == DTD + @test nodetype(typed[3]) == Element + end + + @testset "badcomment: comment with markup-like content" begin + # libxml2 test/badcomment.xml - note: libxml2 considers this valid XML + xml = """ + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "foo" + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) >= 1 + end + + @testset "comment4: non-ASCII characters in comments" begin + # libxml2 test/comment4.xml (adapted from ISO-8859-1 to UTF-8) + xml = """ + + + +""" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 3 + @test contains(value(comments[1]), "là") + @test contains(value(comments[2]), "à") + end +end + +#==============================================================================# +# PROCESSING INSTRUCTIONS # +# From: test/pi.xml, test/pi2.xml # +#==============================================================================# +@testset "Processing Instructions" begin + @testset "pi: PIs inside root element" begin + # libxml2 test/pi.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(root)) + @test length(pis) == 2 + @test tag(pis[1]) == "document-start" + @test value(pis[1]) == "doc" + @test tag(pis[2]) == "document-end" + @test value(pis[2]) == "doc" + end + + @testset "pi2: PIs outside root element" begin + # libxml2 test/pi2.xml + xml = """ + + + + +""" + doc = parse(xml, Node) + top_pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(top_pis) == 2 + @test tag(top_pis[1]) == "document-start" + @test tag(top_pis[2]) == "document-end" + end +end + 
+#==============================================================================# +# ATTRIBUTES # +# From: test/att1 through test/att11, test/attrib.xml, # +# test/def-xml-attr.xml, test/defattr.xml # +#==============================================================================# +@testset "Attributes" begin + @testset "att1: attribute with newlines (whitespace normalization)" begin + # libxml2 test/att1 + xml = "" + doc = parse(xml, Node) + @test tag(doc[1]) == "doc" + @test haskey(doc[1], "attr") + end + + @testset "att2: attribute with multiple spaces" begin + # libxml2 test/att2 + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "to normalize with a space" + end + + @testset "att3: attribute with character references" begin + # libxml2 test/att3 + xml = """""" + doc = parse(xml, Node) + @test tag(doc[1]) == "select" + @test haskey(doc[1], "onclick") + end + + @testset "att4: complex document with many attributes" begin + # Adapted from libxml2 test/att4 (electroxml document) + xml = """ + + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "electroxml" + @test root["modified"] == "20021216T072726" + end + + @testset "attrib: attribute with entities and char refs" begin + # libxml2 test/attrib.xml + xml = """""" + doc = parse(xml, Node) + @test tag(doc[1]) == "item" + @test doc[1]["url"] == "http://example.com/" + @test doc[1]["visits"] == "1" + end + + @testset "att5: attribute with empty value" begin + # Adapted from libxml2 test/att5 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["a"] == "" + @test root["b"] == "val" + end + + @testset "att9: attribute with single quotes in double-quoted value" begin + # libxml2 test/att9 pattern + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "it's a test" + end + + @testset "att10: attribute with double quotes in single-quoted value" begin + xml 
= """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "he said \"hello\"" + end + + @testset "att11: attribute values with entity refs" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["a"] == "" + @test doc[1]["b"] == "a&b" + end + + @testset "def-xml-attr: xml:lang default attribute in DTD" begin + # libxml2 test/def-xml-attr.xml (just verify parsing doesn't fail) + xml = """ + + +]> + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "root" + end +end + +#==============================================================================# +# NAMESPACES # +# From: test/ns through test/ns7, test/namespaces/err_*.xml, # +# test/nsclean.xml, test/entity-in-ns-uri.xml # +#==============================================================================# +@testset "Namespaces" begin + @testset "ns: namespace with prefix on element and attribute" begin + # libxml2 test/ns + xml = """ + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "dia:diagram" + @test root["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/" + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test tag(child) == "dia:diagramdata" + @test child["dia:testattr"] == "test" + end + + @testset "ns2: namespace on self-closing element" begin + # libxml2 test/ns2 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "dia:diagram" + @test root["dia:testattr"] == "test" + end + + @testset "ns3: xmlns declared after prefixed attribute" begin + # libxml2 test/ns3 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["dia:testattr"] == "test" + @test root["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/" + end + + @testset "ns4: xml:lang, xml:link, xml:space built-in attributes" begin + # libxml2 
test/ns4 + xml = """ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["xml:lang"] == "en" + @test root["xml:space"] == "preserve" + end + + @testset "ns5: default namespace on element with prefix on another" begin + # libxml2 test/ns5 + xml = """ + +""" + doc = parse(xml, Node) + root = doc[1] + @test root["xmlns"] == "http://example.org/ns/1" + @test root["xmlns:rng"] == "http://example.org/ns/1" + @test root["name"] == "foo" + end + + @testset "ns6: default namespace on child, not on sibling" begin + # libxml2 test/ns6 + xml = """ + + +""" + doc = parse(xml, Node) + root = doc[1] + elements = filter(x -> nodetype(x) == Element, children(root)) + @test tag(elements[1]) == "foo" + @test elements[1]["xmlns"] == "http://abc" + @test tag(elements[2]) == "bar" + end + + @testset "ns7: xml: prefix element (built-in)" begin + # libxml2 test/ns7 + xml = "" + doc = parse(xml, Node) + @test tag(doc[1]) == "xml:test" + end + + @testset "multiple namespace prefixes" begin + xml = """ + + +""" + doc = parse(xml, Node) + root = doc[1] + elements = filter(x -> nodetype(x) == Element, children(root)) + @test tag(elements[1]) == "a:child" + @test elements[1]["a:attr"] == "1" + @test tag(elements[2]) == "b:child" + @test elements[2]["b:attr"] == "2" + end + + @testset "namespace redeclaration on nested element" begin + xml = """ + + + +""" + doc = parse(xml, Node) + root = doc[1] + child = first(filter(x -> nodetype(x) == Element, children(root))) + @test child["xmlns:a"] == "http://second.com" + end +end + +#==============================================================================# +# DTD / INTERNAL SUBSET # +# From: test/dtd1 through test/dtd13, test/intsubset.xml, # +# test/intsubset2.xml # +#==============================================================================# +@testset "DTD / Internal Subset" begin + @testset "dtd1: DOCTYPE with PUBLIC id" begin + # libxml2 test/dtd1 + xml = """ + + +""" + doc = 
parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "MEMO") + @test contains(value(dtd), "PUBLIC") + end + + @testset "dtd2: simple internal subset with ELEMENT declaration" begin + # libxml2 test/dtd2 + xml = """ +]> +This is a valid document !""" + doc = parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "ELEMENT") + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test simple_value(root) == "This is a valid document !" + end + + @testset "dtd3: ANY content model" begin + # libxml2 test/dtd3 + xml = """ +]> +This is a valid document !""" + doc = parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "ANY") + end + + @testset "dtd4: EMPTY content model" begin + # libxml2 test/dtd4 + xml = """ +]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + @test length(children(root)) == 0 + end + + @testset "dtd5: mixed content model" begin + # libxml2 test/dtd5 + xml = """ + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 2 + @test tag(elements[1]) == "a" + @test tag(elements[2]) == "b" + end + + @testset "dtd6: choice content model" begin + # libxml2 test/dtd6 + xml = """ + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 3 + end + + @testset "dtd7: sequence content model" begin + # libxml2 test/dtd7 + xml = """ + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = 
filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 2 + @test tag(elements[1]) == "a" + @test tag(elements[2]) == "b" + end + + @testset "dtd8: nested choice and sequence" begin + # libxml2 test/dtd8 + xml = """ + + + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test tag(elements[1]) == "b" + @test tag(elements[2]) == "c" + end + + @testset "dtd9: optional content model" begin + # libxml2 test/dtd9 + xml = """ + + + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 2 + end + + @testset "dtd10: mixed repetition content model" begin + # libxml2 test/dtd10 + xml = """ + + + + +]> +This is a valid document""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + elements = filter(x -> nodetype(x) == Element, children(root)) + @test length(elements) == 3 + end + + @testset "dtd11: ATTLIST with CDATA #IMPLIED" begin + # libxml2 test/dtd11 + xml = """ + +]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["val"] == "v1" + end + + @testset "dtd12: nested entity references" begin + # libxml2 test/dtd12 - entity referencing another entity + xml = """ + +]> +&WhatHeSaid;""" + # This may or may not expand depending on XML.jl's entity handling + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + end + + @testset "dtd13: comments before and after DOCTYPE" begin + # libxml2 test/dtd13 + xml = """ + +]> + +""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + @test nodetype(typed[1]) == Comment + @test nodetype(typed[2]) == DTD + @test 
nodetype(typed[3]) == Comment + @test nodetype(typed[4]) == Element + end + + @testset "intsubset: internal subset with comment containing quote" begin + # libxml2 test/intsubset.xml + xml = """ + + +]> +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "root" + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "ELEMENT") + end +end + +#==============================================================================# +# ENTITY REFERENCES # +# From: test/ent1 through test/ent11, test/ent6hex # +#==============================================================================# +@testset "Entity References" begin + @testset "ent1: internal general entity declaration and use" begin + # libxml2 test/ent1 + xml = """ + +]> + + &xml; +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "EXAMPLE" + end + + @testset "ent3: entity refs in attribute values" begin + # libxml2 test/ent3 + xml = """ + +]> + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test root["prop1"] == "a&b" + @test root["prop2"] == "c + + This is an inverted exclamation sign ¡ + This is a space +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + text = join([value(c) for c in children(root) if nodetype(c) == Text]) + @test contains(text, "\u00A1") # ¡ + @test contains(text, " ") # space ( ) + end + + @testset "ent6: predefined entities with double-escaping" begin + # libxml2 test/ent6 + xml = """ + + + + +]> +<""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + end + + @testset "ent8: multiple entities in one document" begin + # libxml2 test/ent8 + xml = """ + +]> + +&test1;&test2; +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, 
children(doc))) + @test tag(root) == "doc" + end + + @testset "predefined entities in text content" begin + xml = "& < > ' "" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "& < > ' \"" + end + + @testset "predefined entities in attributes" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["a"] == "&" + @test doc[1]["b"] == "<" + @test doc[1]["c"] == ">" + @test doc[1]["d"] == "'" + @test doc[1]["e"] == "\"" + end + + @testset "decimal character references" begin + xml = "ABC" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "ABC" + end + + @testset "hexadecimal character references" begin + xml = "ABC" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "ABC" + end + + @testset "mixed hex and decimal char refs" begin + xml = "Hello" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "Hello" + end + + @testset "char ref for non-ASCII: inverted exclamation" begin + xml = "¡" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "\u00A1" + end + + @testset "char ref for CJK character" begin + xml = "" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "中" + end + + @testset "char ref for emoji" begin + xml = "😀" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "\U0001F600" + end +end + +#==============================================================================# +# WHITESPACE / BLANK HANDLING # +# From: test/tstblanks.xml, test/title.xml # +#==============================================================================# +@testset "Whitespace / Blank Handling" begin + @testset "title: simple document with encoding" begin + # libxml2 test/title.xml + xml = """ +my title""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "title" + @test simple_value(root) == "my title" + end + + @testset "whitespace preservation in text content" begin + xml = " hello world " + doc = parse(xml, Node) + @test simple_value(doc[1]) == " hello world " + end + + 
@testset "tab and newline preservation" begin + xml = "\t\n\ttabbed\n" + doc = parse(xml, Node) + @test simple_value(doc[1]) == "\t\n\ttabbed\n" + end + + @testset "whitespace-only text node" begin + xml = " " + doc = parse(xml, Node) + @test simple_value(doc[1]) == " " + end + + @testset "inter-element whitespace preserved" begin + xml = "\n \n \n" + doc = parse(xml, Node) + root = doc[1] + text_nodes = filter(x -> nodetype(x) == Text, children(root)) + @test length(text_nodes) >= 1 + end +end + +#==============================================================================# +# WELL-FORMED DOCUMENTS # +# From: test/boundaries1.xml, test/bigname.xml, test/bigname2.xml, # +# test/slashdot.xml, test/eve.xml, test/wap.xml, etc. # +#==============================================================================# +@testset "Well-Formed Documents" begin + @testset "boundaries1: boundary conditions with entities and CDATA" begin + # libxml2 test/boundaries1.xml (simplified - without DTD entity expansion) + xml = """ +"> + '> +]> + + +text + + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "d" + @test root["a"] == ">" + @test root["b"] == ">" + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) == 1 + @test value(cdata_nodes[1]) == "cdata" + end + + @testset "bigname: very long element name" begin + # libxml2 test/bigname.xml - element name with >10000 characters + longname = "this_is_a_very_large_name_" * repeat("0123456789", 500) * "_end" + xml = "<$(longname)/>" + doc = parse(xml, Node) + @test tag(doc[1]) == longname + end + + @testset "slashdot: real-world XML (ultramode feed)" begin + # libxml2 test/slashdot.xml (simplified) + xml = """ + + 100 Mbit/s on Fibre to the home + http://slashdot.org/articles/99/06/06/1440211.shtml + + CmdrTaco + wouldn't-it-be-nice + internet + 20 +
articles
+ topicinternet.jpg +
+ + Gimp 1.2 Preview + http://slashdot.org/articles/99/06/06/1438246.shtml + + CmdrTaco + stuff-to-read + gimp + 12 +
articles
+ topicgimp.gif +
+
""" + doc = parse(xml, Node) + root = doc[1] + @test tag(root) == "ultramode" + stories = filter(x -> nodetype(x) == Element && tag(x) == "story", children(root)) + @test length(stories) == 2 + title1 = first(filter(x -> nodetype(x) == Element && tag(x) == "title", + children(stories[1]))) + @test simple_value(title1) == "100 Mbit/s on Fibre to the home" + end + + @testset "eve: document with external DTD reference and internal entity" begin + # libxml2 test/eve.xml + xml = """ + +]> + +""" + doc = parse(xml, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc))) + @test contains(value(dtd), "PUBLIC") + @test contains(value(dtd), "ENTITY") + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "spec" + end + + @testset "deeply nested document" begin + xml = "
deep" + doc = parse(xml, Node) + @test simple_value(doc[1][1][1][1][1][1][1][1][1][1]) == "deep" + end + + @testset "many sibling elements" begin + items = join(["Item $i" for i in 1:200]) + xml = "$items" + doc = parse(xml, Node) + elements = filter(x -> nodetype(x) == Element, children(doc[1])) + @test length(elements) == 200 + @test elements[1]["n"] == "1" + @test elements[200]["n"] == "200" + end + + @testset "mixed content: text, elements, CDATA, comments, PIs" begin + xml = """ + text before + child text + + + + text after +""" + doc = parse(xml, Node) + root = doc[1] + types = Set(nodetype(c) for c in children(root)) + @test Text in types + @test Element in types + @test Comment in types + @test CData in types + @test ProcessingInstruction in types + end + + @testset "self-closing elements" begin + xml = "

" + doc = parse(xml, Node) + elements = filter(x -> nodetype(x) == Element, children(doc[1])) + @test length(elements) == 3 + @test tag(elements[1]) == "br" + @test tag(elements[2]) == "hr" + @test tag(elements[3]) == "img" + @test all(x -> length(children(x)) == 0, elements) + end + + @testset "empty element: start-tag and end-tag" begin + xml = "" + doc = parse(xml, Node) + el = first(filter(x -> nodetype(x) == Element, children(doc[1]))) + @test tag(el) == "empty" + end + + @testset "element names with hyphens, dots, underscores" begin + xml = "<_private/>" + doc = parse(xml, Node) + @test tag(doc[1]) == "my-root" + elements = filter(x -> nodetype(x) == Element, children(doc[1])) + @test tag(elements[1]) == "sub.element" + @test tag(elements[2]) == "_private" + end + + @testset "element names starting with underscore" begin + xml = "<_root><__child/>" + doc = parse(xml, Node) + @test tag(doc[1]) == "_root" + end + + @testset "numeric element names (with letter prefix)" begin + xml = "

heading

" + doc = parse(xml, Node) + @test tag(doc[1]) == "h1" + @test simple_value(doc[1]) == "heading" + end +end + +#==============================================================================# +# ROUNDTRIP: PARSE → WRITE → PARSE # +# Tests that libxml2-style documents survive roundtrip processing # +#==============================================================================# +@testset "Roundtrip" begin + @testset "roundtrip: namespaced document" begin + xml = """ + + +""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + @test root["xmlns:dia"] == "http://www.lysator.liu.se/~alla/dia/" + end + + @testset "roundtrip: DTD with internal subset" begin + xml = """ +]> +text""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + dtd = first(filter(x -> nodetype(x) == DTD, children(doc2))) + @test contains(value(dtd), "ELEMENT") + end + + @testset "roundtrip: adjacent CDATA sections" begin + xml = "" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + cdata_nodes = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata_nodes) == 2 + end + + @testset "roundtrip: processing instructions" begin + xml = """ + + +""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc2)) + @test length(pis) == 2 + end + + @testset "roundtrip: comments with special characters" begin + xml = "" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc2))) + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test length(comments) == 1 + end + + @testset "roundtrip: entities in attributes" begin + xml = """""" + doc = parse(xml, Node) + s = XML.write(doc) + doc2 = parse(s, Node) + root = first(filter(x -> 
nodetype(x) == Element, children(doc2))) + @test root["a"] == "a&b" + @test root["b"] == "cx", Node) + @test nodetype(doc) == Document + end + + @testset "errors/invalid-start-tag-1: text-only document" begin + # libxml2 test/errors/invalid-start-tag-1.xml + # XML.jl is lenient: treats bare text as a Text node + doc = parse("x", Node) + @test nodetype(doc) == Document + end + + @testset "errors/invalid-start-tag-2: lone <" begin + # libxml2 test/errors/invalid-start-tag-2.xml + @test_throws Exception parse("<", Node) + end + + @testset "errors/doctype1: malformed DOCTYPE" begin + # libxml2 test/errors/doctype1.xml - "[]>" + # XML.jl is lenient: parses the DOCTYPE and treats []> as text + doc = parse("[]>\n", Node) + @test nodetype(doc) == Document + end + + @testset "errors/dup-xml-attr: duplicate xml: attribute" begin + # libxml2 test/errors/dup-xml-attr.xml + @test_throws Exception parse("""""", Node) + end + + @testset "errors/attr5: duplicate attribute" begin + # libxml2 test/errors/attr5.xml + @test_throws Exception parse(""" + +""", Node) + end + + @testset "mismatched tags" begin + @test_throws Exception parse("
", Node) + end + + @testset "overlapping elements" begin + @test_throws Exception parse("", Node) + end + + @testset "unclosed root element" begin + @test_throws Exception parse("", Node) + end + + @testset "close tag without open" begin + @test_throws Exception parse("", Node) + end + + @testset "unclosed comment" begin + @test_throws Exception parse("
" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc[1])) + @test contains(value(comments[1]), "héllo") + end + + @testset "Unicode in CDATA" begin + xml = "" + doc = parse(xml, Node) + cdata = first(filter(x -> nodetype(x) == CData, children(doc[1]))) + @test value(cdata) == "日本語テスト" + end + + @testset "Unicode in PI content" begin + xml = "" + doc = parse(xml, Node) + pi = first(filter(x -> nodetype(x) == ProcessingInstruction, children(doc[1]))) + @test contains(value(pi), "données") + end + + @testset "UTF-8 BOM handling" begin + # libxml2 test/utf8bom.xml pattern + xml = "\xef\xbb\xbf\n" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "doc" + end +end + +#==============================================================================# +# REAL-WORLD DOCUMENT PATTERNS # +# Patterns commonly tested by libxml2 (DAV, RDF, SOAP, SVG, etc.) # +#==============================================================================# +@testset "Real-World Document Patterns" begin + @testset "WebDAV-like document" begin + # Inspired by libxml2 test/dav* series + xml = """ + + + /container/ + + + Example collection + + + HTTP/1.1 200 OK + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "D:multistatus" + @test root["xmlns:D"] == "DAV:" + end + + @testset "RDF-like document" begin + # Inspired by libxml2 test/rdf1, test/rdf2 + xml = """ + + + Example Resource + John Doe + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "rdf:RDF" + desc = first(filter(x -> nodetype(x) == Element, children(root))) + @test desc["rdf:about"] == "http://example.org/resource" + end + + @testset "SVG-like document" begin + # Inspired by libxml2 test/svg1, test/svg2, test/svg3 + xml = """ + + + + + + + + + + Hello SVG +""" + doc = parse(xml, Node) + root = 
first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "svg" + @test root["xmlns"] == "http://www.w3.org/2000/svg" + @test root["width"] == "200" + end + + @testset "SOAP-like envelope" begin + xml = """ + + + + New York + US + + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "soap:Envelope" + end + + @testset "Atom feed" begin + xml = """ + + Example Feed + + 2003-12-13T18:30:02Z + + John Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. + +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "feed" + @test root["xmlns"] == "http://www.w3.org/2005/Atom" + end + + @testset "plist-like document" begin + xml = """ + + + + Name + Example + Version + 42 + Enabled + + Tags + + alpha + beta + + +""" + doc = parse(xml, Node) + plist = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(plist) == "plist" + @test plist["version"] == "1.0" + end + + @testset "XHTML with mixed content" begin + xml = """ + + Test + +

This is emphasized and strong text.

+

A link: click here.

+
+
  preformatted  text  
+ +""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "html" + @test root["xmlns"] == "http://www.w3.org/1999/xhtml" + end + + @testset "MathML-like document" begin + xml = """ + + x2 + + + y2 + = + z2 + +""" + doc = parse(xml, Node) + root = doc[1] + @test tag(root) == "math" + @test root["xmlns"] == "http://www.w3.org/1998/Math/MathML" + end + + @testset "WML-like document (mobile)" begin + # Inspired by libxml2 test/wml.xml + xml = """ + + + +

Welcome to WML

+
+
""" + doc = parse(xml, Node) + root = first(filter(x -> nodetype(x) == Element, children(doc))) + @test tag(root) == "wml" + end +end + +#==============================================================================# +# EDGE CASES # +# Additional edge cases inspired by libxml2 test patterns # +#==============================================================================# +@testset "Edge Cases" begin + @testset "CDATA containing ]] not followed by >" begin + xml = "" + doc = parse(xml, Node) + cdata = first(filter(x -> nodetype(x) == CData, children(doc[1]))) + @test value(cdata) == "a]]b" + end + + @testset "comment containing --" begin + # Note: -- inside comments is technically not well-formed per spec, + # but many parsers tolerate single - characters + xml = "" + doc = parse(xml, Node) + comments = filter(x -> nodetype(x) == Comment, children(doc[1])) + @test length(comments) == 1 + end + + @testset "attribute value containing >" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "a>b" + end + + @testset "attribute value containing single quote in double quotes" begin + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == "it's" + end + + @testset "attribute value containing double quote in single quotes" begin + xml = "" + doc = parse(xml, Node) + @test doc[1]["attr"] == "say \"hello\"" + end + + @testset "very long attribute value" begin + long_val = repeat("x", 10000) + xml = """""" + doc = parse(xml, Node) + @test doc[1]["attr"] == long_val + end + + @testset "very long text content" begin + long_text = repeat("word ", 5000) + xml = "$(long_text)" + doc = parse(xml, Node) + @test simple_value(doc[1]) == long_text + end + + @testset "many attributes on one element" begin + attrs = join(["a$i=\"v$i\"" for i in 1:50], " ") + xml = "" + doc = parse(xml, Node) + @test doc[1]["a1"] == "v1" + @test doc[1]["a50"] == "v50" + end + + @testset "whitespace around = in attributes" begin + xml = """""" + doc = parse(xml, Node) + @test 
doc[1]["a"] == "1" + @test doc[1]["b"] == "2" + end + + @testset "tab and newline in tag whitespace" begin + xml = "" + doc = parse(xml, Node) + @test doc[1]["a"] == "1" + @test doc[1]["b"] == "2" + end + + @testset "empty element: self-closing vs open-close" begin + xml1 = "" + xml2 = "" + doc1 = parse(xml1, Node) + doc2 = parse(xml2, Node) + # Both should produce empty elements + el1 = first(filter(x -> nodetype(x) == Element, children(doc1[1]))) + el2 = first(filter(x -> nodetype(x) == Element, children(doc2[1]))) + @test tag(el1) == tag(el2) == "x" + end + + @testset "document with all prolog components" begin + xml = """ + + + + +]> + + +&greeting;""" + doc = parse(xml, Node) + typed = filter(x -> nodetype(x) != Text, children(doc)) + type_list = map(nodetype, typed) + @test Declaration in type_list + @test DTD in type_list + @test Comment in type_list + @test ProcessingInstruction in type_list + @test Element in type_list + end +end + +end # top-level @testset diff --git a/test/test_pugixml.jl b/test/test_pugixml.jl new file mode 100644 index 0000000..6e46d5a --- /dev/null +++ b/test/test_pugixml.jl @@ -0,0 +1,308 @@ +# Test cases inspired by pugixml (https://github.com/zeux/pugixml, MIT license) +# Translated from tests/test_parse.cpp and tests/test_xpath.cpp + +using XML +using XML: Node, nodetype, Document, Element, Comment, CData, ProcessingInstruction, Text, Declaration +using XML: tag, value, children, attributes, simple_value, xpath +using Test + +@testset "pugixml-inspired" begin + + #==========================================================================# + # Processing Instructions # + #==========================================================================# + @testset "PI parsing" begin + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + + doc = parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + + doc = 
parse("", Node) + pis = filter(x -> nodetype(x) == ProcessingInstruction, children(doc)) + @test length(pis) == 1 + end + + @testset "PI errors" begin + # XML.jl is lenient about incomplete PIs without a root element, + # but these should fail when embedded in a document + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + end + + #==========================================================================# + # Comments # + #==========================================================================# + @testset "Comment parsing" begin + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test length(comments) == 1 + @test value(comments[1]) == "" + + doc = parse("", Node) + comments = filter(x -> nodetype(x) == Comment, children(doc)) + @test value(comments[1]) == "value" + + doc = parse("", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + comments = filter(x -> nodetype(x) == Comment, children(root)) + @test contains(value(comments[1]), "multi") + end + + @testset "Comment errors" begin + @test_throws Exception parse("", Node) + end + + #==========================================================================# + # CDATA # + #==========================================================================# + @testset "CDATA parsing" begin + doc = parse("", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test length(cdata) == 1 + @test value(cdata[1]) == "" + + doc = parse("", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "value" + + # CDATA preserves markup characters + doc = parse("Hello!]]>", Node) + root = filter(x -> nodetype(x) == Element, children(doc))[1] + cdata = filter(x -> nodetype(x) == CData, children(root)) + @test value(cdata[1]) == "Hello!" 
+ end + + @testset "CDATA errors" begin + @test_throws Exception parse("", Node) + @test tag(children(doc)[1]) == "node" + + doc = parse("", Node) + @test tag(children(doc)[1]) == "node" + + doc = parse("", Node) + @test tag(children(doc)[1]) == "node" + end + + @testset "Tag hierarchy" begin + doc = parse("", Node) + root = children(doc)[1] + @test tag(root) == "node" + root_els = filter(x -> nodetype(x) == Element, children(root)) + @test length(root_els) == 2 + @test tag(root_els[1]) == "n1" + @test tag(root_els[2]) == "n3" + end + + @testset "Tag errors" begin + @test_throws Exception parse("<", Node) + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + @test_throws Exception parse("", Node) + end + + #==========================================================================# + # Attribute Parsing # + #==========================================================================# + @testset "Attribute quotes" begin + doc = parse("", Node) + el = children(doc)[1] + @test el["id1"] == "v1" + @test el["id2"] == "v2" + end + + @testset "Attribute spaces around =" begin + doc = parse("", Node) + el = children(doc)[1] + @test el["id1"] == "v1" + @test el["id2"] == "v2" + @test el["id3"] == "v3" + @test el["id4"] == "v4" + end + + @testset "Attribute errors" begin + @test_throws Exception parse("", Node) + @test children(doc)[1]["id"] == "<>&'\"" + end + + @testset "Predefined entities in text" begin + doc = parse("<>&'"", Node) + @test simple_value(children(doc)[1]) == "<>&'\"" + end + + @testset "Numeric character references" begin + doc = parse(" ", Node) + @test simple_value(children(doc)[1]) == " " + end + + @testset "Unicode character references" begin + # Greek gamma + doc = parse("γ", Node) + @test simple_value(children(doc)[1]) == "γ" + + # Same char, lowercase hex + doc = parse("γ", Node) + @test simple_value(children(doc)[1]) == "γ" + end + + #==========================================================================# + # 
Whitespace # + #==========================================================================# + @testset "Whitespace text nodes preserved" begin + doc = parse(" ", Node) + root = children(doc)[1] + # Should have text nodes with whitespace + text_nodes = filter(x -> nodetype(x) == Text, children(root)) + @test length(text_nodes) >= 1 + end + + @testset "PCDATA content" begin + doc = parse("text content", Node) + @test simple_value(children(doc)[1]) == "text content" + end + + #==========================================================================# + # Unicode / CJK Content # + #==========================================================================# + @testset "Unicode element names (CJK)" begin + # XML.jl tokenizer does not yet support CJK characters in element/attribute names + @test_broken try + parse("<汉语>世界", Node) + true + catch + false + end + end + + @testset "Unicode text content" begin + doc = parse("Ünïcödé café naïve", Node) + @test simple_value(children(doc)[1]) == "Ünïcödé café naïve" + end + + #==========================================================================# + # Mixed Content # + #==========================================================================# + @testset "Mixed text, CDATA, comments" begin + xml = "First textSecond textLast text" + doc = parse(xml, Node) + root = children(doc)[1] + child_types = map(nodetype, children(root)) + @test Text in child_types + @test Comment in child_types + @test CData in child_types + end + + #==========================================================================# + # Complex Document # + #==========================================================================# + @testset "Complex document with all node types" begin + xml = """ + + + + + + some text + + + + + + +""" + doc = parse(xml, Node) + @test nodetype(doc) == Document + + root_els = filter(x -> nodetype(x) == Element, children(doc)) + @test length(root_els) == 1 + mesh = root_els[1] + @test tag(mesh) == "mesh" + @test mesh["name"] 
== "mesh_root" + + # Check inner content types + inner = children(mesh) + @test any(x -> nodetype(x) == Comment, inner) + @test any(x -> nodetype(x) == Text, inner) + @test any(x -> nodetype(x) == CData, inner) + @test any(x -> nodetype(x) == ProcessingInstruction, inner) + + nodes = filter(x -> nodetype(x) == Element && tag(x) == "node", inner) + @test length(nodes) == 2 + @test nodes[1]["attr1"] == "value1" + @test nodes[1]["attr2"] == "value2" + end + + #==========================================================================# + # XPath # + #==========================================================================# + @testset "XPath" begin + @testset "descendant with attribute predicate" begin + doc = parse("", Node) + results = xpath(doc, "//c[@id='b']") + @test length(results) == 1 + @test results[1]["id"] == "b" + end + + @testset "child with attribute" begin + doc = parse("", Node) + results = xpath(doc, "/a/c[@id]") + @test length(results) == 1 + @test results[1]["id"] == "b" + end + + @testset "wildcard with attribute predicate" begin + doc = parse("""test""", Node) + results = xpath(doc, "/node/*[@attr1]") + @test length(results) == 2 + end + + @testset "descendant-or-self with text()" begin + doc = parse("deep", Node) + results = xpath(doc, "//e/text()") + @test length(results) == 1 + @test value(results[1]) == "deep" + end + + @testset "positional predicate" begin + doc = parse("", Node) + results = xpath(doc, "/root/*[1]") + @test length(results) == 1 + @test tag(results[1]) == "a" + + results = xpath(doc, "/root/*[last()]") + @test length(results) == 1 + @test tag(results[1]) == "c" + end + + @testset "nested predicates" begin + doc = parse("""""", Node) + results = xpath(doc, "//subchild[@id]") + @test length(results) == 2 + end + end +end diff --git a/test/test_remote_files.jl b/test/test_remote_files.jl new file mode 100644 index 0000000..ed2b3e3 --- /dev/null +++ b/test/test_remote_files.jl @@ -0,0 +1,77 @@ +using XML +using XML: Document, 
Element, Declaration, Comment, CData, DTD, ProcessingInstruction, Text +using Downloads: download +using Test + +#==============================================================================# +# REMOTE XML FILE PARSING TESTS # +#==============================================================================# +# These tests download publicly available XML files and verify that XML.jl can +# parse them without error. A failed download (network issues, CI without +# internet, URL gone) is silently skipped — only parsing failures count as test +# failures. +# +# Not included in runtests.jl — run standalone: julia --project test/test_remote_files.jl + +function _try_download(url::AbstractString)::Union{String, Nothing} + try + path = download(url) + return read(path, String) + catch + return nothing + end +end + +const REMOTE_XML_URLS = [ + # ---- W3Schools example files ---- + ("W3Schools note.xml", "https://www.w3schools.com/xml/note.xml"), + ("W3Schools cd_catalog.xml", "https://www.w3schools.com/xml/cd_catalog.xml"), + ("W3Schools plant_catalog.xml", "https://www.w3schools.com/xml/plant_catalog.xml"), + ("W3Schools simple.xml", "https://www.w3schools.com/xml/simple.xml"), + ("W3Schools books.xml", "https://www.w3schools.com/xml/books.xml"), + + # ---- W3C SVG samples ---- + ("W3C SVG helloworld.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/helloworld.svg"), + ("W3C SVG tiger.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/tiger.svg"), + ("W3C SVG w3c.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/w3c.svg"), + ("W3C SVG lineargradient2.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/lineargradient2.svg"), + ("W3C SVG heart.svg", "https://dev.w3.org/SVG/tools/svgweb/samples/svg-files/heart.svg"), + + # ---- GitHub-hosted XML files ---- + ("JUnit XML complete example", "https://raw.githubusercontent.com/testmoapp/junitxml/main/examples/junit-complete.xml"), + ("JUnit XML basic example", 
"https://raw.githubusercontent.com/testmoapp/junitxml/main/examples/junit-basic.xml"), + ("PEPPOL invoice base example", "https://raw.githubusercontent.com/OpenPEPPOL/peppol-bis-invoice-3/master/rules/examples/base-example.xml"), + + # ---- Maven Central POM (real-world XML with namespaces) ---- + ("Maven JUnit 4.13.2 POM", "https://repo1.maven.org/maven2/junit/junit/4.13.2/junit-4.13.2.pom"), + ("Maven Guava 33.0 POM", "https://repo1.maven.org/maven2/com/google/guava/guava/33.0.0-jre/guava-33.0.0-jre.pom"), + + # ---- NASA RSS feed (live XML) ---- + ("NASA news RSS feed", "https://www.nasa.gov/news-release/feed/"), +] + +@testset "Remote XML Parsing" begin + for (label, url) in REMOTE_XML_URLS + @testset "$label" begin + xml_str = _try_download(url) + if isnothing(xml_str) + @info "Skipping $label — download failed" url + @test_skip false + else + doc = parse(xml_str, Node) + @test nodetype(doc) == Document + @test length(children(doc)) > 0 + + # Verify at least one Element exists somewhere in the document + has_element = any(x -> nodetype(x) == Element, children(doc)) + @test has_element + + # Verify write produces output and can be re-parsed + xml_out = XML.write(doc) + @test length(xml_out) > 0 + doc2 = parse(xml_out, Node) + @test nodetype(doc2) == Document + end + end + end +end diff --git a/test/test_tokenizer.jl b/test/test_tokenizer.jl new file mode 100644 index 0000000..89c7145 --- /dev/null +++ b/test/test_tokenizer.jl @@ -0,0 +1,425 @@ +using Test, XML + +using XML.XMLTokenizer + +# Convenience: collect token kinds from a string +kinds(xml) = [t.kind for t in tokenize(xml)] +raws(xml) = [String(t.raw) for t in tokenize(xml)] + +@testset "XMLTokenizer" begin + +#-----------------------------------------------------------------------# Basic text +@testset "plain text" begin + toks = collect(tokenize("hello world")) + @test length(toks) == 1 + @test toks[1].kind == TokenKinds.TEXT + @test toks[1].raw == "hello world" +end + +@testset "empty string" begin + 
@test isempty(collect(tokenize(""))) +end + +#-----------------------------------------------------------------------# Open tags +@testset "open tag without attributes" begin + @test kinds("
") == [TokenKinds.OPEN_TAG, TokenKinds.TAG_CLOSE] + @test raws("
") == [""] +end + +@testset "open tag with attributes" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TokenKinds.OPEN_TAG, + TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, + TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, + TokenKinds.TAG_CLOSE, + ] + @test tag_name(toks[1]) == "a" + @test toks[2].raw == "href" + @test attr_value(toks[3]) == "url" + @test toks[4].raw == "class" + @test attr_value(toks[5]) == "main" +end + +@testset "whitespace around =" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TokenKinds.OPEN_TAG, TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, TokenKinds.TAG_CLOSE, + ] + @test attr_value(toks[3]) == "1" +end + +#-----------------------------------------------------------------------# Self-closing tags +@testset "self-closing tag" begin + @test kinds("
") == [TokenKinds.OPEN_TAG, TokenKinds.SELF_CLOSE] + @test raws("
") == [""] +end + +@testset "self-closing tag with attributes" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TokenKinds.OPEN_TAG, TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, TokenKinds.SELF_CLOSE, + ] + @test tag_name(toks[1]) == "img" + @test attr_value(toks[3]) == "a.png" +end + +#-----------------------------------------------------------------------# Close tags +@testset "close tag" begin + toks = collect(tokenize("
")) + @test [t.kind for t in toks] == [TokenKinds.CLOSE_TAG, TokenKinds.TAG_CLOSE] + @test tag_name(toks[1]) == "div" + @test toks[2].raw == ">" +end + +@testset "close tag with whitespace" begin + toks = collect(tokenize("
")) + @test [t.kind for t in toks] == [TokenKinds.CLOSE_TAG, TokenKinds.TAG_CLOSE] + @test tag_name(toks[1]) == "div" +end + +#-----------------------------------------------------------------------# Open + close round-trip +@testset "element with text" begin + xml = "

hello

" + @test kinds(xml) == [ + TokenKinds.OPEN_TAG, TokenKinds.TAG_CLOSE, + TokenKinds.TEXT, + TokenKinds.CLOSE_TAG, TokenKinds.TAG_CLOSE, + ] + toks = collect(tokenize(xml)) + @test tag_name(toks[1]) == "p" + @test toks[3].raw == "hello" + @test tag_name(toks[4]) == "p" +end + +#-----------------------------------------------------------------------# Namespaced tags +@testset "namespaced tag" begin + xml = """""" + toks = collect(tokenize(xml)) + @test tag_name(toks[1]) == "ns:el" + @test toks[2].raw == "xmlns:ns" +end + +#-----------------------------------------------------------------------# Comments +@testset "comment" begin + xml = "" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TokenKinds.COMMENT_OPEN, TokenKinds.COMMENT_CONTENT, TokenKinds.COMMENT_CLOSE] + @test toks[1].raw == "" +end + +@testset "empty comment" begin + toks = collect(tokenize("")) + @test [t.kind for t in toks] == [TokenKinds.COMMENT_OPEN, TokenKinds.COMMENT_CONTENT, TokenKinds.COMMENT_CLOSE] + @test toks[2].raw == "" +end + +@testset "comment with markup-like content" begin + toks = collect(tokenize("")) + @test toks[2].raw == " not a tag " +end + +#-----------------------------------------------------------------------# CDATA +@testset "CDATA" begin + xml = "]]>" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TokenKinds.CDATA_OPEN, TokenKinds.CDATA_CONTENT, TokenKinds.CDATA_CLOSE] + @test toks[1].raw == "" + @test toks[3].raw == "]]>" +end + +@testset "empty CDATA" begin + toks = collect(tokenize("")) + @test [t.kind for t in toks] == [TokenKinds.CDATA_OPEN, TokenKinds.CDATA_CONTENT, TokenKinds.CDATA_CLOSE] + @test toks[2].raw == "" +end + +#-----------------------------------------------------------------------# Processing instructions +@testset "processing instruction" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TokenKinds.PI_OPEN, TokenKinds.PI_CONTENT, TokenKinds.PI_CLOSE] + @test toks[1].raw == "" 
+end + +@testset "PI with no content" begin + toks = collect(tokenize("")) + @test [t.kind for t in toks] == [TokenKinds.PI_OPEN, TokenKinds.PI_CONTENT, TokenKinds.PI_CLOSE] + @test pi_target(toks[1]) == "target" + @test toks[2].raw == "" +end + +#-----------------------------------------------------------------------# XML declaration +@testset "XML declaration" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TokenKinds.XML_DECL_OPEN, + TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, + TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, + TokenKinds.XML_DECL_CLOSE, + ] + @test pi_target(toks[1]) == "xml" + @test toks[1].raw == "" +end + +@testset "XML declaration with single quotes" begin + xml = "" + toks = collect(tokenize(xml)) + @test toks[3].raw == "'1.0'" + @test attr_value(toks[3]) == "1.0" +end + +#-----------------------------------------------------------------------# DOCTYPE +@testset "DOCTYPE simple" begin + xml = """""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TokenKinds.DOCTYPE_OPEN, TokenKinds.DOCTYPE_CONTENT, TokenKinds.DOCTYPE_CLOSE] + @test toks[1].raw == "" +end + +@testset "DOCTYPE with internal subset" begin + xml = """]>""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TokenKinds.DOCTYPE_OPEN, TokenKinds.DOCTYPE_CONTENT, TokenKinds.DOCTYPE_CLOSE] + @test toks[2].raw == " note []" +end + +@testset "DOCTYPE with quoted > in internal subset" begin + xml = """b">]>""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [TokenKinds.DOCTYPE_OPEN, TokenKinds.DOCTYPE_CONTENT, TokenKinds.DOCTYPE_CLOSE] + @test occursin("a>b", toks[2].raw) +end + +#-----------------------------------------------------------------------# Full document +@testset "full document" begin + xml = """ + + + text + + + + +""" + toks = collect(tokenize(xml)) + tok_kinds = [t.kind for t in toks] + + # XML declaration + @test tok_kinds[1] == TokenKinds.XML_DECL_OPEN + # DOCTYPE present + 
@test TokenKinds.DOCTYPE_OPEN in tok_kinds + # All open tags have matching closes + open_names = [tag_name(t) for t in toks if t.kind == TokenKinds.OPEN_TAG] + close_names = [tag_name(t) for t in toks if t.kind == TokenKinds.CLOSE_TAG] + @test open_names == ["root", "child", "empty"] + @test close_names == ["child", "root"] + # CDATA is present + cdata_content = [t.raw for t in toks if t.kind == TokenKinds.CDATA_CONTENT] + @test cdata_content == ["data"] + # Comment is present + comment_content = [t.raw for t in toks if t.kind == TokenKinds.COMMENT_CONTENT] + @test comment_content == [" comment "] + # PI is present + pi_opens = [t for t in toks if t.kind == TokenKinds.PI_OPEN] + @test length(pi_opens) == 1 + @test pi_target(pi_opens[1]) == "pi" +end + +#-----------------------------------------------------------------------# Raw round-trip +@testset "concatenated raw reproduces input" begin + # Round-trip works for inputs where no whitespace/= is consumed between tokens. + # Whitespace around `=` in attributes is consumed and not part of any token. + for xml in [ + """
""", + """""", + """""", + """]>""", + """

text

""", + ] + reconstructed = join(t.raw for t in tokenize(xml)) + @test reconstructed == xml + end +end + +@testset "attribute whitespace is not preserved" begin + # Whitespace around `=` and between attrs is consumed, not emitted as tokens. + xml = """
""" + toks = collect(tokenize(xml)) + @test [t.kind for t in toks] == [ + TokenKinds.OPEN_TAG, TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, + TokenKinds.ATTR_NAME, TokenKinds.ATTR_VALUE, TokenKinds.SELF_CLOSE, + ] +end + +#-----------------------------------------------------------------------# Iterator protocol +@testset "iterator protocol" begin + t = tokenize("") + @test Base.IteratorSize(typeof(t)) == Base.SizeUnknown() + @test Base.eltype(typeof(t)) == Token + toks = collect(t) + @test length(toks) == 2 +end + +#-----------------------------------------------------------------------# Utility error handling +@testset "tag_name errors on wrong kind" begin + tok = first(tokenize("hello")) + @test_throws ArgumentError tag_name(tok) +end + +@testset "attr_value errors on wrong kind" begin + tok = first(tokenize("")) + @test_throws ArgumentError attr_value(tok) +end + +@testset "pi_target errors on wrong kind" begin + tok = first(tokenize("")) + @test_throws ArgumentError pi_target(tok) +end + +#-----------------------------------------------------------------------# Error cases +@testset "error: unterminated comment" begin + @test_throws ArgumentError collect(tokenize("")) + @test toks[2].raw == " héllo " +end + +#-----------------------------------------------------------------------# Edge cases +@testset "adjacent tags" begin + xml = "" + toks = collect(tokenize(xml)) + open_names = [tag_name(t) for t in toks if t.kind == TokenKinds.OPEN_TAG] + close_names = [tag_name(t) for t in toks if t.kind == TokenKinds.CLOSE_TAG] + @test open_names == ["a", "b"] + @test close_names == ["a", "b"] + # No text tokens between them + @test !any(t -> t.kind == TokenKinds.TEXT, toks) +end + +@testset "text between adjacent tags" begin + xml = "xy" + texts = [t.raw for t in tokenize(xml) if t.kind == TokenKinds.TEXT] + @test texts == ["x", "y"] +end + +@testset "multiple attributes" begin + xml = """
""" + names = [String(t.raw) for t in tokenize(xml) if t.kind == TokenKinds.ATTR_NAME] + vals = [String(attr_value(t)) for t in tokenize(xml) if t.kind == TokenKinds.ATTR_VALUE] + @test names == ["a", "b", "c"] + @test vals == ["1", "2", "3"] +end + +@testset "attribute with > in value" begin + xml = """""" + toks = collect(tokenize(xml)) + @test attr_value(toks[3]) == "1>2" + @test toks[end].kind == TokenKinds.TAG_CLOSE +end + +@testset "attribute with single quotes" begin + xml = "" + toks = collect(tokenize(xml)) + @test toks[3].raw == "'val'" + @test attr_value(toks[3]) == "val" +end + +@testset "mixed quote styles" begin + xml = """""" + vals = [attr_value(t) for t in tokenize(xml) if t.kind == TokenKinds.ATTR_VALUE] + @test vals == ["1", "2"] +end + +@testset "whitespace-only text" begin + xml = " \n\t " + texts = [t for t in tokenize(xml) if t.kind == TokenKinds.TEXT] + @test length(texts) == 1 + @test texts[1].raw == " \n\t " +end + +@testset "entities preserved verbatim" begin + xml = "

& < A

" + texts = [t.raw for t in tokenize(xml) if t.kind == TokenKinds.TEXT] + @test texts == ["& < A"] +end + +@testset "show method" begin + tok = first(tokenize("hello")) + buf = IOBuffer() + show(buf, tok) + s = String(take!(buf)) + @test occursin("TEXT", s) + @test occursin("hello", s) +end + +end # top-level testset diff --git a/test/test_w3c.jl b/test/test_w3c.jl new file mode 100644 index 0000000..16587ed --- /dev/null +++ b/test/test_w3c.jl @@ -0,0 +1,154 @@ +# W3C XML Conformance Test Suite +# https://www.w3.org/XML/Test/xmlts20130923.tar +# +# Test types: +# - "valid": well-formed XML that is also valid (should parse successfully) +# - "invalid": well-formed but not valid per DTD (should still parse — we're non-validating) +# - "not-wf": not well-formed XML (should fail to parse) +# - "error": optional errors (parser may or may not reject) +# +# We only run tests with ENTITIES="none" since XML.jl does not expand external entities. +# We skip XML 1.1 tests (VERSION="1.1" or RECOMMENDATION="XML1.1"). + +using XML +using XML: Node, nodetype, Document +using Test +using Downloads: download +using Tar + +const W3C_URL = "https://www.w3.org/XML/Test/xmlts20130923.tar" +const W3C_DIR = joinpath(@__DIR__, "data", "w3c") +const W3C_TAR = joinpath(@__DIR__, "data", "xmlts20130923.tar") + +function ensure_w3c_suite() + isdir(joinpath(W3C_DIR, "xmlconf")) && return + mkpath(W3C_DIR) + if !isfile(W3C_TAR) + @info "Downloading W3C XML Conformance Test Suite..." + download(W3C_URL, W3C_TAR) + end + @info "Extracting W3C XML Conformance Test Suite..." 
+ open(W3C_TAR) do io + Tar.extract(io, W3C_DIR) + end +end + +# Parse a test catalog XML and extract TEST entries +function parse_catalog(catalog_path::String) + isfile(catalog_path) || return NamedTuple[] + doc = read(catalog_path, Node) + tests = NamedTuple[] + _collect_tests!(tests, doc, dirname(catalog_path)) + return tests +end + +function _collect_tests!(tests, node, base_dir) + for child in XML.children(node) + nodetype(child) !== XML.Element && continue + if XML.tag(child) == "TEST" + attrs = XML.attributes(child) + haskey(attrs, "URI") || continue + push!(tests, ( + type = get(attrs, "TYPE", ""), + entities = get(attrs, "ENTITIES", ""), + id = get(attrs, "ID", ""), + uri = joinpath(base_dir, attrs["URI"]), + version = get(attrs, "VERSION", "1.0"), + recommendation = get(attrs, "RECOMMENDATION", ""), + )) + elseif XML.tag(child) == "TESTCASES" + # TESTCASES may have xml:base to adjust paths + sub_base = get(XML.attributes(child), "xml:base", "") + child_base = isempty(sub_base) ? 
base_dir : joinpath(base_dir, sub_base) + _collect_tests!(tests, child, child_base) + else + _collect_tests!(tests, child, base_dir) + end + end +end + +function is_xml11(test) + test.version == "1.1" || + test.recommendation == "XML1.1" || + contains(test.recommendation, "XML1.1") +end + +ensure_w3c_suite() + +# Catalogs for XML 1.0 tests +const XMLCONF_DIR = joinpath(W3C_DIR, "xmlconf") +const CATALOGS = filter(isfile, [ + joinpath(XMLCONF_DIR, "xmltest", "xmltest.xml"), + joinpath(XMLCONF_DIR, "sun", "sun-valid.xml"), + joinpath(XMLCONF_DIR, "sun", "sun-invalid.xml"), + joinpath(XMLCONF_DIR, "sun", "sun-not-wf.xml"), + joinpath(XMLCONF_DIR, "sun", "sun-error.xml"), + joinpath(XMLCONF_DIR, "oasis", "oasis.xml"), + joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_not-wf.xml"), + joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_valid.xml"), + joinpath(XMLCONF_DIR, "ibm", "ibm_oasis_invalid.xml"), + joinpath(XMLCONF_DIR, "eduni", "errata-2e", "errata2e.xml"), + joinpath(XMLCONF_DIR, "eduni", "errata-3e", "errata3e.xml"), + joinpath(XMLCONF_DIR, "eduni", "errata-4e", "errata4e.xml"), + joinpath(XMLCONF_DIR, "eduni", "namespaces", "1.0", "rmt-ns10.xml"), + joinpath(XMLCONF_DIR, "eduni", "misc", "ht-bh.xml"), + joinpath(XMLCONF_DIR, "japanese", "japanese.xml"), +]) + +# Collect all tests +all_tests = NamedTuple[] +for catalog in CATALOGS + append!(all_tests, parse_catalog(catalog)) +end + +# Filter: only ENTITIES="none", skip XML 1.1 +xml10_tests = filter(t -> t.entities == "none" && !is_xml11(t), all_tests) + +valid_tests = filter(t -> t.type in ("valid", "invalid"), xml10_tests) +notwf_tests = filter(t -> t.type == "not-wf", xml10_tests) + +@info "W3C tests: $(length(valid_tests)) valid/invalid, $(length(notwf_tests)) not-wf (from $(length(all_tests)) total)" + +@testset "W3C Conformance" begin + @testset "Well-formed documents should parse" begin + n_pass = 0 + n_fail = 0 + failures = String[] + for test in valid_tests + isfile(test.uri) || continue + try + doc = read(test.uri, Node) 
+ @test nodetype(doc) == Document + n_pass += 1 + catch e + n_fail += 1 + push!(failures, "$(test.id): $e") + end + end + if n_fail > 0 + @warn "W3C well-formed: $n_pass passed, $n_fail failed" failures=first(failures, 20) + end + @info "W3C well-formed: $n_pass / $(n_pass + n_fail) passed" + end + + @testset "Not-well-formed documents should fail to parse" begin + n_pass = 0 + n_fail = 0 + failures = String[] + for test in notwf_tests + isfile(test.uri) || continue + try + read(test.uri, Node) + n_fail += 1 + push!(failures, test.id) + catch + @test true + n_pass += 1 + end + end + if n_fail > 0 + @warn "W3C not-well-formed: $n_pass rejected, $n_fail incorrectly accepted" failures=first(failures, 20) + end + @info "W3C not-well-formed: $n_pass / $(n_pass + n_fail) correctly rejected" + end +end