From ce8d674cee7279c91d9f8455eb4dfcc63be8d34d Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Wed, 25 Mar 2026 16:07:07 -0400 Subject: [PATCH] Better catch Blaze unsupported required vocabulary errors Signed-off-by: Juan Cruz Viotti --- DEPENDENCIES | 2 +- src/index/generators.h | 19 +++- test/cli/CMakeLists.txt | 4 + ...no-evaluate-unknown-required-vocabulary.sh | 92 ++++++++++++++++ ...n-required-vocabulary-nested-metaschema.sh | 101 ++++++++++++++++++ ...-unknown-required-vocabulary-standalone.sh | 68 ++++++++++++ .../fail-unknown-required-vocabulary.sh | 96 +++++++++++++++++ .../include/sourcemeta/core/jsonschema.h | 39 +++++++ vendor/core/src/core/jsonschema/jsonschema.cc | 61 +++++++++-- 9 files changed, 470 insertions(+), 12 deletions(-) create mode 100755 test/cli/index/common/fail-no-evaluate-unknown-required-vocabulary.sh create mode 100755 test/cli/index/common/fail-unknown-required-vocabulary-nested-metaschema.sh create mode 100755 test/cli/index/common/fail-unknown-required-vocabulary-standalone.sh create mode 100755 test/cli/index/common/fail-unknown-required-vocabulary.sh diff --git a/DEPENDENCIES b/DEPENDENCIES index e04e79a8..6c005065 100644 --- a/DEPENDENCIES +++ b/DEPENDENCIES @@ -1,6 +1,6 @@ vendorpull https://github.com/sourcemeta/vendorpull 1dcbac42809cf87cb5b045106b863e17ad84ba02 uwebsockets https://github.com/uNetworking/uWebSockets v20.76.0 -core https://github.com/sourcemeta/core 56eab6ef118e9731df539d3a507db1c23241f911 +core https://github.com/sourcemeta/core a5b788eeea69b697ca0b8347637ff3cec6fa3cdb blaze https://github.com/sourcemeta/blaze 19fd0062386ed65799543dc32ac2e68688a39aaa jsonbinpack https://github.com/sourcemeta/jsonbinpack c1897f4fa8d0552016bb690b415b67120a0e9979 hydra https://github.com/sourcemeta/hydra eef9a60014ec16f00bbebb5f272522207d48a9f8 diff --git a/src/index/generators.h b/src/index/generators.h index 79d48c64..9f78f109 100644 --- a/src/index/generators.h +++ b/src/index/generators.h @@ -36,7 +36,6 @@ #include // std::ostringstream #include // std::unordered_map #include // std::move, std::pair -#include // std::visit namespace sourcemeta::one { @@ -114,7 +113,7 @@ struct GENERATE_MATERIALISED_SCHEMA { assert(schema.has_value()); const auto dialect_identifier{sourcemeta::core::dialect(schema.value())}; assert(!dialect_identifier.empty()); - const auto metaschema{resolver(dialect_identifier)}; + const auto metaschema{resolver(dialect_identifier, callback)}; assert(metaschema.has_value()); // Validate the schemas against their meta-schemas @@ -128,6 +127,22 @@ struct GENERATE_MATERIALISED_SCHEMA { throw MetaschemaError(output); } + // Most schemas are not metaschemas, so this check is a nice + // heuristic to avoid the cost of resolving the base dialect + // on most of them + if (schema->is_object() && schema->defines("$vocabulary")) { + const auto declared_vocabularies{sourcemeta::core::parse_vocabularies( + schema.value(), + [&callback, &resolver](const auto identifier) { + return resolver(identifier, callback); + }, + dialect_identifier)}; + if (declared_vocabularies.has_value()) { + declared_vocabularies.value().throw_if_any_unknown_required( + "The metaschema requires an unrecognised vocabulary"); + } + } + sourcemeta::core::format( schema.value(), sourcemeta::core::schema_walker, [&callback, &resolver](const auto identifier) { diff --git a/test/cli/CMakeLists.txt b/test/cli/CMakeLists.txt index b731e79f..c59f8859 100644 --- a/test/cli/CMakeLists.txt +++ b/test/cli/CMakeLists.txt @@ -58,6 +58,10 @@ if(ONE_INDEX) sourcemeta_one_test_cli(common index fail-schema-self-metaschema) sourcemeta_one_test_cli(common index fail-self-referencing-metaschema) sourcemeta_one_test_cli(common index fail-unknown-dialect) + sourcemeta_one_test_cli(common index fail-unknown-required-vocabulary) + sourcemeta_one_test_cli(common index fail-unknown-required-vocabulary-standalone) + sourcemeta_one_test_cli(common index fail-unknown-required-vocabulary-nested-metaschema) + sourcemeta_one_test_cli(common index fail-no-evaluate-unknown-required-vocabulary) sourcemeta_one_test_cli(common index fail-unknown-option) sourcemeta_one_test_cli(common index fail-vocabulary-not-object) sourcemeta_one_test_cli(common index draft4-ignore-vocabulary) diff --git a/test/cli/index/common/fail-no-evaluate-unknown-required-vocabulary.sh b/test/cli/index/common/fail-no-evaluate-unknown-required-vocabulary.sh new file mode 100755 index 00000000..97357b80 --- /dev/null +++ b/test/cli/index/common/fail-no-evaluate-unknown-required-vocabulary.sh @@ -0,0 +1,92 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << EOF > "$TMP/one.json" +{ + "url": "http://localhost:8000", + "html": false, + "contents": { + "example": { + "baseUri": "https://example.com", + "path": "./schemas", + "x-sourcemeta-one:evaluate": false + } + } +} +EOF + +mkdir "$TMP/schemas" + +# A custom metaschema that requires an unknown vocabulary +cat << 'EOF' > "$TMP/schemas/custom-meta.json" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/custom-meta", + "$vocabulary": { + "https://json-schema.org/draft/2020-12/vocab/core": true, + "https://json-schema.org/draft/2020-12/vocab/applicator": true, + "https://json-schema.org/draft/2020-12/vocab/validation": true, + "https://example.com/vocab/totally-unknown": true + } +} +EOF + +# A schema that uses the custom metaschema +cat << 'EOF' > "$TMP/schemas/test.json" +{ + "$schema": "https://example.com/custom-meta", + "$id": "https://example.com/test", + "type": "string" +} +EOF + +remove_threads_information() { + expr='s/ \[[^]]*[^a-z-][^]]*\]//g' + if [ "$(uname -s)" = "Darwin" ]; then + sed -i '' "$expr" "$1" + else + sed -i "$expr" "$1" + fi +} + +"$1" --skip-banner "$TMP/one.json" "$TMP/output" --concurrency 1 2> "$TMP/output.txt" && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 +remove_threads_information "$TMP/output.txt" + +cat << EOF > "$TMP/expected1.txt" +Writing output to: $(realpath "$TMP")/output +Using configuration: $(realpath "$TMP")/one.json +Detecting: $(realpath "$TMP")/schemas/custom-meta.json (#1) +Detecting: $(realpath "$TMP")/schemas/test.json (#2) +( 50%) Resolving: custom-meta.json +(100%) Resolving: test.json +( 4%) Producing: configuration.json +( 8%) Producing: version.json +( 12%) Producing: schemas/example/custom-meta/%/schema.metapack +error: The metaschema requires an unrecognised vocabulary + at vocabulary https://example.com/vocab/totally-unknown + at path $(realpath "$TMP")/schemas/custom-meta.json +EOF + +cat << EOF > "$TMP/expected2.txt" +Writing output to: $(realpath "$TMP")/output +Using configuration: $(realpath "$TMP")/one.json +Detecting: $(realpath "$TMP")/schemas/test.json (#1) +Detecting: $(realpath "$TMP")/schemas/custom-meta.json (#2) +( 50%) Resolving: test.json +(100%) Resolving: custom-meta.json +( 4%) Producing: configuration.json +( 8%) Producing: version.json +( 12%) Producing: schemas/example/custom-meta/%/schema.metapack +error: The metaschema requires an unrecognised vocabulary + at vocabulary https://example.com/vocab/totally-unknown + at path $(realpath "$TMP")/schemas/custom-meta.json +EOF + +diff "$TMP/output.txt" "$TMP/expected1.txt" || diff "$TMP/output.txt" "$TMP/expected2.txt" diff --git a/test/cli/index/common/fail-unknown-required-vocabulary-nested-metaschema.sh b/test/cli/index/common/fail-unknown-required-vocabulary-nested-metaschema.sh new file mode 100755 index 00000000..d029af3c --- /dev/null +++ b/test/cli/index/common/fail-unknown-required-vocabulary-nested-metaschema.sh @@ -0,0 +1,101 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << EOF > "$TMP/one.json" +{ + "url": "https://sourcemeta.com/", + "contents": { + "example": { + "contents": { + "schemas": { + "baseUri": "https://example.com/", + "path": "./schemas" + } + } + } + } +} +EOF + +mkdir "$TMP/schemas" + +# A custom metaschema that only uses known vocabularies +cat << 'EOF' > "$TMP/schemas/meta-a.json" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/meta-a", + "$vocabulary": { + "https://json-schema.org/draft/2020-12/vocab/core": true, + "https://json-schema.org/draft/2020-12/vocab/applicator": true, + "https://json-schema.org/draft/2020-12/vocab/validation": true + } +} +EOF + +# A custom metaschema that uses meta-a as its dialect +# and declares an unknown required vocabulary +cat << 'EOF' > "$TMP/schemas/meta-b.json" +{ + "$schema": "https://example.com/meta-a", + "$id": "https://example.com/meta-b", + "$vocabulary": { + "https://json-schema.org/draft/2020-12/vocab/core": true, + "https://example.com/vocab/totally-unknown": true + } +} +EOF + +remove_threads_information() { + expr='s/ \[[^]]*[^a-z-][^]]*\]//g' + if [ "$(uname -s)" = "Darwin" ]; then + sed -i '' "$expr" "$1" + else + sed -i "$expr" "$1" + fi +} + +"$1" --skip-banner "$TMP/one.json" "$TMP/output" --concurrency 1 2> "$TMP/output.txt" && CODE="$?" || CODE="$?" +test "$CODE" = "1" +remove_threads_information "$TMP/output.txt" + +cat << EOF > "$TMP/expected1.txt" +Writing output to: $(realpath "$TMP")/output +Using configuration: $(realpath "$TMP")/one.json +Detecting: $(realpath "$TMP")/schemas/meta-a.json (#1) +Detecting: $(realpath "$TMP")/schemas/meta-b.json (#2) +( 50%) Resolving: meta-a.json +(100%) Resolving: meta-b.json +( 2%) Producing: configuration.json +( 5%) Producing: version.json +( 8%) Producing: explorer/%/404.metapack +( 11%) Producing: schemas/example/schemas/meta-a/%/schema.metapack +( 14%) Producing: schemas/example/schemas/meta-b/%/schema.metapack +error: The metaschema requires an unrecognised vocabulary + at vocabulary https://example.com/vocab/totally-unknown + at path $(realpath "$TMP")/schemas/meta-b.json +EOF + +cat << EOF > "$TMP/expected2.txt" +Writing output to: $(realpath "$TMP")/output +Using configuration: $(realpath "$TMP")/one.json +Detecting: $(realpath "$TMP")/schemas/meta-b.json (#1) +Detecting: $(realpath "$TMP")/schemas/meta-a.json (#2) +( 50%) Resolving: meta-b.json +(100%) Resolving: meta-a.json +( 2%) Producing: configuration.json +( 5%) Producing: version.json +( 8%) Producing: explorer/%/404.metapack +( 11%) Producing: schemas/example/schemas/meta-a/%/schema.metapack +( 14%) Producing: schemas/example/schemas/meta-b/%/schema.metapack +error: The metaschema requires an unrecognised vocabulary + at vocabulary https://example.com/vocab/totally-unknown + at path $(realpath "$TMP")/schemas/meta-b.json +EOF + +diff "$TMP/output.txt" "$TMP/expected1.txt" || diff "$TMP/output.txt" "$TMP/expected2.txt" diff --git a/test/cli/index/common/fail-unknown-required-vocabulary-standalone.sh b/test/cli/index/common/fail-unknown-required-vocabulary-standalone.sh new file mode 100755 index 00000000..f5751e7c --- /dev/null +++ b/test/cli/index/common/fail-unknown-required-vocabulary-standalone.sh @@ -0,0 +1,68 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << EOF > "$TMP/one.json" +{ + "url": "https://sourcemeta.com/", + "contents": { + "example": { + "contents": { + "schemas": { + "baseUri": "https://example.com/", + "path": "./schemas" + } + } + } + } +} +EOF + +mkdir "$TMP/schemas" + +# A custom metaschema that requires an unknown vocabulary, +# with no other schema using it +cat << 'EOF' > "$TMP/schemas/custom-meta.json" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/custom-meta", + "$vocabulary": { + "https://json-schema.org/draft/2020-12/vocab/core": true, + "https://json-schema.org/draft/2020-12/vocab/applicator": true, + "https://json-schema.org/draft/2020-12/vocab/validation": true, + "https://example.com/vocab/totally-unknown": true + } +} +EOF + +"$1" --skip-banner "$TMP/one.json" "$TMP/output" --concurrency 1 2> "$TMP/output.txt" && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 + +# Remove thread information +if [ "$(uname)" = "Darwin" ] +then + sed -i '' 's/ \[.*\]//g' "$TMP/output.txt" +else + sed -i 's/ \[.*\]//g' "$TMP/output.txt" +fi + +cat << EOF > "$TMP/expected.txt" +Writing output to: $(realpath "$TMP")/output +Using configuration: $(realpath "$TMP")/one.json +Detecting: $(realpath "$TMP")/schemas/custom-meta.json (#1) +(100%) Resolving: custom-meta.json +( 4%) Producing: configuration.json +( 8%) Producing: version.json +( 13%) Producing: explorer/%/404.metapack +( 17%) Producing: schemas/example/schemas/custom-meta/%/schema.metapack +error: The metaschema requires an unrecognised vocabulary + at vocabulary https://example.com/vocab/totally-unknown + at path $(realpath "$TMP")/schemas/custom-meta.json +EOF + +diff "$TMP/output.txt" "$TMP/expected.txt" diff --git a/test/cli/index/common/fail-unknown-required-vocabulary.sh b/test/cli/index/common/fail-unknown-required-vocabulary.sh new file mode 100755 index 00000000..22a91636 --- /dev/null +++ b/test/cli/index/common/fail-unknown-required-vocabulary.sh @@ -0,0 +1,96 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << EOF > "$TMP/one.json" +{ + "url": "https://sourcemeta.com/", + "contents": { + "example": { + "contents": { + "schemas": { + "baseUri": "https://example.com/", + "path": "./schemas" + } + } + } + } +} +EOF + +mkdir "$TMP/schemas" + +# A custom metaschema that requires an unknown vocabulary +cat << 'EOF' > "$TMP/schemas/custom-meta.json" +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/custom-meta", + "$vocabulary": { + "https://json-schema.org/draft/2020-12/vocab/core": true, + "https://json-schema.org/draft/2020-12/vocab/applicator": true, + "https://json-schema.org/draft/2020-12/vocab/validation": true, + "https://example.com/vocab/totally-unknown": true + } +} +EOF + +# A schema that uses the custom metaschema +cat << 'EOF' > "$TMP/schemas/test.json" +{ + "$schema": "https://example.com/custom-meta", + "$id": "https://example.com/test", + "type": "string" +} +EOF + +remove_threads_information() { + expr='s/ \[[^]]*[^a-z-][^]]*\]//g' + if [ "$(uname -s)" = "Darwin" ]; then + sed -i '' "$expr" "$1" + else + sed -i "$expr" "$1" + fi +} + +"$1" --skip-banner "$TMP/one.json" "$TMP/output" --concurrency 1 2> "$TMP/output.txt" && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 +remove_threads_information "$TMP/output.txt" + +cat << EOF > "$TMP/expected1.txt" +Writing output to: $(realpath "$TMP")/output +Using configuration: $(realpath "$TMP")/one.json +Detecting: $(realpath "$TMP")/schemas/custom-meta.json (#1) +Detecting: $(realpath "$TMP")/schemas/test.json (#2) +( 50%) Resolving: custom-meta.json +(100%) Resolving: test.json +( 2%) Producing: configuration.json +( 5%) Producing: version.json +( 8%) Producing: explorer/%/404.metapack +( 11%) Producing: schemas/example/schemas/custom-meta/%/schema.metapack +error: The metaschema requires an unrecognised vocabulary + at vocabulary https://example.com/vocab/totally-unknown + at path $(realpath "$TMP")/schemas/custom-meta.json +EOF + +cat << EOF > "$TMP/expected2.txt" +Writing output to: $(realpath "$TMP")/output +Using configuration: $(realpath "$TMP")/one.json +Detecting: $(realpath "$TMP")/schemas/test.json (#1) +Detecting: $(realpath "$TMP")/schemas/custom-meta.json (#2) +( 50%) Resolving: test.json +(100%) Resolving: custom-meta.json +( 2%) Producing: configuration.json +( 5%) Producing: version.json +( 8%) Producing: explorer/%/404.metapack +( 11%) Producing: schemas/example/schemas/custom-meta/%/schema.metapack +error: The metaschema requires an unrecognised vocabulary + at vocabulary https://example.com/vocab/totally-unknown + at path $(realpath "$TMP")/schemas/custom-meta.json +EOF + +diff "$TMP/output.txt" "$TMP/expected1.txt" || diff "$TMP/output.txt" "$TMP/expected2.txt" diff --git a/vendor/core/src/core/jsonschema/include/sourcemeta/core/jsonschema.h b/vendor/core/src/core/jsonschema/include/sourcemeta/core/jsonschema.h index 4e44ce62..1f2fbd84 100644 --- a/vendor/core/src/core/jsonschema/include/sourcemeta/core/jsonschema.h +++ b/vendor/core/src/core/jsonschema/include/sourcemeta/core/jsonschema.h @@ -317,6 +317,45 @@ auto base_dialect(const JSON &schema, const SchemaResolver &resolver, std::string_view default_dialect = "") -> std::optional; +/// @ingroup jsonschema +/// +/// Parse the `$vocabulary` keyword from a given schema, if set. For example: +/// +/// ```cpp +/// #include +/// #include +/// #include +/// +/// const sourcemeta::core::JSON document = +/// sourcemeta::core::parse_json(R"JSON({ +/// "$schema": "https://json-schema.org/draft/2020-12/schema", +/// "$vocabulary": { +/// "https://json-schema.org/draft/2020-12/vocab/core": true, +/// "https://json-schema.org/draft/2020-12/vocab/applicator": true +/// } +/// })JSON"); +/// +/// const auto result{ +/// sourcemeta::core::parse_vocabularies( +/// document, sourcemeta::core::schema_resolver)}; +/// +/// assert(result.has_value()); +/// assert(result->size() == 2); +/// ``` +SOURCEMETA_CORE_JSONSCHEMA_EXPORT +auto parse_vocabularies(const JSON &schema, const SchemaResolver &resolver, + std::string_view default_dialect = "") + -> std::optional; + +/// @ingroup jsonschema +/// +/// A shortcut to sourcemeta::core::parse_vocabularies when the base dialect +/// is already known. +SOURCEMETA_CORE_JSONSCHEMA_EXPORT +auto parse_vocabularies(const JSON &schema, + const SchemaBaseDialect base_dialect) + -> std::optional; + /// @ingroup jsonschema /// /// List the vocabularies that a specific schema makes use of. If you set a diff --git a/vendor/core/src/core/jsonschema/jsonschema.cc b/vendor/core/src/core/jsonschema/jsonschema.cc index f7071696..56dabda6 100644 --- a/vendor/core/src/core/jsonschema/jsonschema.cc +++ b/vendor/core/src/core/jsonschema/jsonschema.cc @@ -440,6 +440,55 @@ auto is_pre_vocabulary_base_dialect( } } // namespace +auto sourcemeta::core::parse_vocabularies( + const sourcemeta::core::JSON &schema, + const sourcemeta::core::SchemaBaseDialect base_dialect) + -> std::optional { + if (base_dialect != + sourcemeta::core::SchemaBaseDialect::JSON_Schema_2020_12 && + base_dialect != + sourcemeta::core::SchemaBaseDialect::JSON_Schema_2020_12_Hyper && + base_dialect != + sourcemeta::core::SchemaBaseDialect::JSON_Schema_2019_09 && + base_dialect != + sourcemeta::core::SchemaBaseDialect::JSON_Schema_2019_09_Hyper) { + return std::nullopt; + } + + if (!schema.is_object()) { + return std::nullopt; + } + + const auto *vocabulary_entry{schema.try_at("$vocabulary")}; + if (!vocabulary_entry) { + return std::nullopt; + } + + assert(vocabulary_entry->is_object()); + sourcemeta::core::Vocabularies result; + for (const auto &entry : vocabulary_entry->as_object()) { + assert(entry.second.is_boolean()); + result.insert(entry.first, entry.second.to_boolean()); + } + + return result; +} + +auto sourcemeta::core::parse_vocabularies( + const sourcemeta::core::JSON &schema, + const sourcemeta::core::SchemaResolver &resolver, + std::string_view default_dialect) + -> std::optional { + const auto schema_base_dialect{ + sourcemeta::core::base_dialect(schema, resolver, default_dialect)}; + if (schema_base_dialect.has_value()) { + return sourcemeta::core::parse_vocabularies(schema, + schema_base_dialect.value()); + } else { + return std::nullopt; + } +} + auto sourcemeta::core::vocabularies( const sourcemeta::core::JSON &schema, const sourcemeta::core::SchemaResolver &resolver, @@ -545,16 +594,10 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver, * dialect */ - Vocabularies result; const auto core{core_vocabulary_known(base_dialect)}; - if (schema_dialect.defines("$vocabulary")) { - const sourcemeta::core::JSON &vocabularies{ - schema_dialect.at("$vocabulary")}; - assert(vocabularies.is_object()); - for (const auto &entry : vocabularies.as_object()) { - result.insert(entry.first, entry.second.to_boolean()); - } - } else { + auto result{parse_vocabularies(schema_dialect, base_dialect) + .value_or(Vocabularies{})}; + if (result.empty()) { result.insert(core, true); }