From a1ae8c2b7d4e573d641f4ad3d81f7731b133772b Mon Sep 17 00:00:00 2001 From: Haresh Khanna Date: Wed, 25 Mar 2026 14:41:40 +0000 Subject: [PATCH] Substrait join consumer should not merge nullability of join keys (#21121) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #21124 ## Rationale for this change When a Substrait join expression contains both equal and is_not_distinct_from predicates (e.g. Spark pushes a null-safe filter into a join that already has a regular equality key), the `split_eq_and_noneq_join_predicate_with_nulls_equality` function uses a single `nulls_equal_nulls` boolean that gets overwritten per-predicate. Whichever operator is processed last determines the `NullEquality` for all keys, silently dropping NULL-matching rows. Since NullEquality is a join-level setting (not per-key) across all physical join implementations (HashJoinExec, SortMergeJoinExec, SymmetricHashJoinExec), the correct fix is to match DataFusion's own SQL planner behavior: demote IS NOT DISTINCT FROM keys to the join filter when mixed with Eq keys. This is already correctly handled for SQL as shown in [join_is_not_distinct_from.slt:L188](https://sourcegraph.com/r/github.com/apache/datafusion@2b7d4f9a5b005905b23128274ad37c3306ffcd15/-/blob/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt?L188) ``` # Test mixed equal and IS NOT DISTINCT FROM conditions # The `IS NOT DISTINCT FROM` expr should NOT in HashJoin's `on` predicate query TT EXPLAIN SELECT t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val FROM t1 JOIN t2 ON t1.id = t2.id AND t1.val IS NOT DISTINCT FROM t2.val ---- logical_plan 01)Projection: t1.id AS t1_id, t2.id AS t2_id, t1.val, t2.val 02)--Inner Join: t1.id = t2.id Filter: t1.val IS NOT DISTINCT FROM t2.val 03)----TableScan: t1 projection=[id, val] 04)----TableScan: t2 projection=[id, val] ``` ## What changes are included in this PR? `datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs`: - Collect eq_keys and indistinct_keys separately instead of using a single vec with an overwritable boolean - When both are present (mixed case), use eq_keys as equijoin keys with NullEqualsNothing and reconstruct the IsNotDistinctFrom expressions into the join filter - Return NullEquality directly instead of converting from bool ## Are these changes tested? Yes, three levels of coverage: 1. Unit tests (join_rel.rs) — directly assert the output of split_eq_and_noneq_join_predicate_with_nulls_equality for eq-only, indistinct-only, mixed, and non-column-operand cases 2. Integration test (consumer_integration.rs) — loads a JSON-encoded Substrait plan with a JoinRel containing both operators through from_substrait_plan, executes it, and asserts 6 rows (including NULL=NULL matches) 3. Existing SLT (join_is_not_distinct_from.slt:179-205) — confirms the SQL planner already exhibits the same demotion behavior that this PR adds to the Substrait consumer ## Are there any user-facing changes? No API changes. Substrait plans with mixed equal/is_not_distinct_from join predicates now correctly preserve null-safe semantics instead of silently dropping NULL-matching rows. --- .../test_files/join_is_not_distinct_from.slt | 30 +++ .../src/logical_plan/consumer/rel/join_rel.rs | 186 ++++++++++++++---- .../tests/cases/consumer_integration.rs | 106 ++++++++++ .../mixed_join_equal_and_indistinct.json | 102 ++++++++++ .../mixed_join_equal_and_indistinct_left.json | 102 ++++++++++ 5 files changed, 486 insertions(+), 40 deletions(-) create mode 100644 datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct.json create mode 100644 datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct_left.json diff --git a/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt index 8246f489c446d..2bab89c99eae0 100644 --- a/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt +++ b/datafusion/sqllogictest/test_files/join_is_not_distinct_from.slt @@ -291,6 +291,36 @@ JOIN t4 ON (t3.val1 IS NOT DISTINCT FROM t4.val1) AND (t3.val2 IS NOT DISTINCT F 2 2 NULL NULL 200 200 3 3 30 30 NULL NULL +# Test mixed: 1 Eq key + multiple IS NOT DISTINCT FROM keys. +# The optimizer unconditionally favours Eq keys (see extract_equijoin_predicate.rs, +# "Only convert when there are NO equijoin predicates, to be conservative"). +# All IS NOT DISTINCT FROM predicates should be demoted to filter, even when they outnumber the Eq key. +query TT +EXPLAIN SELECT t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2 +FROM t3 +JOIN t4 ON (t3.id = t4.id) AND (t3.val1 IS NOT DISTINCT FROM t4.val1) AND (t3.val2 IS NOT DISTINCT FROM t4.val2) +---- +logical_plan +01)Projection: t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2 +02)--Inner Join: t3.id = t4.id Filter: t3.val1 IS NOT DISTINCT FROM t4.val1 AND t3.val2 IS NOT DISTINCT FROM t4.val2 +03)----TableScan: t3 projection=[id, val1, val2] +04)----TableScan: t4 projection=[id, val1, val2] +physical_plan +01)ProjectionExec: expr=[id@0 as t3_id, id@3 as t4_id, val1@1 as val1, val1@4 as val1, val2@2 as val2, val2@5 as val2] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], filter=val1@0 IS NOT DISTINCT FROM val1@2 AND val2@1 IS NOT DISTINCT FROM val2@3 +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----DataSourceExec: partitions=1, partition_sizes=[1] + +# Verify correct results: all 3 rows should match (including NULL=NULL via IS NOT DISTINCT FROM in filter) +query IIIIII rowsort +SELECT t3.id AS t3_id, t4.id AS t4_id, t3.val1, t4.val1, t3.val2, t4.val2 +FROM t3 +JOIN t4 ON (t3.id = t4.id) AND (t3.val1 IS NOT DISTINCT FROM t4.val1) AND (t3.val2 IS NOT DISTINCT FROM t4.val2) +---- +1 1 10 10 100 100 +2 2 NULL NULL 200 200 +3 3 30 30 NULL NULL + statement ok drop table t0; diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs index 3604630d6f0bb..7850dbea797fb 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/join_rel.rs @@ -18,7 +18,7 @@ use crate::logical_plan::consumer::SubstraitConsumer; use datafusion::common::{Column, JoinType, NullEquality, not_impl_err, plan_err}; use datafusion::logical_expr::requalify_sides_if_needed; -use datafusion::logical_expr::utils::split_conjunction; +use datafusion::logical_expr::utils::split_conjunction_owned; use datafusion::logical_expr::{ BinaryExpr, Expr, LogicalPlan, LogicalPlanBuilder, Operator, }; @@ -56,15 +56,10 @@ pub async fn from_join_rel( // So we extract each part as follows: // - If an Eq or IsNotDistinctFrom op is encountered, add the left column, right column and is_null_equal_nulls to `join_ons` vector // - Otherwise we add the expression to join_filter (use conjunction if filter already exists) - let (join_ons, nulls_equal_nulls, join_filter) = - split_eq_and_noneq_join_predicate_with_nulls_equality(&on); + let (join_ons, null_equality, join_filter) = + split_eq_and_noneq_join_predicate_with_nulls_equality(on); let (left_cols, right_cols): (Vec<_>, Vec<_>) = itertools::multiunzip(join_ons); - let null_equality = if nulls_equal_nulls { - NullEquality::NullEqualsNull - } else { - NullEquality::NullEqualsNothing - }; left.join_detailed( right.build()?, join_type, @@ -89,49 +84,61 @@ pub async fn from_join_rel( } fn split_eq_and_noneq_join_predicate_with_nulls_equality( - filter: &Expr, -) -> (Vec<(Column, Column)>, bool, Option) { - let exprs = split_conjunction(filter); + filter: Expr, +) -> (Vec<(Column, Column)>, NullEquality, Option) { + let exprs = split_conjunction_owned(filter); - let mut accum_join_keys: Vec<(Column, Column)> = vec![]; + let mut eq_keys: Vec<(Column, Column)> = vec![]; + let mut indistinct_keys: Vec<(Column, Column)> = vec![]; let mut accum_filters: Vec = vec![]; - let mut nulls_equal_nulls = false; for expr in exprs { - #[expect(clippy::collapsible_match)] match expr { - Expr::BinaryExpr(binary_expr) => match binary_expr { - x @ (BinaryExpr { - left, - op: Operator::Eq, - right, + Expr::BinaryExpr(BinaryExpr { + left, + op: op @ (Operator::Eq | Operator::IsNotDistinctFrom), + right, + }) => match (*left, *right) { + (Expr::Column(l), Expr::Column(r)) => match op { + Operator::Eq => eq_keys.push((l, r)), + Operator::IsNotDistinctFrom => indistinct_keys.push((l, r)), + _ => unreachable!(), + }, + (left, right) => { + accum_filters.push(Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op, + right: Box::new(right), + })); } - | BinaryExpr { - left, - op: Operator::IsNotDistinctFrom, - right, - }) => { - nulls_equal_nulls = match x.op { - Operator::Eq => false, - Operator::IsNotDistinctFrom => true, - _ => unreachable!(), - }; - - match (left.as_ref(), right.as_ref()) { - (Expr::Column(l), Expr::Column(r)) => { - accum_join_keys.push((l.clone(), r.clone())); - } - _ => accum_filters.push(expr.clone()), - } - } - _ => accum_filters.push(expr.clone()), }, - _ => accum_filters.push(expr.clone()), + _ => accum_filters.push(expr), } } + let (join_keys, null_equality) = + match (eq_keys.is_empty(), indistinct_keys.is_empty()) { + // Mixed: use eq_keys as equijoin keys, demote indistinct keys to filter + (false, false) => { + for (l, r) in indistinct_keys { + accum_filters.push(Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Column(l)), + op: Operator::IsNotDistinctFrom, + right: Box::new(Expr::Column(r)), + })); + } + (eq_keys, NullEquality::NullEqualsNothing) + } + // Only eq keys + (false, true) => (eq_keys, NullEquality::NullEqualsNothing), + // Only indistinct keys + (true, false) => (indistinct_keys, NullEquality::NullEqualsNull), + // No keys at all + (true, true) => (vec![], NullEquality::NullEqualsNothing), + }; + let join_filter = accum_filters.into_iter().reduce(Expr::and); - (accum_join_keys, nulls_equal_nulls, join_filter) + (join_keys, null_equality, join_filter) } fn from_substrait_jointype(join_type: i32) -> datafusion::common::Result { @@ -153,3 +160,102 @@ fn from_substrait_jointype(join_type: i32) -> datafusion::common::Result Expr { + Expr::Column(Column::from_name(name)) + } + + fn indistinct(left: Expr, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op: Operator::IsNotDistinctFrom, + right: Box::new(right), + }) + } + + fn fmt_keys(keys: &[(Column, Column)]) -> String { + keys.iter() + .map(|(l, r)| format!("{l} = {r}")) + .collect::>() + .join(", ") + } + + #[test] + fn split_only_eq_keys() { + let expr = col("a").eq(col("b")); + let (keys, null_eq, filter) = + split_eq_and_noneq_join_predicate_with_nulls_equality(expr); + + assert_eq!(fmt_keys(&keys), "a = b"); + assert_eq!(null_eq, NullEquality::NullEqualsNothing); + assert!(filter.is_none()); + } + + #[test] + fn split_only_indistinct_keys() { + let expr = indistinct(col("a"), col("b")); + let (keys, null_eq, filter) = + split_eq_and_noneq_join_predicate_with_nulls_equality(expr); + + assert_eq!(fmt_keys(&keys), "a = b"); + assert_eq!(null_eq, NullEquality::NullEqualsNull); + assert!(filter.is_none()); + } + + /// Regression: mixed `equal` + `is_not_distinct_from` must demote + /// the indistinct key to the join filter so the single NullEquality + /// flag stays consistent (NullEqualsNothing for the eq keys). + #[test] + fn split_mixed_eq_and_indistinct_demotes_indistinct_to_filter() { + let expr = + indistinct(col("val_l"), col("val_r")).and(col("id_l").eq(col("id_r"))); + + let (keys, null_eq, filter) = + split_eq_and_noneq_join_predicate_with_nulls_equality(expr); + + assert_eq!(fmt_keys(&keys), "id_l = id_r"); + assert_eq!(null_eq, NullEquality::NullEqualsNothing); + assert_eq!( + filter.unwrap().to_string(), + "val_l IS NOT DISTINCT FROM val_r" + ); + } + + /// Multiple IS NOT DISTINCT FROM keys with a single Eq key should demote + /// all indistinct keys to the filter. + #[test] + fn split_mixed_multiple_indistinct_demoted() { + let expr = indistinct(col("a_l"), col("a_r")) + .and(indistinct(col("b_l"), col("b_r"))) + .and(col("id_l").eq(col("id_r"))); + + let (keys, null_eq, filter) = + split_eq_and_noneq_join_predicate_with_nulls_equality(expr); + + assert_eq!(fmt_keys(&keys), "id_l = id_r"); + assert_eq!(null_eq, NullEquality::NullEqualsNothing); + assert_eq!( + filter.unwrap().to_string(), + "a_l IS NOT DISTINCT FROM a_r AND b_l IS NOT DISTINCT FROM b_r" + ); + } + + #[test] + fn split_non_column_eq_goes_to_filter() { + let expr = Expr::Literal( + datafusion::common::ScalarValue::Utf8(Some("x".into())), + None, + ) + .eq(col("b")); + + let (keys, _, filter) = + split_eq_and_noneq_join_predicate_with_nulls_equality(expr); + + assert!(keys.is_empty()); + assert_eq!(filter.unwrap().to_string(), "Utf8(\"x\") = b"); + } +} diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 7d871de0dfd8a..5454658cde314 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -25,6 +25,8 @@ #[cfg(test)] mod tests { use crate::utils::test::add_plan_schemas_to_ctx; + use datafusion::arrow::record_batch::RecordBatch; + use datafusion::arrow::util::pretty::pretty_format_batches; use datafusion::common::Result; use datafusion::prelude::SessionContext; use datafusion_substrait::logical_plan::consumer::from_substrait_plan; @@ -33,6 +35,34 @@ mod tests { use std::io::BufReader; use substrait::proto::Plan; + async fn execute_plan(name: &str) -> Result> { + let path = format!("tests/testdata/test_plans/{name}"); + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + let ctx = SessionContext::new(); + let plan = from_substrait_plan(&ctx.state(), &proto).await?; + ctx.execute_logical_plan(plan).await?.collect().await + } + + /// Pretty-print batches as a table with header on top and data rows sorted. + fn pretty_sorted(batches: &[RecordBatch]) -> String { + let pretty = pretty_format_batches(batches).unwrap().to_string(); + let all_lines: Vec<&str> = pretty.trim().lines().collect(); + let header = &all_lines[..3]; + let mut data: Vec<&str> = all_lines[3..all_lines.len() - 1].to_vec(); + data.sort(); + let footer = &all_lines[all_lines.len() - 1..]; + header + .iter() + .copied() + .chain(data) + .chain(footer.iter().copied()) + .collect::>() + .join("\n") + } + async fn tpch_plan_to_string(query_id: i32) -> Result { let path = format!("tests/testdata/tpch_substrait_plans/query_{query_id:02}_plan.json"); @@ -700,4 +730,80 @@ mod tests { Ok(()) } + + /// Substrait join with both `equal` and `is_not_distinct_from` must demote + /// `IS NOT DISTINCT FROM` to the join filter. + #[tokio::test] + async fn test_mixed_join_equal_and_indistinct_inner_join() -> Result<()> { + let plan_str = + test_plan_to_string("mixed_join_equal_and_indistinct.json").await?; + // Eq becomes the equijoin key; IS NOT DISTINCT FROM is demoted to filter. + assert_snapshot!( + plan_str, + @r#" + Projection: left.id, left.val, left.comment, right.id AS id0, right.val AS val0, right.comment AS comment0 + Inner Join: left.id = right.id Filter: left.val IS NOT DISTINCT FROM right.val + SubqueryAlias: left + Values: (Utf8("1"), Utf8("a"), Utf8("c1")), (Utf8("2"), Utf8("b"), Utf8("c2")), (Utf8("3"), Utf8(NULL), Utf8("c3")), (Utf8("4"), Utf8(NULL), Utf8("c4")), (Utf8("5"), Utf8("e"), Utf8("c5"))... + SubqueryAlias: right + Values: (Utf8("1"), Utf8("a"), Utf8("c1")), (Utf8("2"), Utf8("b"), Utf8("c2")), (Utf8("3"), Utf8(NULL), Utf8("c3")), (Utf8("4"), Utf8(NULL), Utf8("c4")), (Utf8("5"), Utf8("e"), Utf8("c5"))... + "# + ); + + // Execute and verify actual rows, including NULL=NULL matches (ids 3,4). + let results = execute_plan("mixed_join_equal_and_indistinct.json").await?; + assert_snapshot!(pretty_sorted(&results), + @r" + +----+-----+---------+-----+------+----------+ + | id | val | comment | id0 | val0 | comment0 | + +----+-----+---------+-----+------+----------+ + | 1 | a | c1 | 1 | a | c1 | + | 2 | b | c2 | 2 | b | c2 | + | 3 | | c3 | 3 | | c3 | + | 4 | | c4 | 4 | | c4 | + | 5 | e | c5 | 5 | e | c5 | + | 6 | f | c6 | 6 | f | c6 | + +----+-----+---------+-----+------+----------+ + " + ); + + Ok(()) + } + + /// Substrait join with both `equal` and `is_not_distinct_from` must demote + /// `IS NOT DISTINCT FROM` to the join filter. + #[tokio::test] + async fn test_mixed_join_equal_and_indistinct_left_join() -> Result<()> { + let plan_str = + test_plan_to_string("mixed_join_equal_and_indistinct_left.json").await?; + assert_snapshot!( + plan_str, + @r#" + Projection: left.id, left.val, left.comment, right.id AS id0, right.val AS val0, right.comment AS comment0 + Left Join: left.id = right.id Filter: left.val IS NOT DISTINCT FROM right.val + SubqueryAlias: left + Values: (Utf8("1"), Utf8("a"), Utf8("c1")), (Utf8("2"), Utf8("b"), Utf8("c2")), (Utf8("3"), Utf8(NULL), Utf8("c3")), (Utf8("4"), Utf8(NULL), Utf8("c4")), (Utf8("5"), Utf8("e"), Utf8("c5"))... + SubqueryAlias: right + Values: (Utf8("1"), Utf8("a"), Utf8("c1")), (Utf8("2"), Utf8("b"), Utf8("c2")), (Utf8("3"), Utf8(NULL), Utf8("c3")), (Utf8("4"), Utf8(NULL), Utf8("c4")), (Utf8("5"), Utf8("e"), Utf8("c5"))... + "# + ); + + let results = execute_plan("mixed_join_equal_and_indistinct_left.json").await?; + assert_snapshot!(pretty_sorted(&results), + @r" + +----+-----+---------+-----+------+----------+ + | id | val | comment | id0 | val0 | comment0 | + +----+-----+---------+-----+------+----------+ + | 1 | a | c1 | 1 | a | c1 | + | 2 | b | c2 | 2 | b | c2 | + | 3 | | c3 | 3 | | c3 | + | 4 | | c4 | 4 | | c4 | + | 5 | e | c5 | 5 | e | c5 | + | 6 | f | c6 | 6 | f | c6 | + +----+-----+---------+-----+------+----------+ + " + ); + + Ok(()) + } } diff --git a/datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct.json b/datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct.json new file mode 100644 index 0000000000000..642256c562995 --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct.json @@ -0,0 +1,102 @@ +{ + "extensions": [ + { "extensionFunction": { "functionAnchor": 0, "name": "is_not_distinct_from" } }, + { "extensionFunction": { "functionAnchor": 1, "name": "equal" } }, + { "extensionFunction": { "functionAnchor": 2, "name": "and" } } + ], + "relations": [{ + "root": { + "input": { + "join": { + "common": { "direct": {} }, + "left": { + "read": { + "common": { "direct": {} }, + "baseSchema": { + "names": ["id", "val", "comment"], + "struct": { + "types": [ + { "string": { "nullability": "NULLABILITY_REQUIRED" } }, + { "string": { "nullability": "NULLABILITY_NULLABLE" } }, + { "string": { "nullability": "NULLABILITY_REQUIRED" } } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "virtualTable": { + "values": [ + { "fields": [{ "string": "1", "nullable": false }, { "string": "a", "nullable": true }, { "string": "c1", "nullable": false }] }, + { "fields": [{ "string": "2", "nullable": false }, { "string": "b", "nullable": true }, { "string": "c2", "nullable": false }] }, + { "fields": [{ "string": "3", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c3", "nullable": false }] }, + { "fields": [{ "string": "4", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c4", "nullable": false }] }, + { "fields": [{ "string": "5", "nullable": false }, { "string": "e", "nullable": true }, { "string": "c5", "nullable": false }] }, + { "fields": [{ "string": "6", "nullable": false }, { "string": "f", "nullable": true }, { "string": "c6", "nullable": false }] } + ] + } + } + }, + "right": { + "read": { + "common": { "direct": {} }, + "baseSchema": { + "names": ["id", "val", "comment"], + "struct": { + "types": [ + { "string": { "nullability": "NULLABILITY_REQUIRED" } }, + { "string": { "nullability": "NULLABILITY_NULLABLE" } }, + { "string": { "nullability": "NULLABILITY_REQUIRED" } } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "virtualTable": { + "values": [ + { "fields": [{ "string": "1", "nullable": false }, { "string": "a", "nullable": true }, { "string": "c1", "nullable": false }] }, + { "fields": [{ "string": "2", "nullable": false }, { "string": "b", "nullable": true }, { "string": "c2", "nullable": false }] }, + { "fields": [{ "string": "3", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c3", "nullable": false }] }, + { "fields": [{ "string": "4", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c4", "nullable": false }] }, + { "fields": [{ "string": "5", "nullable": false }, { "string": "e", "nullable": true }, { "string": "c5", "nullable": false }] }, + { "fields": [{ "string": "6", "nullable": false }, { "string": "f", "nullable": true }, { "string": "c6", "nullable": false }] } + ] + } + } + }, + "expression": { + "scalarFunction": { + "functionReference": 2, + "outputType": { "bool": { "nullability": "NULLABILITY_NULLABLE" } }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 0, + "outputType": { "bool": { "nullability": "NULLABILITY_NULLABLE" } }, + "arguments": [ + { "value": { "selection": { "directReference": { "structField": { "field": 1 } }, "rootReference": {} } } }, + { "value": { "selection": { "directReference": { "structField": { "field": 4 } }, "rootReference": {} } } } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "outputType": { "bool": { "nullability": "NULLABILITY_NULLABLE" } }, + "arguments": [ + { "value": { "selection": { "directReference": { "structField": { "field": 0 } }, "rootReference": {} } } }, + { "value": { "selection": { "directReference": { "structField": { "field": 3 } }, "rootReference": {} } } } + ] + } + } + } + ] + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "names": ["id", "val", "comment", "id0", "val0", "comment0"] + } + }] +} diff --git a/datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct_left.json b/datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct_left.json new file mode 100644 index 0000000000000..f16672947e1ee --- /dev/null +++ b/datafusion/substrait/tests/testdata/test_plans/mixed_join_equal_and_indistinct_left.json @@ -0,0 +1,102 @@ +{ + "extensions": [ + { "extensionFunction": { "functionAnchor": 0, "name": "is_not_distinct_from" } }, + { "extensionFunction": { "functionAnchor": 1, "name": "equal" } }, + { "extensionFunction": { "functionAnchor": 2, "name": "and" } } + ], + "relations": [{ + "root": { + "input": { + "join": { + "common": { "direct": {} }, + "left": { + "read": { + "common": { "direct": {} }, + "baseSchema": { + "names": ["id", "val", "comment"], + "struct": { + "types": [ + { "string": { "nullability": "NULLABILITY_REQUIRED" } }, + { "string": { "nullability": "NULLABILITY_NULLABLE" } }, + { "string": { "nullability": "NULLABILITY_REQUIRED" } } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "virtualTable": { + "values": [ + { "fields": [{ "string": "1", "nullable": false }, { "string": "a", "nullable": true }, { "string": "c1", "nullable": false }] }, + { "fields": [{ "string": "2", "nullable": false }, { "string": "b", "nullable": true }, { "string": "c2", "nullable": false }] }, + { "fields": [{ "string": "3", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c3", "nullable": false }] }, + { "fields": [{ "string": "4", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c4", "nullable": false }] }, + { "fields": [{ "string": "5", "nullable": false }, { "string": "e", "nullable": true }, { "string": "c5", "nullable": false }] }, + { "fields": [{ "string": "6", "nullable": false }, { "string": "f", "nullable": true }, { "string": "c6", "nullable": false }] } + ] + } + } + }, + "right": { + "read": { + "common": { "direct": {} }, + "baseSchema": { + "names": ["id", "val", "comment"], + "struct": { + "types": [ + { "string": { "nullability": "NULLABILITY_REQUIRED" } }, + { "string": { "nullability": "NULLABILITY_NULLABLE" } }, + { "string": { "nullability": "NULLABILITY_REQUIRED" } } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "virtualTable": { + "values": [ + { "fields": [{ "string": "1", "nullable": false }, { "string": "a", "nullable": true }, { "string": "c1", "nullable": false }] }, + { "fields": [{ "string": "2", "nullable": false }, { "string": "b", "nullable": true }, { "string": "c2", "nullable": false }] }, + { "fields": [{ "string": "3", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c3", "nullable": false }] }, + { "fields": [{ "string": "4", "nullable": false }, { "null": { "string": { "nullability": "NULLABILITY_NULLABLE" } }, "nullable": true }, { "string": "c4", "nullable": false }] }, + { "fields": [{ "string": "5", "nullable": false }, { "string": "e", "nullable": true }, { "string": "c5", "nullable": false }] }, + { "fields": [{ "string": "6", "nullable": false }, { "string": "f", "nullable": true }, { "string": "c6", "nullable": false }] } + ] + } + } + }, + "expression": { + "scalarFunction": { + "functionReference": 2, + "outputType": { "bool": { "nullability": "NULLABILITY_NULLABLE" } }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 0, + "outputType": { "bool": { "nullability": "NULLABILITY_NULLABLE" } }, + "arguments": [ + { "value": { "selection": { "directReference": { "structField": { "field": 1 } }, "rootReference": {} } } }, + { "value": { "selection": { "directReference": { "structField": { "field": 4 } }, "rootReference": {} } } } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "outputType": { "bool": { "nullability": "NULLABILITY_NULLABLE" } }, + "arguments": [ + { "value": { "selection": { "directReference": { "structField": { "field": 0 } }, "rootReference": {} } } }, + { "value": { "selection": { "directReference": { "structField": { "field": 3 } }, "rootReference": {} } } } + ] + } + } + } + ] + } + }, + "type": "JOIN_TYPE_LEFT" + } + }, + "names": ["id", "val", "comment", "id0", "val0", "comment0"] + } + }] +}