From db4927cb68c6c2fc3b19e06afac4630b74867c04 Mon Sep 17 00:00:00 2001 From: Rafael Herrero Date: Sun, 22 Mar 2026 23:55:22 +0100 Subject: [PATCH] Support duplicate column names in SQL projections (fix #6543) Add a deduplication pass in the SQL planner that auto-suffixes duplicate expression names with :{count} before projection, so queries like SELECT x, x or TPC-DS Q39 no longer error. The fix is scoped to the SQL path only. The Rust API (LogicalPlanBuilder::project) still rejects duplicates via validate_unique_names, keeping optimizer invariants intact. --- datafusion/sql/src/select.rs | 9 +- datafusion/sql/src/utils.rs | 37 ++++ datafusion/sql/tests/sql_integration.rs | 54 ++++-- .../sqllogictest/test_files/aggregate.slt | 19 +- .../test_files/duplicate_column_alias.slt | 175 ++++++++++++++++++ datafusion/sqllogictest/test_files/unnest.slt | 10 +- 6 files changed, 269 insertions(+), 35 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/duplicate_column_alias.slt diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 7e291afa04b6e..5ec5970e59c07 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -23,8 +23,9 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::query::to_order_by_exprs_with_select; use crate::utils::{ CheckColumnsMustReferenceAggregatePurpose, CheckColumnsSatisfyExprsPurpose, - check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs, - resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, + check_columns_satisfy_exprs, deduplicate_select_expr_names, extract_aliases, + rebase_expr, resolve_aliases_to_exprs, resolve_columns, resolve_positions_to_exprs, + rewrite_recursive_unnests_bottom_up, }; use datafusion_common::error::DataFusionErrorBuilder; @@ -109,6 +110,10 @@ impl SqlToRel<'_, S> { planner_context, )?; + // Auto-suffix duplicate expression names (e.g. cov, cov → cov, cov:1) + // before projection so that the unique-name constraint is satisfied. + let select_exprs = deduplicate_select_expr_names(select_exprs); + // Having and group by clause may reference aliases defined in select projection let projected_plan = self.project(base_plan.clone(), select_exprs)?; let select_exprs = projected_plan.expressions(); diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 1a76dd69f46c5..f3407a3868dac 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -34,6 +34,7 @@ use datafusion_expr::expr::{ Alias, GroupingSet, Unnest, WindowFunction, WindowFunctionParams, }; use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs}; +use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{ ColumnUnnestList, Expr, ExprSchemable, LogicalPlan, col, expr_vec_fmt, }; @@ -633,6 +634,42 @@ fn push_projection_dedupl(projection: &mut Vec, expr: Expr) { projection.push(expr); } } + +/// Auto-suffix duplicate SELECT expression names with `:{count}`. +/// +/// The first occurrence keeps its original name so that ORDER BY / HAVING +/// references resolve correctly. Wildcards are left untouched because they +/// are expanded later in `project_with_validation`. +/// +/// Duplicates are detected by the schema name of each expression, which +/// identifies logically identical expressions before column normalization. +pub(crate) fn deduplicate_select_expr_names( + exprs: Vec, +) -> Vec { + let mut seen: HashMap = HashMap::new(); + exprs + .into_iter() + .map(|select_expr| match select_expr { + SelectExpr::Expression(expr) => { + let name = expr.schema_name().to_string(); + let count = seen.entry(name.clone()).or_insert(0); + let result = if *count > 0 { + let (_qualifier, field_name) = expr.qualified_name(); + SelectExpr::Expression( + expr.alias(format!("{field_name}:{count}")), + ) + } else { + SelectExpr::Expression(expr) + }; + *count += 1; + result + } + // Leave wildcards alone — they are expanded later + other => other, + }) + .collect() +} + /// The context is we want to rewrite unnest() into InnerProjection->Unnest->OuterProjection /// Given an expression which contains unnest expr as one of its children, /// Try transform depends on unnest type diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 29c17be69ce5f..22030d17414fe 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -790,11 +790,13 @@ fn select_column_does_not_exist() { #[test] fn select_repeated_column() { let sql = "SELECT age, age FROM person"; - let err = logical_plan(sql).expect_err("query should have failed"); - + let plan = logical_plan(sql).unwrap(); assert_snapshot!( - err.strip_backtrace(), - @r#"Error during planning: Projections require unique expression names but the expression "person.age" at position 0 and "person.age" at position 1 have the same name. Consider aliasing ("AS") one of them."# + plan, + @r" + Projection: person.age, person.age AS age:1 + TableScan: person + " ); } @@ -1532,11 +1534,14 @@ fn select_simple_aggregate_column_does_not_exist() { #[test] fn select_simple_aggregate_repeated_aggregate() { let sql = "SELECT MIN(age), MIN(age) FROM person"; - let err = logical_plan(sql).expect_err("query should have failed"); - + let plan = logical_plan(sql).unwrap(); assert_snapshot!( - err.strip_backtrace(), - @r#"Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 0 and "min(person.age)" at position 1 have the same name. Consider aliasing ("AS") one of them."# + plan, + @r" + Projection: min(person.age), min(person.age) AS min(person.age):1 + Aggregate: groupBy=[[]], aggr=[[min(person.age)]] + TableScan: person + " ); } @@ -1585,11 +1590,14 @@ fn select_from_typed_string_values() { #[test] fn select_simple_aggregate_repeated_aggregate_with_repeated_aliases() { let sql = "SELECT MIN(age) AS a, MIN(age) AS a FROM person"; - let err = logical_plan(sql).expect_err("query should have failed"); - + let plan = logical_plan(sql).unwrap(); assert_snapshot!( - err.strip_backtrace(), - @r#"Error during planning: Projections require unique expression names but the expression "min(person.age) AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them."# + plan, + @r" + Projection: min(person.age) AS a, min(person.age) AS a AS a:1 + Aggregate: groupBy=[[]], aggr=[[min(person.age)]] + TableScan: person + " ); } @@ -1626,11 +1634,14 @@ fn select_simple_aggregate_with_groupby_with_aliases() { #[test] fn select_simple_aggregate_with_groupby_with_aliases_repeated() { let sql = "SELECT state AS a, MIN(age) AS a FROM person GROUP BY state"; - let err = logical_plan(sql).expect_err("query should have failed"); - + let plan = logical_plan(sql).unwrap(); assert_snapshot!( - err.strip_backtrace(), - @r#"Error during planning: Projections require unique expression names but the expression "person.state AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them."# + plan, + @r" + Projection: person.state AS a, min(person.age) AS a AS a:1 + Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]] + TableScan: person + " ); } @@ -1751,11 +1762,14 @@ fn select_simple_aggregate_with_groupby_can_use_alias() { #[test] fn select_simple_aggregate_with_groupby_aggregate_repeated() { let sql = "SELECT state, MIN(age), MIN(age) FROM person GROUP BY state"; - let err = logical_plan(sql).expect_err("query should have failed"); - + let plan = logical_plan(sql).unwrap(); assert_snapshot!( - err.strip_backtrace(), - @r#"Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 1 and "min(person.age)" at position 2 have the same name. Consider aliasing ("AS") one of them."# + plan, + @r" + Projection: person.state, min(person.age), min(person.age) AS min(person.age):1 + Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]] + TableScan: person + " ); } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index cf894a494ad90..b8b6fa0ab4956 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -7941,21 +7941,16 @@ select count(), count() * count() from t; ---- 2 4 -# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\)\)" at position 0 and "count\(Int64\(1\)\)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\. -query error +# Duplicate aggregate expressions are now auto-suffixed +query II select count(1), count(1) from t; +---- +2 2 -# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\)\)" at position 0 and "count\(Int64\(1\)\)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\. -query error -explain select count(1), count(1) from t; - -# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\) AS \)" at position 0 and "count\(Int64\(1\) AS \)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\. -query error +query II select count(), count() from t; - -# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\) AS \)" at position 0 and "count\(Int64\(1\) AS \)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\. -query error -explain select count(), count() from t; +---- +2 2 query II select count(1), count(2) from t; diff --git a/datafusion/sqllogictest/test_files/duplicate_column_alias.slt b/datafusion/sqllogictest/test_files/duplicate_column_alias.slt new file mode 100644 index 0000000000000..e7e2d14976fcc --- /dev/null +++ b/datafusion/sqllogictest/test_files/duplicate_column_alias.slt @@ -0,0 +1,175 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for duplicate column names/aliases in projections. +# DataFusion auto-suffixes duplicates with :{count} (e.g. cov, cov:1). + +# Setup +statement ok +CREATE TABLE t(x INT, y INT) AS VALUES (1, 2), (3, 4); + +# +# Basic duplicate alias +# +query II +SELECT x AS c1, y AS c1 FROM t; +---- +1 2 +3 4 + +# +# Duplicate literal expressions +# +query II +SELECT 1, 1; +---- +1 1 + +# +# Same column selected twice +# +query II +SELECT x, x FROM t; +---- +1 1 +3 3 + +# +# Subquery with duplicate column names +# +query II +SELECT * FROM (SELECT x AS c1, y AS c1 FROM t); +---- +1 2 +3 4 + +# +# ORDER BY referencing a duplicated alias resolves to first occurrence +# +query II +SELECT x AS c1, y AS c1 FROM t ORDER BY c1; +---- +1 2 +3 4 + +# +# CTE join producing duplicate column names (TPC-DS Q39 pattern) +# +statement ok +CREATE TABLE inv(warehouse_sk INT, item_sk INT, moy INT, cov DOUBLE) AS VALUES + (1, 10, 1, 1.5), + (1, 10, 2, 2.0), + (2, 20, 1, 0.8), + (2, 20, 2, 1.2); + +query IIIRIIIR +WITH inv1 AS ( + SELECT warehouse_sk, item_sk, moy, cov FROM inv WHERE moy = 1 +), +inv2 AS ( + SELECT warehouse_sk, item_sk, moy, cov FROM inv WHERE moy = 2 +) +SELECT inv1.warehouse_sk, inv1.item_sk, inv1.moy, inv1.cov, + inv2.warehouse_sk, inv2.item_sk, inv2.moy, inv2.cov +FROM inv1 JOIN inv2 + ON inv1.item_sk = inv2.item_sk AND inv1.warehouse_sk = inv2.warehouse_sk +ORDER BY inv1.warehouse_sk, inv1.item_sk; +---- +1 10 1 1.5 1 10 2 2 +2 20 1 0.8 2 20 2 1.2 + +# +# Three-way duplicate +# +query III +SELECT x AS a, y AS a, x + y AS a FROM t; +---- +1 2 3 +3 4 7 + +# +# CAST produces same schema_name as original column (TPC-DS Q39 pattern). +# CAST is transparent to schema_name, so CAST(x AS DOUBLE) and x +# both have schema_name "x" — this must be deduped. +# +query RI +SELECT CAST(x AS DOUBLE), x FROM t; +---- +1 1 +3 3 + +# +# GROUP BY with duplicate expressions in SELECT +# +query II +SELECT x, x FROM t GROUP BY x; +---- +1 1 +3 3 + +# +# Aggregate with GROUP BY producing duplicate column names +# +query III +SELECT x, SUM(y) AS total, SUM(y) AS total FROM t GROUP BY x ORDER BY x; +---- +1 2 2 +3 4 4 + +# +# ORDER BY referencing the second (renamed) column by position +# +query II +SELECT y AS c1, x AS c1 FROM t ORDER BY 2; +---- +2 1 +4 3 + +# +# Function calls that produce the same schema_name after argument +# normalization (reported in issue #6543 for iszero). +# +query BB +SELECT iszero(0.0), iszero(-0.0); +---- +true true + +# +# Duplicate expressions inside a UNION subquery +# +query II +SELECT * FROM (SELECT x AS a, y AS a FROM t UNION ALL SELECT y AS a, x AS a FROM t) ORDER BY 1, 2; +---- +1 2 +2 1 +3 4 +4 3 + +# +# Known limitation: wildcard expansion happens after dedup, so +# SELECT *, col FROM t still errors when col overlaps with *. +# This will be addressed in a follow-up PR. +# +query error DataFusion error: Error during planning: Projections require unique expression names but the expression "t\.x" at position 0 and "t\.x" at position 2 have the same name\. Consider aliasing \("AS"\) one of them\. +SELECT *, x FROM t; + +# Cleanup +statement ok +DROP TABLE t; + +statement ok +DROP TABLE inv; diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index ba499679a9a80..6a3aee97ba6c0 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -547,8 +547,16 @@ select unnest(column1) from (select * from (values([1,2,3]), ([4,5,6])) limit 1 5 6 -query error DataFusion error: Error during planning: Projections require unique expression names but the expression "UNNEST\(unnest_table.column1\)" at position 0 and "UNNEST\(unnest_table.column1\)" at position 1 have the same name. Consider aliasing \("AS"\) one of them. +query II select unnest(column1), unnest(column1) from unnest_table; +---- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +12 12 query II select unnest(column1), unnest(column1) u1 from unnest_table;