diff --git a/superset-frontend/packages/superset-core/src/common/index.ts b/superset-frontend/packages/superset-core/src/common/index.ts index cc399cf62751..34d513e4f74e 100644 --- a/superset-frontend/packages/superset-core/src/common/index.ts +++ b/superset-frontend/packages/superset-core/src/common/index.ts @@ -130,6 +130,7 @@ export enum GenericDataType { String = 1, Temporal = 2, Boolean = 3, + MultiValue = 4, } /** diff --git a/superset-frontend/packages/superset-ui-chart-controls/src/components/ColumnTypeLabel/ColumnTypeLabel.tsx b/superset-frontend/packages/superset-ui-chart-controls/src/components/ColumnTypeLabel/ColumnTypeLabel.tsx index 8ea43bdc8a68..0316ca257629 100644 --- a/superset-frontend/packages/superset-ui-chart-controls/src/components/ColumnTypeLabel/ColumnTypeLabel.tsx +++ b/superset-frontend/packages/superset-ui-chart-controls/src/components/ColumnTypeLabel/ColumnTypeLabel.tsx @@ -28,6 +28,7 @@ import { FieldBinaryOutlined, FieldStringOutlined, NumberOutlined, + UnorderedListOutlined, } from '@ant-design/icons'; import { Icons } from '@superset-ui/core/components'; @@ -72,6 +73,10 @@ export function ColumnTypeLabel({ type }: ColumnTypeLabelProps) { typeIcon = ; } else if (type === GenericDataType.Temporal) { typeIcon = ; + } else if (type === GenericDataType.MultiValue) { + typeIcon = ( + + ); } return {typeIcon}; diff --git a/superset-frontend/packages/superset-ui-chart-controls/test/components/ColumnTypeLabel.test.tsx b/superset-frontend/packages/superset-ui-chart-controls/test/components/ColumnTypeLabel.test.tsx index fc65cd26f86f..46fc74b49367 100644 --- a/superset-frontend/packages/superset-ui-chart-controls/test/components/ColumnTypeLabel.test.tsx +++ b/superset-frontend/packages/superset-ui-chart-controls/test/components/ColumnTypeLabel.test.tsx @@ -64,4 +64,21 @@ describe('ColumnOption', () => { renderColumnTypeLabel({ type: GenericDataType.Temporal }); expect(screen.getByLabelText('temporal type icon')).toBeVisible(); }); + test('multi-value (array) type shows list icon', () => { + renderColumnTypeLabel({ type: GenericDataType.MultiValue }); + expect(screen.getByLabelText('multi-value type icon')).toBeVisible(); + }); +}); + +describe('GenericDataType enum parity', () => { + // These numeric values are shared with the backend enum in + // superset/utils/core.py (GenericDataType). They must stay in sync because + // the backend serializes columns using these integers. + test('values match the backend contract', () => { + expect(GenericDataType.Numeric).toBe(0); + expect(GenericDataType.String).toBe(1); + expect(GenericDataType.Temporal).toBe(2); + expect(GenericDataType.Boolean).toBe(3); + expect(GenericDataType.MultiValue).toBe(4); + }); }); diff --git a/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/AdhocFilterEditPopoverSimpleTabContent.test.tsx b/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/AdhocFilterEditPopoverSimpleTabContent.test.tsx index c2b5231a90dc..b6543b7e693e 100644 --- a/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/AdhocFilterEditPopoverSimpleTabContent.test.tsx +++ b/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/AdhocFilterEditPopoverSimpleTabContent.test.tsx @@ -35,6 +35,7 @@ import { } from 'src/explore/constants'; import AdhocMetric from 'src/explore/components/controls/MetricControl/AdhocMetric'; import { FeatureFlag, isFeatureEnabled } from '@superset-ui/core'; +import { GenericDataType } from '@apache-superset/core/common'; import fetchMock from 'fetch-mock'; import { TestDataset, Dataset } from '@superset-ui/chart-controls'; @@ -252,6 +253,59 @@ test('shows boolean only operators when subject is number', () => { ].map(operator => expect(isOperatorRelevant(operator, 'value')).toBe(true)); }); +test('shows CONTAINS and null operators when subject is multi-value', () => { + const props = setup({ + adhocFilter: new AdhocFilter({ + expressionType: ExpressionTypes.Simple, + subject: 'skills', + operatorId: undefined, + operator: undefined, + comparator: undefined, + clause: undefined, + }), + datasource: { + columns: [ + { + id: 3, + column_name: 'skills', + type: 'Array(String)', + type_generic: GenericDataType.MultiValue, + }, + ], + }, + }); + const { isOperatorRelevant } = useSimpleTabFilterProps( + props as unknown as Props, + ); + [Operators.Contains, Operators.IsNull, Operators.IsNotNull].forEach( + operator => expect(isOperatorRelevant(operator, 'skills')).toBe(true), + ); + // scalar operators are hidden for array columns + [Operators.Equals, Operators.GreaterThan, Operators.Like].forEach(operator => + expect(isOperatorRelevant(operator, 'skills')).toBe(false), + ); +}); + +test('hides CONTAINS for non multi-value columns', () => { + const props = setup({ + adhocFilter: new AdhocFilter({ + expressionType: ExpressionTypes.Simple, + subject: 'value', + operatorId: undefined, + operator: undefined, + comparator: undefined, + clause: undefined, + }), + datasource: { + columns: [{ id: 3, column_name: 'value', type: 'STRING' }], + }, + }); + const { isOperatorRelevant } = useSimpleTabFilterProps( + props as unknown as Props, + ); + expect(isOperatorRelevant(Operators.Contains, 'value')).toBe(false); +}); + test('will convert from individual comparator to array if the operator changes to multi', () => { const props = setup(); const { onOperatorChange } = useSimpleTabFilterProps( diff --git a/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/index.tsx b/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/index.tsx index 0c66026f8e1d..845e3cbd0e67 100644 --- a/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/index.tsx +++ b/superset-frontend/src/explore/components/controls/FilterControl/AdhocFilterEditPopoverSimpleTabContent/index.tsx @@ -32,6 +32,7 @@ import { isDefined, SupersetClient, } from '@superset-ui/core'; +import { GenericDataType } from '@apache-superset/core/common'; import { styled, useTheme, css } from '@apache-superset/core/theme'; import { Operators, @@ -118,6 +119,8 @@ export const useSimpleTabFilterProps = (props: Props) => { const isColumnNumber = !!column && (column.type === 'INT' || column.type === 'INTEGER'); const isColumnFunction = !!column && !!column.expression; + const isColumnMultiValue = + !!column && column.type_generic === GenericDataType.MultiValue; if (operator && operator === Operators.LatestPartition) { const { partitionColumn } = props; @@ -127,6 +130,18 @@ export const useSimpleTabFilterProps = (props: Props) => { // hide the TEMPORAL_RANGE operator return false; } + // CONTAINS (array membership) only applies to multi-value columns. + if (operator === Operators.Contains) { + return isColumnMultiValue; + } + if (isColumnMultiValue) { + // multi-value columns support membership and null checks only + return ( + operator === Operators.Contains || + operator === Operators.IsNull || + operator === Operators.IsNotNull + ); + } if (operator === Operators.IsTrue || operator === Operators.IsFalse) { return isColumnBoolean || isColumnNumber || isColumnFunction; } diff --git a/superset-frontend/src/explore/components/controls/FilterControl/utils/translateToSQL.ts b/superset-frontend/src/explore/components/controls/FilterControl/utils/translateToSQL.ts index 171ddbc6f528..c105bcda9fdd 100644 --- a/superset-frontend/src/explore/components/controls/FilterControl/utils/translateToSQL.ts +++ b/superset-frontend/src/explore/components/controls/FilterControl/utils/translateToSQL.ts @@ -44,6 +44,7 @@ export const OPERATORS_TO_SQL = { 'IS NULL': 'IS NULL', 'IS TRUE': 'IS TRUE', 'IS FALSE': 'IS FALSE', + CONTAINS: 'CONTAINS', 'LATEST PARTITION': ({ datasource, }: { diff --git a/superset-frontend/src/explore/constants.ts b/superset-frontend/src/explore/constants.ts index 437c859232a4..f0c41d603756 100644 --- a/superset-frontend/src/explore/constants.ts +++ b/superset-frontend/src/explore/constants.ts @@ -45,6 +45,7 @@ export enum Operators { IsTrue = 'IS_TRUE', IsFalse = 'IS_FALSE', TemporalRange = 'TEMPORAL_RANGE', + Contains = 'CONTAINS', } export interface OperatorType { @@ -89,6 +90,7 @@ export const OPERATOR_ENUM_TO_OPERATOR_TYPE: { display: t('TEMPORAL_RANGE'), operation: 'TEMPORAL_RANGE', }, + [Operators.Contains]: { display: t('Contains'), operation: 'CONTAINS' }, }; export const OPERATORS_OPTIONS = Object.values(Operators) as Operators[]; diff --git a/superset/connectors/sqla/models.py b/superset/connectors/sqla/models.py index 81b5c2fc8d9d..de7bbbaec77b 100644 --- a/superset/connectors/sqla/models.py +++ b/superset/connectors/sqla/models.py @@ -1711,6 +1711,63 @@ def _render_adhoc_expression_for_metadata_lookup( ) ) from ex + def _multivalue_column_to_sqla( + self, + col: AdhocColumn, + template_processor: BaseTemplateProcessor | None = None, + ) -> tuple[ColumnElement, utils.GenericDataType | None]: + """ + Turn a multi-value (array) modifier column into a sqlalchemy column. + + The column references a base array column and an operation (e.g. array + length); the native SQL is produced by the engine spec so the same + payload works across any dialect that supports array columns. + """ + label = utils.get_column_name(col) + base_name = col.get("column") + operation = col.get("columnOperation") + + db_engine_spec = self.db_engine_spec + if not db_engine_spec.supports_multivalue_columns: + raise QueryObjectValidationError( + _("This database does not support multi-value (array) columns.") + ) + + base_column = self.get_column(base_name) + if base_column is None: + raise QueryObjectValidationError( + _("Unknown column used as multi-value source: %(col)s", col=base_name) + ) + base_sqla_col = base_column.get_sqla_col(template_processor=template_processor) + + if operation == utils.MultiValueColumnOperation.LENGTH: + expression = db_engine_spec.array_length(base_sqla_col) + generic_type = utils.GenericDataType.NUMERIC + elif operation == utils.MultiValueColumnOperation.EXPLODE: + # Scalar explode (e.g. ClickHouse arrayJoin) can be projected directly. + # Set-returning UNNEST dialects (Postgres/Trino/BigQuery) need extra + # FROM/JOIN plumbing and are handled in a later phase; for those the + # engine spec leaves array_explode unimplemented and we surface a + # clear error instead of emitting invalid SQL. + try: + expression = db_engine_spec.array_explode(base_sqla_col) + except NotImplementedError as ex: + raise QueryObjectValidationError( + _("This database does not support exploding array columns.") + ) from ex + # The exploded value is a single array element; its type is not + # reliably known from the array type string, so leave it unset. + generic_type = None + else: + raise QueryObjectValidationError( + _( + "Unsupported multi-value column operation: %(op)s", + op=operation, + ) + ) + + return self.make_sqla_column_compatible(expression, label), generic_type + def adhoc_column_to_sqla( # pylint: disable=too-many-locals self, col: AdhocColumn, @@ -1733,6 +1790,11 @@ def adhoc_column_to_sqla( # pylint: disable=too-many-locals Python type (e.g. numeric casts for numeric adhoc expressions). :rtype: tuple[sqlalchemy.sql.ColumnElement, Optional[GenericDataType]] """ + if utils.is_multivalue_operation_column(col): + return self._multivalue_column_to_sqla( + col, template_processor=template_processor + ) + label = utils.get_column_name(col) sql_expression = col["sqlExpression"] time_grain = col.get("timeGrain") diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index 3b3c234c1650..08ff0aa254ee 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -55,7 +55,13 @@ from sqlalchemy.engine.url import URL from sqlalchemy.ext.compiler import compiles from sqlalchemy.sql import literal_column, quoted_name, text -from sqlalchemy.sql.expression import BinaryExpression, ColumnClause, Select, TextClause +from sqlalchemy.sql.expression import ( + BinaryExpression, + ColumnClause, + ColumnElement, + Select, + TextClause, +) from sqlalchemy.types import TypeEngine from superset import db @@ -493,6 +499,11 @@ class BaseEngineSpec: # pylint: disable=too-many-public-methods time_groupby_inline = False limit_method = LimitMethod.FORCE_LIMIT supports_multivalues_insert = False + # Whether this engine supports first-class multi-value (array-typed) columns. + # When True, array columns are classified as ``GenericDataType.MULTI_VALUE`` and + # the ``array_*`` capability methods below must be implemented. Defaults to + # False so engines that have not opted in keep treating arrays as strings. + supports_multivalue_columns = False allows_joins = True allows_subqueries = True allows_alias_in_select = True @@ -2393,6 +2404,51 @@ def update_params_from_encrypted_extra( # pylint: disable=invalid-name logger.error(ex, exc_info=True) raise + @classmethod + def array_contains(cls, col: ColumnElement, value: Any) -> ColumnElement: + """ + Build a boolean expression testing whether array column ``col`` contains + ``value``. Engines that set ``supports_multivalue_columns = True`` must + override this with their native array-membership function. + + :param col: SQLAlchemy column element for the array column + :param value: scalar value to look for inside the array + :return: a SQLAlchemy boolean expression + """ + raise NotImplementedError( + f"{cls.engine} does not support multi-value (array) columns" + ) + + @classmethod + def array_length(cls, col: ColumnElement) -> ColumnElement: + """ + Build a numeric expression returning the number of elements in array + column ``col``. Engines that set ``supports_multivalue_columns = True`` + must override this with their native array-length function. + + :param col: SQLAlchemy column element for the array column + :return: a SQLAlchemy numeric expression + """ + raise NotImplementedError( + f"{cls.engine} does not support multi-value (array) columns" + ) + + @classmethod + def array_explode(cls, col: ColumnElement) -> ColumnElement: + """ + Build an expression that expands array column ``col`` into one scalar + value per element (e.g. ClickHouse ``arrayJoin``). Set-returning dialects + (Postgres/Trino/BigQuery ``UNNEST``) need additional FROM/JOIN plumbing + that is handled by the query builder; this method only returns the + element-producing expression. + + :param col: SQLAlchemy column element for the array column + :return: a SQLAlchemy expression yielding individual array elements + """ + raise NotImplementedError( + f"{cls.engine} does not support multi-value (array) columns" + ) + @classmethod def get_column_spec( # pylint: disable=unused-argument cls, diff --git a/superset/db_engine_specs/clickhouse.py b/superset/db_engine_specs/clickhouse.py index a998dce75e5c..66b5ac7eebc4 100644 --- a/superset/db_engine_specs/clickhouse.py +++ b/superset/db_engine_specs/clickhouse.py @@ -26,8 +26,9 @@ from flask_babel import gettext as __ from marshmallow import fields, Schema from marshmallow.validate import Range -from sqlalchemy import types +from sqlalchemy import func, types from sqlalchemy.engine.url import URL +from sqlalchemy.sql.expression import ColumnElement from urllib3.exceptions import NewConnectionError from superset.databases.utils import make_url_safe @@ -55,6 +56,7 @@ class ClickHouseBaseEngineSpec(BaseEngineSpec): time_groupby_inline = True supports_multivalues_insert = True + supports_multivalue_columns = True _time_grain_expressions = { None: "{col}", @@ -80,7 +82,7 @@ class ClickHouseBaseEngineSpec(BaseEngineSpec): ( re.compile(r".*Array.*", re.IGNORECASE), types.String(), - GenericDataType.STRING, + GenericDataType.MULTI_VALUE, ), ( re.compile(r".*UUID.*", re.IGNORECASE), @@ -119,6 +121,22 @@ class ClickHouseBaseEngineSpec(BaseEngineSpec): ), ) + @classmethod + def array_contains(cls, col: ColumnElement, value: Any) -> ColumnElement: + # ClickHouse: has(arr, value) -> 1 if the array contains value + return func.has(col, value) + + @classmethod + def array_length(cls, col: ColumnElement) -> ColumnElement: + # ClickHouse: length(arr) -> number of elements + return func.length(col) + + @classmethod + def array_explode(cls, col: ColumnElement) -> ColumnElement: + # ClickHouse: arrayJoin(arr) is a scalar function usable directly in + # SELECT/GROUP BY (no JOIN needed, unlike UNNEST dialects). + return func.arrayJoin(col) + @classmethod def epoch_to_dttm(cls) -> str: return "{col}" diff --git a/superset/models/helpers.py b/superset/models/helpers.py index b603f0bb3cea..09280de3d5b1 100644 --- a/superset/models/helpers.py +++ b/superset/models/helpers.py @@ -3278,6 +3278,13 @@ def get_sqla_query( # pylint: disable=too-many-arguments,too-many-locals,too-ma select_exprs.append(outer) elif columns: for selected in columns: + if utils.is_multivalue_operation_column(selected): + outer, _unused = self.adhoc_column_to_sqla( + col=selected, + template_processor=template_processor, + ) + select_exprs.append(outer) + continue if is_adhoc_column(selected): _sql = selected["sqlExpression"] _column_label = selected["label"] @@ -3615,6 +3622,17 @@ def get_sqla_query( # pylint: disable=too-many-arguments,too-many-locals,too-ma target_clause_list.append(sqla_col.not_like(eq)) else: target_clause_list.append(sqla_col.not_ilike(eq)) + elif op == utils.FilterOperator.CONTAINS: + if not db_engine_spec.supports_multivalue_columns: + raise QueryObjectValidationError( + _( + "The CONTAINS operator is only supported for " + "multi-value (array) columns on this database." + ) + ) + target_clause_list.append( + db_engine_spec.array_contains(sqla_col, eq) + ) elif ( op == utils.FilterOperator.TEMPORAL_RANGE and isinstance(eq, str) diff --git a/superset/superset_typing.py b/superset/superset_typing.py index 39ff9ab13478..31294a5af3d7 100644 --- a/superset/superset_typing.py +++ b/superset/superset_typing.py @@ -104,6 +104,10 @@ class AdhocColumn(TypedDict, total=False): isColumnReference: bool | None columnType: Literal["BASE_AXIS", "SERIES"] | None timeGrain: str | None + # Multi-value (array) column modifiers: ``column`` names the base array + # column and ``columnOperation`` the operation to apply (e.g. "LENGTH"). + column: str + columnOperation: str class SQLAColumnType(TypedDict): diff --git a/superset/utils/core.py b/superset/utils/core.py index f49a874f2e70..08253956a5b2 100644 --- a/superset/utils/core.py +++ b/superset/utils/core.py @@ -187,7 +187,7 @@ class GenericDataType(IntEnum): STRING = 1 TEMPORAL = 2 BOOLEAN = 3 - # ARRAY = 4 # Mapping all the complex data types to STRING for now + MULTI_VALUE = 4 # array-typed columns (e.g. ClickHouse Array, Postgres ARRAY) # JSON = 5 # and leaving these as a reminder. # MAP = 6 # ROW = 7 @@ -277,6 +277,7 @@ class FilterOperator(StrEnum): IS_TRUE = "IS TRUE" IS_FALSE = "IS FALSE" TEMPORAL_RANGE = "TEMPORAL_RANGE" + CONTAINS = "CONTAINS" # array membership, for MULTI_VALUE columns class FilterStringOperators(StrEnum): @@ -295,6 +296,7 @@ class FilterStringOperators(StrEnum): LATEST_PARTITION = ("LATEST_PARTITION",) IS_TRUE = ("IS_TRUE",) IS_FALSE = ("IS_FALSE",) + CONTAINS = ("CONTAINS",) class PostProcessingBoxplotWhiskerType(StrEnum): @@ -1271,6 +1273,22 @@ def is_adhoc_column(column: Column) -> TypeGuard[AdhocColumn]: ) +class MultiValueColumnOperation(StrEnum): + """Operations that can be applied to a multi-value (array) column.""" + + LENGTH = "LENGTH" + EXPLODE = "EXPLODE" + + +def is_multivalue_operation_column(column: Column) -> bool: + """Whether ``column`` is a multi-value modifier (e.g. array length). + + These columns carry a base ``column`` plus a ``columnOperation`` instead of a + ``sqlExpression``; the actual SQL is produced by the engine spec. + """ + return isinstance(column, dict) and "columnOperation" in column + + def is_base_axis(column: Column) -> bool: return is_adhoc_column(column) and column.get("columnType") == "BASE_AXIS" @@ -1676,7 +1694,7 @@ def get_column_name_from_column(column: Column) -> str | None: :param column: Physical and ad-hoc column :return: column name if physical column, otherwise None """ - if is_adhoc_column(column): + if is_adhoc_column(column) or is_multivalue_operation_column(column): return None return column # type: ignore diff --git a/tests/unit_tests/db_engine_specs/test_base.py b/tests/unit_tests/db_engine_specs/test_base.py index e46517a9927e..18b456a3837a 100644 --- a/tests/unit_tests/db_engine_specs/test_base.py +++ b/tests/unit_tests/db_engine_specs/test_base.py @@ -1360,3 +1360,19 @@ def test_resolve_column_type_falls_back_to_pa_mapped() -> None: def test_resolve_column_type_returns_none_when_both_absent() -> None: """None is returned when neither source provides a type.""" assert BaseEngineSpec.resolve_column_type(None, None) is None + + +def test_multivalue_columns_disabled_by_default() -> None: + """Engines must opt in to multi-value support; base defaults to off.""" + assert BaseEngineSpec.supports_multivalue_columns is False + + +@pytest.mark.parametrize("method", ["array_contains", "array_length", "array_explode"]) +def test_array_capabilities_raise_when_unsupported(method: str) -> None: + """Array capability methods raise NotImplementedError unless overridden.""" + from sqlalchemy import column + + fn = getattr(BaseEngineSpec, method) + args = (column("c"), "v") if method == "array_contains" else (column("c"),) + with pytest.raises(NotImplementedError): + fn(*args) diff --git a/tests/unit_tests/db_engine_specs/test_clickhouse.py b/tests/unit_tests/db_engine_specs/test_clickhouse.py index 5f56554e82d1..d77b03aacae6 100644 --- a/tests/unit_tests/db_engine_specs/test_clickhouse.py +++ b/tests/unit_tests/db_engine_specs/test_clickhouse.py @@ -109,7 +109,16 @@ def test_connect_convert_dttm( GenericDataType.STRING, False, ), - ("Array(UInt8)", String, None, GenericDataType.STRING, False), + ("Array(UInt8)", String, None, GenericDataType.MULTI_VALUE, False), + ("Array(String)", String, None, GenericDataType.MULTI_VALUE, False), + ("Array(UInt64)", String, None, GenericDataType.MULTI_VALUE, False), + ( + "Array(LowCardinality(String))", + String, + None, + GenericDataType.MULTI_VALUE, + False, + ), ("Enum('hello', 'world')", String, None, GenericDataType.STRING, False), ("Enum('UInt32', 'Bool')", String, None, GenericDataType.STRING, False), ( @@ -233,3 +242,62 @@ def test_adjust_engine_params_fully_qualified( uri = spec.adjust_engine_params(url, {}, None, schema)[0] assert str(uri) == expected_result + + +def _compile(expr) -> str: + return str(expr.compile(compile_kwargs={"literal_binds": True})) + + +def test_clickhouse_supports_multivalue_columns() -> None: + from superset.db_engine_specs.clickhouse import ( # noqa: N813 + ClickHouseEngineSpec as spec, + ) + + assert spec.supports_multivalue_columns is True + + +def test_multivalue_contains_sql() -> None: + from sqlalchemy import column + + from superset.db_engine_specs.clickhouse import ( # noqa: N813 + ClickHouseEngineSpec as spec, + ) + + expr = spec.array_contains(column("skills"), "Driver") + assert _compile(expr) == "has(skills, 'Driver')" + + +def test_multivalue_contains_binds_parameters() -> None: + """Value must be a bound parameter, not inlined (SQL-injection safety).""" + from sqlalchemy import column + + from superset.db_engine_specs.clickhouse import ( # noqa: N813 + ClickHouseEngineSpec as spec, + ) + + expr = spec.array_contains(column("skills"), "Driver") + compiled = expr.compile() + assert "Driver" not in str(compiled) + assert "Driver" in compiled.params.values() + + +def test_multivalue_length_sql() -> None: + from sqlalchemy import column + + from superset.db_engine_specs.clickhouse import ( # noqa: N813 + ClickHouseEngineSpec as spec, + ) + + expr = spec.array_length(column("skills")) + assert _compile(expr) == "length(skills)" + + +def test_multivalue_explode_sql() -> None: + from sqlalchemy import column + + from superset.db_engine_specs.clickhouse import ( # noqa: N813 + ClickHouseEngineSpec as spec, + ) + + expr = spec.array_explode(column("skills")) + assert _compile(expr) == "arrayJoin(skills)" diff --git a/tests/unit_tests/models/test_multivalue_explode.py b/tests/unit_tests/models/test_multivalue_explode.py new file mode 100644 index 000000000000..79645acdd3bd --- /dev/null +++ b/tests/unit_tests/models/test_multivalue_explode.py @@ -0,0 +1,169 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Explode (group by array elements) for multi-value columns — ClickHouse MVP. + +Set-returning UNNEST dialects (Postgres/Trino/BigQuery) need CROSS JOIN UNNEST +plumbing and are handled in a later phase; here we cover the scalar ClickHouse +``arrayJoin`` path plus the guard that keeps unimplemented dialects from emitting +invalid SQL. +""" + +from __future__ import annotations + +import pytest +from flask import Flask +from pytest_mock import MockerFixture + +from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn +from superset.db_engine_specs.base import BaseEngineSpec +from superset.exceptions import QueryObjectValidationError +from superset.models.core import Database +from superset.superset_typing import AdhocColumn, Column, QueryObjectDict +from superset.utils.core import MultiValueColumnOperation + + +class _UnnestOnlySpec(BaseEngineSpec): + """Simulates a future array dialect that has not implemented scalar explode.""" + + supports_multivalue_columns = True + # array_explode intentionally left as the base NotImplementedError + + +def _make_dataset(mocker: MockerFixture) -> SqlaTable: + database = Database( + id=1, + database_name="test_db", + sqlalchemy_uri="sqlite://", + ) + columns = [ + TableColumn(column_name="skills", type="Array(String)"), + TableColumn(column_name="city", type="VARCHAR(100)"), + ] + dataset = SqlaTable( + table_name="jobs", + columns=columns, + database=database, + metrics=[SqlMetric(metric_name="count", expression="COUNT(*)")], + ) + mocker.patch( + "superset.connectors.sqla.models.security_manager.get_guest_rls_filters", + return_value=[], + ) + mocker.patch( + "superset.connectors.sqla.models.security_manager.is_guest_user", + return_value=False, + ) + return dataset + + +def _explode_dimension() -> AdhocColumn: + return { + "label": "skill", + "column": "skills", + "columnOperation": MultiValueColumnOperation.EXPLODE.value, + } + + +def _query_obj(dimension: Column) -> QueryObjectDict: + return { + "granularity": None, + "from_dttm": None, + "to_dttm": None, + "is_timeseries": False, + "groupby": [dimension], + "metrics": ["count"], + "filter": [], + "columns": [], + } + + +def _clickhouse_spec() -> type: + # Imported lazily: clickhouse.py touches app.config at import time, which + # is unavailable at pytest collection once clickhouse-connect is installed. + from superset.db_engine_specs.clickhouse import ClickHouseEngineSpec + + return ClickHouseEngineSpec + + +def _with_spec(mocker: MockerFixture, dataset: SqlaTable, spec: type) -> None: + mocker.patch.object( + SqlaTable, + "db_engine_spec", + new=property(lambda self: spec), + ) + + +def test_explode_dimension_generates_arrayjoin( + mocker: MockerFixture, + app: Flask, +) -> None: + """An explode dimension routes through array_explode -> arrayJoin(skills).""" + dataset = _make_dataset(mocker) + _with_spec(mocker, dataset, _clickhouse_spec()) + with app.test_request_context(): + sql = dataset.get_query_str_extended( + _query_obj(_explode_dimension()), mutate=False + ).sql.lower() + + assert "arrayjoin(skills)" in sql + assert "skill" in sql # the label is applied + + +def test_explode_changes_generated_sql( + mocker: MockerFixture, + app: Flask, +) -> None: + """Exploding a column yields different SQL than grouping by it raw. + + The explode modifier is part of the query object, so a query that explodes + cannot collide with one that does not (distinct cache keys downstream). + """ + dataset = _make_dataset(mocker) + _with_spec(mocker, dataset, _clickhouse_spec()) + with app.test_request_context(): + exploded = dataset.get_query_str_extended( + _query_obj(_explode_dimension()), mutate=False + ).sql + plain = dataset.get_query_str_extended(_query_obj("city"), mutate=False).sql + + assert exploded != plain + assert "arrayJoin" in exploded + + +def test_explode_unsupported_engine_raises( + mocker: MockerFixture, + app: Flask, +) -> None: + """On an engine without array support (sqlite) explode is rejected.""" + dataset = _make_dataset(mocker) + with app.test_request_context(): # noqa: SIM117 + with pytest.raises(QueryObjectValidationError): + dataset.get_query_str_extended( + _query_obj(_explode_dimension()), mutate=False + ) + + +def test_explode_unimplemented_dialect_raises_clean_error( + mocker: MockerFixture, + app: Flask, +) -> None: + """A multi-value dialect lacking scalar explode gets a clear error, not bad SQL.""" + dataset = _make_dataset(mocker) + _with_spec(mocker, dataset, _UnnestOnlySpec) + with app.test_request_context(): # noqa: SIM117 + with pytest.raises(QueryObjectValidationError): + dataset.adhoc_column_to_sqla(col=_explode_dimension()) diff --git a/tests/unit_tests/models/test_multivalue_filter.py b/tests/unit_tests/models/test_multivalue_filter.py new file mode 100644 index 000000000000..db52e67b017a --- /dev/null +++ b/tests/unit_tests/models/test_multivalue_filter.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Stage B: the CONTAINS filter operator for multi-value (array) columns.""" + +from __future__ import annotations + +import pytest +from flask import Flask +from pytest_mock import MockerFixture + +from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn +from superset.exceptions import QueryObjectValidationError +from superset.models.core import Database +from superset.superset_typing import QueryObjectDict +from superset.utils.core import FilterOperator + + +def _make_dataset(mocker: MockerFixture) -> SqlaTable: + database = Database( + id=1, + database_name="test_db", + sqlalchemy_uri="sqlite://", + ) + columns = [ + TableColumn(column_name="skills", type="Array(String)"), + TableColumn(column_name="city", type="VARCHAR(100)"), + ] + dataset = SqlaTable( + table_name="jobs", + columns=columns, + database=database, + metrics=[SqlMetric(metric_name="count", expression="COUNT(*)")], + ) + mocker.patch( + "superset.connectors.sqla.models.security_manager.get_guest_rls_filters", + return_value=[], + ) + mocker.patch( + "superset.connectors.sqla.models.security_manager.is_guest_user", + return_value=False, + ) + return dataset + + +def _query_obj() -> QueryObjectDict: + return { + "granularity": None, + "from_dttm": None, + "to_dttm": None, + "is_timeseries": False, + "groupby": ["city"], + "metrics": ["count"], + "filter": [ + { + "col": "skills", + "op": FilterOperator.CONTAINS.value, + "val": "Driver", + } + ], + "columns": [], + } + + +def test_contains_operator_is_registered() -> None: + """The operator round-trips through the enum used to parse filter payloads.""" + assert FilterOperator("CONTAINS") is FilterOperator.CONTAINS + assert FilterOperator.CONTAINS.value == "CONTAINS" + + +def test_contains_filter_unsupported_engine_raises( + mocker: MockerFixture, + app: Flask, +) -> None: + """On an engine without array support (sqlite) CONTAINS is rejected.""" + dataset = _make_dataset(mocker) + with app.test_request_context(): # noqa: SIM117 + with pytest.raises(QueryObjectValidationError): + dataset.get_query_str_extended(_query_obj(), mutate=False) + + +def test_contains_filter_generates_native_sql( + mocker: MockerFixture, + app: Flask, +) -> None: + """On a multi-value-capable engine CONTAINS compiles to the native call. + + The dataset compiles against sqlite (so no ClickHouse driver is needed), but + its engine spec is ClickHouse, so the operator routes through + ``ClickHouseEngineSpec.array_contains`` -> ``has(...)``. ``func.has`` renders + the same regardless of dialect, which lets us assert on it here. + """ + # Imported lazily: clickhouse.py touches app.config at import time, which + # is unavailable at pytest collection once clickhouse-connect is installed. + from superset.db_engine_specs.clickhouse import ClickHouseEngineSpec + + dataset = _make_dataset(mocker) + mocker.patch.object( + SqlaTable, + "db_engine_spec", + new=property(lambda self: ClickHouseEngineSpec), + ) + + with app.test_request_context(): + extended = dataset.get_query_str_extended(_query_obj(), mutate=False) + sql = extended.sql + + assert "has(" in sql.lower() + assert "driver" in sql.lower() diff --git a/tests/unit_tests/models/test_multivalue_length.py b/tests/unit_tests/models/test_multivalue_length.py new file mode 100644 index 000000000000..67b4db1b3ac2 --- /dev/null +++ b/tests/unit_tests/models/test_multivalue_length.py @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Stage D: the array-length virtual dimension for multi-value columns.""" + +from __future__ import annotations + +import pytest +from flask import Flask +from pytest_mock import MockerFixture + +from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn +from superset.exceptions import QueryObjectValidationError +from superset.models.core import Database +from superset.superset_typing import AdhocColumn, Column, QueryObjectDict +from superset.utils.core import ( + GenericDataType, + get_column_name_from_column, + get_column_names_from_columns, + MultiValueColumnOperation, +) + + +def _make_dataset(mocker: MockerFixture) -> SqlaTable: + database = Database( + id=1, + database_name="test_db", + sqlalchemy_uri="sqlite://", + ) + columns = [ + TableColumn(column_name="skills", type="Array(String)"), + TableColumn(column_name="city", type="VARCHAR(100)"), + ] + dataset = SqlaTable( + table_name="jobs", + columns=columns, + database=database, + metrics=[SqlMetric(metric_name="count", expression="COUNT(*)")], + ) + mocker.patch( + "superset.connectors.sqla.models.security_manager.get_guest_rls_filters", + return_value=[], + ) + mocker.patch( + "superset.connectors.sqla.models.security_manager.is_guest_user", + return_value=False, + ) + return dataset + + +def _length_dimension() -> AdhocColumn: + return { + "label": "skills_length", + "column": "skills", + "columnOperation": MultiValueColumnOperation.LENGTH.value, + } + + +def _query_obj(dimension: Column) -> QueryObjectDict: + return { + "granularity": None, + "from_dttm": None, + "to_dttm": None, + "is_timeseries": False, + "groupby": [dimension], + "metrics": ["count"], + "filter": [], + "columns": [], + } + + +def _clickhouse_dataset(mocker: MockerFixture) -> SqlaTable: + # Imported lazily: clickhouse.py touches app.config at import time, which + # is unavailable at pytest collection once clickhouse-connect is installed. + from superset.db_engine_specs.clickhouse import ClickHouseEngineSpec + + dataset = _make_dataset(mocker) + mocker.patch.object( + SqlaTable, + "db_engine_spec", + new=property(lambda self: ClickHouseEngineSpec), + ) + return dataset + + +def test_multivalue_column_not_treated_as_physical_column() -> None: + """Regression: multi-value modifier columns must be ignored by the + physical-column extraction used by query-context validation, otherwise they + are wrongly reported as "Columns missing in dataset" on execution. + """ + col = _length_dimension() + assert get_column_name_from_column(col) is None + assert get_column_names_from_columns([col, "city"]) == ["city"] + + +def test_length_dimension_generates_native_sql( + mocker: MockerFixture, + app: Flask, +) -> None: + """A length dimension routes through array_length -> length(skills).""" + dataset = _clickhouse_dataset(mocker) + with app.test_request_context(): + extended = dataset.get_query_str_extended( + _query_obj(_length_dimension()), mutate=False + ) + sql = extended.sql.lower() + + assert "length(skills)" in sql + assert "skills_length" in sql # the label is applied + + +def test_length_dimension_type_is_numeric( + mocker: MockerFixture, + app: Flask, +) -> None: + """The derived length column is typed NUMERIC so it behaves like a number.""" + dataset = _clickhouse_dataset(mocker) + with app.test_request_context(): + _expr, generic_type = dataset.adhoc_column_to_sqla(col=_length_dimension()) + + assert generic_type == GenericDataType.NUMERIC + + +def test_length_dimension_unsupported_engine_raises( + mocker: MockerFixture, + app: Flask, +) -> None: + """On an engine without array support (sqlite) the length op is rejected.""" + dataset = _make_dataset(mocker) + with app.test_request_context(): # noqa: SIM117 + with pytest.raises(QueryObjectValidationError): + dataset.get_query_str_extended( + _query_obj(_length_dimension()), mutate=False + ) + + +def test_length_dimension_unknown_base_column_raises( + mocker: MockerFixture, + app: Flask, +) -> None: + """A length op referencing a missing base column is rejected.""" + dataset = _clickhouse_dataset(mocker) + bad: AdhocColumn = { + "label": "nope_length", + "column": "does_not_exist", + "columnOperation": MultiValueColumnOperation.LENGTH.value, + } + with app.test_request_context(): # noqa: SIM117 + with pytest.raises(QueryObjectValidationError): + dataset.get_query_str_extended(_query_obj(bad), mutate=False) + + +def test_unsupported_operation_raises( + mocker: MockerFixture, + app: Flask, +) -> None: + """An unknown columnOperation is rejected rather than silently ignored.""" + dataset = _clickhouse_dataset(mocker) + bad: AdhocColumn = { + "label": "skills_x", + "column": "skills", + "columnOperation": "NOT_A_REAL_OP", + } + with app.test_request_context(): # noqa: SIM117 + with pytest.raises(QueryObjectValidationError): + dataset.adhoc_column_to_sqla(col=bad)