From ea28ef41a4963a91beb4eb600326f4354a2937dc Mon Sep 17 00:00:00 2001 From: geooo109 Date: Fri, 5 Dec 2025 15:50:37 +0200 Subject: [PATCH 1/5] feat(optimizer)!: bq annotate type for NULL --- sqlglot/typing/bigquery.py | 1 + tests/fixtures/optimizer/annotate_types.sql | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/sqlglot/typing/bigquery.py b/sqlglot/typing/bigquery.py index afce03961d..8a3b94547c 100644 --- a/sqlglot/typing/bigquery.py +++ b/sqlglot/typing/bigquery.py @@ -160,6 +160,7 @@ def _annotate_array(self: TypeAnnotator, expression: exp.Array) -> exp.Array: exp.LaxInt64, exp.Length, exp.Ntile, + exp.Null, exp.Rank, exp.RangeBucket, exp.RegexpInstr, diff --git a/tests/fixtures/optimizer/annotate_types.sql b/tests/fixtures/optimizer/annotate_types.sql index a9fde2beea..9df23d2426 100644 --- a/tests/fixtures/optimizer/annotate_types.sql +++ b/tests/fixtures/optimizer/annotate_types.sql @@ -122,3 +122,7 @@ DATETIME; # dialect: bigquery CASE WHEN TRUE THEN TIMESTAMP '2020-02-02 00:00:00' ELSE '2010-01-01' END; TIMESTAMP; + +# dialect: bigquery +NULL; +BIGINT; From b879da30afdde857029f4536150edf2e67b70fcb Mon Sep 17 00:00:00 2001 From: geooo109 Date: Fri, 5 Dec 2025 15:54:46 +0200 Subject: [PATCH 2/5] refactor return type to be cleaner --- tests/fixtures/optimizer/annotate_types.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures/optimizer/annotate_types.sql b/tests/fixtures/optimizer/annotate_types.sql index 9df23d2426..6227f6f98a 100644 --- a/tests/fixtures/optimizer/annotate_types.sql +++ b/tests/fixtures/optimizer/annotate_types.sql @@ -125,4 +125,4 @@ TIMESTAMP; # dialect: bigquery NULL; -BIGINT; +INT64; From f20c0c67bf693cd0ad488158d6837c52d18e5709 Mon Sep 17 00:00:00 2001 From: geooo109 Date: Mon, 8 Dec 2025 16:10:07 +0200 Subject: [PATCH 3/5] refactor impl --- sqlglot/dialects/bigquery.py | 1 + sqlglot/dialects/dialect.py | 7 +++++++ sqlglot/optimizer/annotate_types.py | 6 +++--- sqlglot/typing/bigquery.py | 1 - tests/test_optimizer.py | 12 ++++++++++++ 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/sqlglot/dialects/bigquery.py b/sqlglot/dialects/bigquery.py index 8c23b9b1e4..0381bd54c5 100644 --- a/sqlglot/dialects/bigquery.py +++ b/sqlglot/dialects/bigquery.py @@ -371,6 +371,7 @@ class BigQuery(Dialect): EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = True QUERY_RESULTS_ARE_STRUCTS = True JSON_EXTRACT_SCALAR_SCALAR_ONLY = True + DEFAULT_TYPE_OF_NULL = exp.DataType.Type.BIGINT # https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#initcap INITCAP_DEFAULT_DELIMITER_CHARS = ' \t\n\r\f\v\\[\\](){}/|<>!?@"^#$&~_,.:;*%+\\-' diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index f1ada85545..d7ef7c3349 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -703,6 +703,13 @@ class Dialect(metaclass=_Dialect): so we map the ExplodingGenerateSeries expression to "generate_series" string. """ + DEFAULT_TYPE_OF_NULL = exp.DataType.Type.UNKNOWN + """ + The default type of NULL value, it is mostly used to aid type coercion, e.g. in query set operations. + + For example, in Bigquery the default type of a NULL value is INT64. + """ + # --- Autofilled --- tokenizer_class = Tokenizer diff --git a/sqlglot/optimizer/annotate_types.py b/sqlglot/optimizer/annotate_types.py index 950ea9acb1..b5e99ad71b 100644 --- a/sqlglot/optimizer/annotate_types.py +++ b/sqlglot/optimizer/annotate_types.py @@ -261,15 +261,15 @@ def annotate(self, expression: E, annotate_scope: bool = True) -> E: # This takes care of non-traversable expressions self._annotate_expression(expression) - # Replace NULL type with UNKNOWN, since the former is not an actual type; - # it is mostly used to aid type coercion, e.g. in query set operations. + # Replace NULL type with the default type of the targeted dialect, since the former is not an actual type. for expr in self._null_expressions.values(): - expr.type = exp.DataType.Type.UNKNOWN + expr.type = self.dialect.DEFAULT_TYPE_OF_NULL return expression def annotate_scope(self, scope: Scope) -> None: selects = {} + for name, source in scope.sources.items(): if not isinstance(source, Scope): continue diff --git a/sqlglot/typing/bigquery.py b/sqlglot/typing/bigquery.py index 8a3b94547c..afce03961d 100644 --- a/sqlglot/typing/bigquery.py +++ b/sqlglot/typing/bigquery.py @@ -160,7 +160,6 @@ def _annotate_array(self: TypeAnnotator, expression: exp.Array) -> exp.Array: exp.LaxInt64, exp.Length, exp.Ntile, - exp.Null, exp.Rank, exp.RangeBucket, exp.RegexpInstr, diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 05d0f79f6d..a1f6cbd445 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -1937,3 +1937,15 @@ def test_deep_ast_type_annotation(self): annotated = annotate_types(parse_one(binary_sql), schema={"t": {"a": "INT"}}) self.assertEqual(annotated.sql(), binary_sql) self.assertEqual(annotated.selects[0].type.this, exp.DataType.Type.INT) + + def test_null_coerce_annotation(self): + null_sql = "SELECT t.foo FROM (SELECT CAST(1 AS BIGDECIMAL) AS foo UNION ALL SELECT NULL AS foo) AS t" + annotated = parse_and_optimize(annotate_types, null_sql, "bigquery", dialect="bigquery") + + self.assertEqual(annotated.sql(), null_sql) + self.assertEqual(annotated.selects[0].type.this, exp.DataType.Type.BIGDECIMAL) + + null_sql = "SELECT t.foo FROM (SELECT NULL AS foo UNION ALL SELECT CAST(1 AS BIGDECIMAL) AS foo) AS t" + annotated = parse_and_optimize(annotate_types, null_sql, "bigquery", dialect="bigquery") + self.assertEqual(annotated.sql(), null_sql) + self.assertEqual(annotated.selects[0].type.this, exp.DataType.Type.BIGDECIMAL) From 52a122e51bb9ee1b68f789e65416e5c6ab77f769 Mon Sep 17 00:00:00 2001 From: geooo109 Date: Mon, 8 Dec 2025 16:14:38 +0200 Subject: [PATCH 4/5] refactor naming --- sqlglot/dialects/bigquery.py | 2 +- sqlglot/dialects/dialect.py | 6 +++--- sqlglot/optimizer/annotate_types.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sqlglot/dialects/bigquery.py b/sqlglot/dialects/bigquery.py index 0381bd54c5..34b951320a 100644 --- a/sqlglot/dialects/bigquery.py +++ b/sqlglot/dialects/bigquery.py @@ -371,7 +371,7 @@ class BigQuery(Dialect): EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = True QUERY_RESULTS_ARE_STRUCTS = True JSON_EXTRACT_SCALAR_SCALAR_ONLY = True - DEFAULT_TYPE_OF_NULL = exp.DataType.Type.BIGINT + DEFAULT_NULL_TYPE = exp.DataType.Type.BIGINT # https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#initcap INITCAP_DEFAULT_DELIMITER_CHARS = ' \t\n\r\f\v\\[\\](){}/|<>!?@"^#$&~_,.:;*%+\\-' diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index d7ef7c3349..a2e159a92b 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -703,11 +703,11 @@ class Dialect(metaclass=_Dialect): so we map the ExplodingGenerateSeries expression to "generate_series" string. """ - DEFAULT_TYPE_OF_NULL = exp.DataType.Type.UNKNOWN + DEFAULT_NULL_TYPE = exp.DataType.Type.UNKNOWN """ - The default type of NULL value, it is mostly used to aid type coercion, e.g. in query set operations. + The default type of NULL, it is mostly used to aid type coercion, e.g. in query set operations. - For example, in Bigquery the default type of a NULL value is INT64. + For example, in BigQuery the default type of the NULL value is INT64. """ # --- Autofilled --- diff --git a/sqlglot/optimizer/annotate_types.py b/sqlglot/optimizer/annotate_types.py index b5e99ad71b..530602e986 100644 --- a/sqlglot/optimizer/annotate_types.py +++ b/sqlglot/optimizer/annotate_types.py @@ -263,7 +263,7 @@ def annotate(self, expression: E, annotate_scope: bool = True) -> E: # Replace NULL type with the default type of the targeted dialect, since the former is not an actual type. for expr in self._null_expressions.values(): - expr.type = self.dialect.DEFAULT_TYPE_OF_NULL + expr.type = self.dialect.DEFAULT_NULL_TYPE return expression From 820f42b39a40b603a652f16e38bd7cdfc775894e Mon Sep 17 00:00:00 2001 From: geooo109 Date: Mon, 8 Dec 2025 16:19:13 +0200 Subject: [PATCH 5/5] refactor comments --- sqlglot/dialects/dialect.py | 2 +- sqlglot/optimizer/annotate_types.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py index a2e159a92b..4dc15d99f1 100644 --- a/sqlglot/dialects/dialect.py +++ b/sqlglot/dialects/dialect.py @@ -705,7 +705,7 @@ class Dialect(metaclass=_Dialect): DEFAULT_NULL_TYPE = exp.DataType.Type.UNKNOWN """ - The default type of NULL, it is mostly used to aid type coercion, e.g. in query set operations. + The default type of NULL for producing the correct projection type. For example, in BigQuery the default type of the NULL value is INT64. """ diff --git a/sqlglot/optimizer/annotate_types.py b/sqlglot/optimizer/annotate_types.py index 530602e986..39020155fb 100644 --- a/sqlglot/optimizer/annotate_types.py +++ b/sqlglot/optimizer/annotate_types.py @@ -261,7 +261,8 @@ def annotate(self, expression: E, annotate_scope: bool = True) -> E: # This takes care of non-traversable expressions self._annotate_expression(expression) - # Replace NULL type with the default type of the targeted dialect, since the former is not an actual type. + # Replace NULL type with the default type of the targeted dialect, since the former is not an actual type; + # it is mostly used to aid type coercion, e.g. in query set operations. for expr in self._null_expressions.values(): expr.type = self.dialect.DEFAULT_NULL_TYPE