diff --git a/scripts/optimization/README.md b/scripts/optimization/README.md index 2a28f070c..e1a5cad1a 100644 --- a/scripts/optimization/README.md +++ b/scripts/optimization/README.md @@ -5,18 +5,31 @@ named, `optimization_workshop`, with several tables inside the dataset. \ These tables are populated with information to help you optimize your BigQuery tables, views, and queries. -Run all the scripts within this folder using the following commands: +Run all the .sql scripts within this folder using the following commands: ```bash gcloud auth login && bash run_all_scripts.sh ``` -Run [Anti-pattern Recognition Tool](https://github.com/GoogleCloudPlatform/bigquery-antipattern-recognition/tree/main): +The `anti-pattern-recognittion-tool-scripts` subfolder contains additional scripts that need to be executed separately to run +the [Anti-pattern Recognition Tool](https://github.com/GoogleCloudPlatform/bigquery-antipattern-recognition/tree/main): ```bash -bash run_anti_pattern_tool.sh +bash ./anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh \ +--input_table_name="optimization_workshop.viewable_queries_grouped_by_hash" \ +--input_table_id_col_name="Query_Hash" \ +--input_table_query_text_col_name="Query_Raw_Sample" \ +--input_table_slots_col_name="Total_Slot_Hours" + +bash ./anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh \ +--input_table_name="optimization_workshop.queries_grouped_by_hash_project" \ +--input_table_id_col_name="query_hash" \ +--input_table_query_text_col_name="top_10_jobs[SAFE_OFFSET(0)].query_text" \ +--input_table_slots_col_name="avg_total_slots" ``` +The above command takes the `dataset.table` name of a table as input. It can be executed on any table with a schema similar to the one generated by `viewable_queries_grouped_by_hash`. + The scripts are described in more detail in the following sections. 
--- diff --git a/scripts/optimization/anti_pattern_recoginition_tool_tables.sql b/scripts/optimization/anti-pattern-recognittion-tool-scripts/anti_pattern_recoginition_tool_tables.sql similarity index 82% rename from scripts/optimization/anti_pattern_recoginition_tool_tables.sql rename to scripts/optimization/anti-pattern-recognittion-tool-scripts/anti_pattern_recoginition_tool_tables.sql index 9c61caff3..0ee864cf7 100644 --- a/scripts/optimization/anti_pattern_recoginition_tool_tables.sql +++ b/scripts/optimization/anti-pattern-recognittion-tool-scripts/anti_pattern_recoginition_tool_tables.sql @@ -26,16 +26,16 @@ CREATE OR REPLACE TABLE optimization_workshop.antipattern_output_table ( CREATE OR REPLACE VIEW optimization_workshop.antipattern_tool_input_view AS SELECT - Query_Hash id, - ANY_VALUE(Query_Raw_Sample) query, + id, + ANY_VALUE() query FROM - optimization_workshop.viewable_queries_grouped_by_hash + WHERE - Query_Hash is not null + is not null GROUP BY - Query_Hash + ORDER BY - ANY_VALUE(Total_Slot_Hours) desc + ANY_VALUE() desc LIMIT 1000 ; diff --git a/scripts/optimization/run_anti_pattern_tool.sh b/scripts/optimization/anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh similarity index 50% rename from scripts/optimization/run_anti_pattern_tool.sh rename to scripts/optimization/anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh index d1cef149b..dd608d237 100644 --- a/scripts/optimization/run_anti_pattern_tool.sh +++ b/scripts/optimization/anti-pattern-recognittion-tool-scripts/run_anti_pattern_tool.sh @@ -16,6 +16,35 @@ # Exit immediately if a command exits with a non-zero status. 
set -e
+
+# Parse the --name=value options; all four are required and checked after the loop
+for i in "$@"; do
+  case "$i" in
+    --input_table_name=*)
+      input_table_name="${i#*=}"
+      ;;
+    --input_table_id_col_name=*)
+      input_table_id_col_name="${i#*=}"
+      ;;
+    --input_table_query_text_col_name=*)
+      input_table_query_text_col_name="${i#*=}"
+      ;;
+    --input_table_slots_col_name=*)
+      input_table_slots_col_name="${i#*=}"
+      ;;
+    -*)
+      echo "Unknown option $i" >&2
+      exit 1
+      ;;
+    *) # positional arguments are ignored
+      ;;
+  esac
+done
+: "${input_table_name:?--input_table_name is required}"
+: "${input_table_id_col_name:?--input_table_id_col_name is required}"
+: "${input_table_query_text_col_name:?--input_table_query_text_col_name is required}"
+: "${input_table_slots_col_name:?--input_table_slots_col_name is required}"
+
 # Set the following flags for the bq command:
 # --quiet: suppress status updates while jobs are running
 # --nouse_legacy_sql: use standard SQL syntax
@@ -24,7 +53,13 @@ bq_flags="--quiet --nouse_legacy_sql --nouse_cache"
 # Run setup for anti pattern recognition tool
-bq query ${bq_flags} /$input_table_name/g" \
+  -e "s//$input_table_id_col_name/g" \
+  -e "s//$input_table_query_text_col_name/g" \
+  -e "s//$input_table_slots_col_name/g" \
+  "./anti-pattern-recognittion-tool-scripts/anti_pattern_recoginition_tool_tables.sql")
+
+bq query ${bq_flags} <<< "$anti_pattern_recoginition_tool_tables_sql"
 { # try
@@ -38,8 +73,11 @@ bq query ${bq_flags} /$input_table_name/g" \
+  -e "s//$input_table_id_col_name/g" \
+  "./anti-pattern-recognittion-tool-scripts/update_queries_by_hash_w_anti_patterns.sql")
+
 bq query ${bq_flags} <<< "$update_queries_by_hash_w_anti_patterns_sql"
 } || { # catch
 echo 'Error: could not run Anti-pattern Recognition Tool. 
Try using GCP Cloud Shell https://cloud.google.com/shell/docs/launching-cloud-shell' diff --git a/scripts/optimization/update_queries_by_hash_w_anti_patterns.sql b/scripts/optimization/anti-pattern-recognittion-tool-scripts/update_queries_by_hash_w_anti_patterns.sql similarity index 82% rename from scripts/optimization/update_queries_by_hash_w_anti_patterns.sql rename to scripts/optimization/anti-pattern-recognittion-tool-scripts/update_queries_by_hash_w_anti_patterns.sql index 2f7184bd1..ef39080c1 100644 --- a/scripts/optimization/update_queries_by_hash_w_anti_patterns.sql +++ b/scripts/optimization/anti-pattern-recognittion-tool-scripts/update_queries_by_hash_w_anti_patterns.sql @@ -14,10 +14,10 @@ * limitations under the License. */ -ALTER TABLE optimization_workshop.viewable_queries_grouped_by_hash +ALTER TABLE ADD COLUMN IF NOT EXISTS recommendation ARRAY>; -UPDATE optimization_workshop.viewable_queries_grouped_by_hash t1 +UPDATE t1 SET t1.recommendation = t2.recommendation FROM optimization_workshop.antipattern_output_table t2 -WHERE t1.Query_Hash = t2.job_id; +WHERE t1. = t2.job_id; diff --git a/scripts/optimization/queries_grouped_by_hash.sql b/scripts/optimization/queries_grouped_by_hash_org.sql similarity index 99% rename from scripts/optimization/queries_grouped_by_hash.sql rename to scripts/optimization/queries_grouped_by_hash_org.sql index 6b14d243c..6973d54e5 100644 --- a/scripts/optimization/queries_grouped_by_hash.sql +++ b/scripts/optimization/queries_grouped_by_hash_org.sql @@ -41,7 +41,7 @@ CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS ( ); CREATE SCHEMA IF NOT EXISTS optimization_workshop; -CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash AS +CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash_org AS SELECT statement_type, query_info.query_hashes.normalized_literals AS query_hash, @@ -53,8 +53,7 @@ SELECT ARRAY_AGG( STRUCT( bqutil.fn.job_url(project_id || ':us.' 
|| parent_job_id) AS parent_job_url, - bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url, - query as query_text + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url ) ORDER BY total_slot_ms DESC LIMIT 10) AS top_10_jobs, diff --git a/scripts/optimization/queries_grouped_by_hash_project.sql b/scripts/optimization/queries_grouped_by_hash_project.sql new file mode 100644 index 000000000..ba714daaf --- /dev/null +++ b/scripts/optimization/queries_grouped_by_hash_project.sql @@ -0,0 +1,85 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table named, queries_grouped_by_hash_project, + * which contains one row per statement type and query hash summarizing the + * query jobs created within the past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + * Queries are grouped by their normalized query pattern, which ignores + * comments, parameter values, UDFs, and literals in the query text. + * This allows us to group queries that are logically the same, but + * have different literals. 
+ *
+ * For example, the following queries would be grouped together:
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS (
+  COALESCE((
+    SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0))
+    FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0)
+  + COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0)
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash_project AS
+SELECT
+  statement_type,
+  query_info.query_hashes.normalized_literals AS query_hash,
+  COUNT(DISTINCT DATE(start_time)) AS days_active,
+  ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids,
+  ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids,
+  SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights,
+  COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count,
+  ARRAY_AGG(
+    STRUCT(
+      bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url,
+      bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
+      query as query_text
+    )
+    ORDER BY total_slot_ms
+    DESC LIMIT 10) AS top_10_jobs,
+  ARRAY_AGG(DISTINCT user_email) AS user_emails,
+  SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
+  AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed,
+  SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours,
+  AVG(total_slot_ms) / (1000 * 60 * 60) AS avg_total_slot_hours_per_active_day,
+  AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds,
+  ARRAY_AGG(DISTINCT FORMAT("%T",labels)) AS labels,
+  SUM(SAFE_DIVIDE(total_slot_ms, TIMESTAMP_DIFF(end_time, start_time, MILLISECOND))) AS total_slots, -- SAFE_DIVIDE yields NULL instead of a division-by-zero error for 0 ms jobs
+  AVG(SAFE_DIVIDE(total_slot_ms, TIMESTAMP_DIFF(end_time, start_time, MILLISECOND))) AS avg_total_slots, -- NULL ratios are ignored by SUM and AVG
+  -- query hashes will all have the same referenced tables so we can use ANY_VALUE below
+  ANY_VALUE(ARRAY(
+    SELECT
+      ref_table.project_id || '.' ||
+      IF(STARTS_WITH(ref_table.dataset_id, '_'), 'TEMP', ref_table.dataset_id)
+      || '.' || ref_table.table_id
+    FROM UNNEST(referenced_tables) ref_table
+  )) AS referenced_tables,
+FROM `region-us`.INFORMATION_SCHEMA.JOBS
+WHERE
+  DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+  AND state = 'DONE'
+  AND error_result IS NULL
+  AND job_type = 'QUERY'
+  AND statement_type != 'SCRIPT'
+GROUP BY statement_type, query_hash;
diff --git a/scripts/policy_tag_extractor/README.md b/scripts/policy_tag_extractor/README.md
new file mode 100644
index 000000000..377b87c65
--- /dev/null
+++ b/scripts/policy_tag_extractor/README.md
@@ -0,0 +1,24 @@
+# BigQuery Policy Tag Extractor
+
+## Introduction
+This directory contains the [policy_tag_export.sh](policy_tag_export.sh) bash script which extracts BigQuery policy tag information from a given dataset. The script will iterate through at most 10K tables in a dataset and then for every column with a policy tag, it will output the table name, column name, and policy tag ID in CSV format. 
+ +## Instructions for use +The simplest way to execute this script is to run it directly in Cloud Shell, but if needed it can be executed as part of a larger CI/CD pipeline or process. + +Before using, set the `DATASET` environment variable to the dataset that needs to be reviewed; if it is not set, the script prompts for the dataset name. + +To execute in Cloud Shell: +1. [Launch a Cloud Shell session](https://cloud.google.com/shell/docs/launching-cloud-shell) in the GCP project where your BigQuery data resides. + * When Cloud Shell is started, the active project in Cloud Shell is propagated to your gcloud configuration inside Cloud Shell for immediate use. GOOGLE_CLOUD_PROJECT, the environment variable used by Application Default Credentials library, is also set to point to the active project in Cloud Shell. You can also explicitly set the project using `gcloud config set project [PROJECT_ID]`. +1. [Upload](https://cloud.google.com/shell/docs/uploading-and-downloading-files#upload_and_download_files_and_folders) the policy_tag_export.sh script to the Cloud Shell environment. +1. Execute the script by running `bash policy_tag_export.sh`. +1. List the resources in Cloud Shell (ls) and verify that a file called "policy_tags.csv" was created. +1. [Download](https://cloud.google.com/shell/docs/uploading-and-downloading-files#upload_and_download_files_and_folders) the file. + +## Considerations +* Ensure either you or the service account executing the bash script has the bigquery.metadataViewer role to access the required level of information. +* Currently, the extractor only handles simple column types. RECORD type columns with nested policy tags are not supported. +* The extractor can identify specific policy tags on columns, but is limited to the information available to the bq command line tool. 
In it's current state, this is the full policy tag identifier:
+
+projects//locations//taxonomies//policyTags/
\ No newline at end of file
diff --git a/scripts/policy_tag_extractor/policy_tag_export.sh b/scripts/policy_tag_extractor/policy_tag_export.sh
new file mode 100644
index 000000000..26b09d8d3
--- /dev/null
+++ b/scripts/policy_tag_extractor/policy_tag_export.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Appends one "table,column,policy_tag" row to policy_tags.csv for every
+# policy tag found on a top-level column of each table in DATASET.
+# Requires the bq and jq command line tools.
+
+# Prompt user for DATASET value if not set
+if [ -z "${DATASET}" ]; then
+  read -r -p "Enter the BigQuery dataset name: " DATASET
+fi
+if [ -z "${DATASET}" ]; then
+  echo "Error: no dataset name provided." >&2
+  exit 1
+fi
+
+# Write all tables in the dataset to a reference TXT file. sed drops the first
+# two lines of the listing (presumably its header rows).
+bq --format=sparse ls --max_results=10000 "${DATASET}" | awk '{ print $1 }' | sed '1,2d' > table_list.txt
+
+# Loop through each table and export policy tags (if any) to a CSV
+echo "Writing to CSV..."
+while IFS= read -r TABLE; do
+  # Skip blank lines in the table listing.
+  [ -n "${TABLE}" ] || continue
+
+  # Fetch the table metadata once and let jq emit one row per tagged column and
+  # tag, so tables with several tagged columns no longer collapse into a single
+  # malformed row. stdin is redirected so bq cannot swallow the table names the
+  # loop is still reading.
+  bq show --format=prettyjson "${DATASET}.${TABLE}" < /dev/null |
+    jq -r --arg table "${TABLE}" '
+      .schema.fields[]?
+      | select((.policyTags.names // []) | length >= 1)
+      | .name as $column
+      | .policyTags.names[]
+      | "\($table),\($column),\(.)"'
+done < table_list.txt >> policy_tags.csv
+echo "Done."