Feat: Automatically trigger integration tests scoped to modified dialects (#6505)

erindru · web-flow · commit ee5e7b931ca7 · 2025-12-11T11:33:43.000+02:00
* Feat: Automatically trigger integration tests scoped to modified dialects

* revert temporary changes

* Output conclusion if no tests to run as well
diff --git a/.github/scripts/get_integration_test_params.py b/.github/scripts/get_integration_test_params.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+This script is intended to be used as part of a GitHub Actions workflow in order to decide if the integration tests should:
+
+a) be triggered at all, and
+b) if so, be triggered for a subset of dialects or for all dialects
+
+The tests can be triggered manually by using the following directive in either the PR description or a PR comment:
+
+ /integration-tests
+
+To limit them to a certain dialect or dialects, you can specify:
+
+ /integration-tests dialects=bigquery,duckdb
+
+If you specify nothing, a `git diff` will be performed between your PR branch and the base branch.
+If the path of any modified file contains one of the SUPPORTED_DIALECTS, that dialect will be added to the
+list of dialects to test. If no files match, the integration tests will be skipped.
+
+Note that integration tests in the remote workflow are only implemented for a subset of dialects.
+If new ones are added, update the SUPPORTED_DIALECTS constant below.
+
+Each dialect is tested against itself (roundtrip) and duckdb (transpilation).
+Supplying a dialect not in this list will cause the tests to get skipped.
+"""
+
+import typing as t
+import os
+import sys
+import json
+import subprocess
+from pathlib import Path
+
+TRIGGER = "/integration-test"
+SUPPORTED_DIALECTS = ["duckdb", "bigquery", "snowflake"]
+
+
+def get_dialects_from_manual_trigger(trigger: str) -> t.Set[str]:
+    """
+    Takes a trigger string and parses out the supported dialects
+
+    /integration-test -> set()
+    /integration-test dialects=bigquery -> {"bigquery"}
+    /integration-test dialects=bigquery,duckdb -> {"bigquery", "duckdb"}
+    /integration-test dialects=exasol,duckdb -> {"duckdb"}
+    """
+
+    if not trigger.startswith(TRIGGER):
+        raise ValueError(f"Invalid trigger: {trigger}")
+
+    # drop everything up to the first space (covers both /integration-test and /integration-tests)
+    trigger_parts = trigger.split(" ")[1:]
+
+    print(f"Parsing trigger args: {trigger_parts}")
+
+    dialects: t.List[str] = []
+    for part in trigger_parts:
+        # try to parse key=value pairs
+        maybe_kv = part.split("=", maxsplit=1)
+        if len(maybe_kv) >= 2:
+            k, v = maybe_kv[0], maybe_kv[1]
+            if k.lower().startswith("dialect"):
+                dialects.extend([d.lower().strip() for d in v.split(",")])
+
+    return {d for d in dialects if d in SUPPORTED_DIALECTS}
+
+
+def get_dialects_from_git(base_ref: str, current_ref: str) -> t.Set[str]:
+    """
+    Takes two git refs and runs `git diff --name-only <base_ref> <current_ref>`
+
+    If any of the returned file names contain a dialect from SUPPORTED_DIALECTS as
+    a substring, that dialect is included in the returned set
+    """
+    print(f"Checking for files changed between '{base_ref}' and '{current_ref}'")
+
+    result = subprocess.run(
+        ["git", "diff", "--name-only", base_ref, current_ref],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+    )
+    output = result.stdout.decode("utf8")
+
+    if result.returncode != 0:
+        raise ValueError(f"Git process failed with exit code {result.returncode}:\n{output}")
+
+    print(f"Git output:\n{output}")
+
+    matching_dialects = []
+
+    for l in output.splitlines():
+        l = l.strip().lower()
+
+        matching_dialects.extend([d for d in SUPPORTED_DIALECTS if d in l])
+
+    return set(matching_dialects)
+
+
+if __name__ == "__main__":
+    github_event_path = os.environ.get("GITHUB_EVENT_PATH")
+    github_sha = os.environ.get("GITHUB_SHA")
+    github_output = os.environ.get("GITHUB_OUTPUT")
+
+    if (
+        not os.environ.get("GITHUB_ACTIONS")
+        or not github_event_path
+        or not github_sha
+        or not github_output
+    ):
+        print(f"This script needs to run within GitHub Actions")
+        sys.exit(1)
+
+    github_event_path = Path(github_event_path)
+    github_output = Path(github_output)
+
+    with github_event_path.open("r") as f:
+        event: t.Dict[str, t.Any] = json.load(f)
+
+    print(f"Handling event: \n" + json.dumps(event, indent=2))
+
+    # for issue_comment events, the body is located at github.event.comment.body
+    # since issues and PRs are the same thing in the GH backend, we also have to check that the issue has a "pull_request" entry
+    comment_body = (
+        event.get("comment", {}).get("body") if event.get("issue", {}).get("pull_request") else None
+    )
+
+    # for pull_request events, the body is located at github.event.pull_request.body
+    pr_description = event.get("pull_request", {}).get("body")
+
+    dialects = []
+    should_run = False
+
+    text_blob = f"{comment_body or ''}{pr_description or ''}"
+    text_blob_lines = [l.strip().lower() for l in text_blob.splitlines()]
+    if trigger_line := [l for l in text_blob_lines if l.startswith(TRIGGER)]:
+        # if the user has explicitly requested /integration-tests then use that
+        print(f"Handling trigger line: {trigger_line[0]}")
+        dialects = get_dialects_from_manual_trigger(trigger_line[0])
+        should_run = True
+    else:
+        # otherwise, do a git diff and inspect the changed files
+        print(f"Explicit trigger line not detected; performing git diff")
+        pull_request_base_ref = event.get("pull_request", {}).get("base", {}).get("sha")
+        issue_comment_base_ref = event.get("before")
+
+        base_ref = pull_request_base_ref or issue_comment_base_ref
+        if not base_ref:
+            raise ValueError("Unable to determine base ref")
+
+        current_ref = github_sha
+        print(f"Comparing '{current_ref}' against '{base_ref}'")
+        # otherwise, look at git files changed and only trigger if a file relating
+        # to a supported dialect has changed
+        dialects = get_dialects_from_git(base_ref=base_ref, current_ref=github_sha)
+        if dialects:
+            should_run = True
+
+    if should_run:
+        dialects_str = (
+            f"the following dialects: {', '.join(dialects)}"
+            if dialects
+            else f"all supported dialects"
+        )
+        print(f"Conclusion: should run tests for {dialects_str}")
+    else:
+        print(f"Conclusion: No tests to run")
+
+    # write output variables
+    lines = []
+    if should_run:
+        lines.append("skip=false")
+        if dialects:
+            lines.append(f"dialects={','.join(dialects)}")
+    else:
+        lines.append("skip=true")
+
+    with github_output.open("a") as f:
+        f.writelines(f"{l}\n" for l in lines)
diff --git a/.github/workflows/run-integration-tests.yml b/.github/workflows/run-integration-tests.yml
@@ -10,15 +10,13 @@ on:
     types: [opened, synchronize, reopened]
 
 jobs:
-  run-integration-tests:
-    name: Run Integration Tests
+  should-run:
+    name: Check if integration tests should run
     runs-on: ubuntu-latest
-    if: |
-      (github.event_name == 'issue_comment' &&
-       contains(github.event.comment.body, '/integration-test') &&
-       github.event.issue.pull_request) ||
-      (github.event_name == 'pull_request' &&
-       contains(github.event.pull_request.body, '/integration-test'))
+    outputs:
+      skip: ${{ steps.test-parameters.outputs.skip }}
+      dialects: ${{ steps.test-parameters.outputs.dialects }}
+
     steps:
       - name: Print debugging info
         run: |
@@ -43,6 +41,29 @@ jobs:
 
             Ref Name: ${{ github.ref_name }}
           EOF
+
+      - name: Checkout Code
+        uses: actions/checkout@v5
+        with:
+          # we need to fetch the full history so we can run `git diff`
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+
+      - name: Check if integration tests should be run
+        id: test-parameters
+        run: |
+          python .github/scripts/get_integration_test_params.py
+
+  run-integration-tests:
+    name: Run Integration Tests
+    runs-on: ubuntu-latest
+    needs: should-run
+    if: needs.should-run.outputs.skip == 'false'
+    steps:
       - name: Acquire credentials
         id: app-token
         uses: actions/create-github-app-token@v2
@@ -70,7 +91,8 @@ jobs:
             -f sqlglot_ref=${{ github.sha }} \
             -f sqlglot_pr_number=${{ github.event.number || github.event.issue.number }} \
             -f sqlglot_branch_name=${{ github.head_ref || github.ref_name }} \
-            -f correlation_id="$CORRELATION_ID"
+            -f correlation_id="$CORRELATION_ID" \
+            -f dialects="${{ needs.should-run.outputs.dialects }}"
 
           echo "Triggered workflow using correlation id: $CORRELATION_ID"
 
@@ -126,11 +148,37 @@ jobs:
         with:
           script: |
             // summary.json is downloaded from the remote workflow in the previous step
-            const summary = require("./summary.json")
+            const summary = require("./summary.json");
+
+            // Add a unique identifier to find this comment later
+            const commentIdentifier = "<!-- integration-test-summary -->";
+            const body = `${commentIdentifier}\n${summary.msg}`;
 
-            github.rest.issues.createComment({
+            // Find existing comment
+            const { data: comments } = await github.rest.issues.listComments({
               issue_number: context.issue.number,
               owner: context.repo.owner,
               repo: context.repo.repo,
-              body: summary.msg
-            })
+            });
+
+            const existingComment = comments.find(comment =>
+              comment.body.includes(commentIdentifier)
+            );
+
+            if (existingComment) {
+              // Update existing comment
+              await github.rest.issues.updateComment({
+                comment_id: existingComment.id,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body
+              });
+            } else {
+              // Create new comment
+              await github.rest.issues.createComment({
+                issue_number: context.issue.number,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: body
+              });
+            }