+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000000000..35eb1ddfbbc02
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000000000..f5d97424c5047
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,22 @@
+{
+ // Use IntelliSense to learn about possible attributes.
+ // Hover to view descriptions of existing attributes.
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "(gdb) Attach Postgres",
+ "type": "cppdbg",
+ "request": "attach",
+ "program": "${workspaceFolder}/install/bin/postgres",
+ "MIMode": "gdb",
+ "setupCommands": [
+ {
+ "description": "Enable pretty-printing for gdb",
+ "text": "-enable-pretty-printing",
+ "ignoreFailures": true
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000000000..cc8a64fa9fa85
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+ "files.associations": {
+ "syscache.h": "c"
+ }
+}
\ No newline at end of file
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index f48da3185307c..52eea716cd1f1 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -180,6 +180,9 @@ typedef struct IndexAmRoutine
/* interface functions to support planning */
amtranslate_strategy_function amtranslatestrategy; /* can be NULL */
amtranslate_cmptype_function amtranslatecmptype; /* can be NULL */
+
+ /* interface function to compare datums on update */
+ amcomparedatums_function amcomparedatums; /* can be NULL */
} IndexAmRoutine;
@@ -915,6 +918,31 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype);
fully functional.
+
+
+bool
+amcomparedatums (Relation indexRelation,
+ int attnum,
+ Datum oldValue, bool oldIsNull,
+ Datum newValue, bool newIsNull);
+
+ Compare old and new datum values for a single index attribute to determine
+ whether the index entry needs to be updated. Returns true
+ if the two values are equal from the index's perspective and therefore
+ the index does not need to be updated for this attribute. This function
+ allows index access methods to use their own semantics for datum comparison,
+ which may differ from simple datumIsEqual comparison.
+ For example, an index that stores hashed values only needs to compare the
+ hash outputs, not the original values.
+
+
+
+ If the amcomparedatums field in
+ IndexAmRoutine is set to NULL, the system will
+ fall back to using a generic bitwise datum comparison for determining
+ whether an index update is needed during update optimization.
+
+
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000000000..545e2069cec6d
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,78 @@
+{
+ "nodes": {
+ "flake-utils": {
+ "inputs": {
+ "systems": "systems"
+ },
+ "locked": {
+ "lastModified": 1731533236,
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1764522689,
+ "narHash": "sha256-SqUuBFjhl/kpDiVaKLQBoD8TLD+/cTUzzgVFoaHrkqY=",
+ "owner": "NixOS",
+ "repo": "nixpkgs",
+ "rev": "8bb5646e0bed5dbd3ab08c7a7cc15b75ab4e1d0f",
+ "type": "github"
+ },
+ "original": {
+ "owner": "NixOS",
+ "ref": "nixos-25.11",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "nixpkgs-unstable": {
+ "locked": {
+ "lastModified": 1757651841,
+ "narHash": "sha256-Lh9QoMzTjY/O4LqNwcm6s/WSYStDmCH6f3V/izwlkHc=",
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "rev": "ad4e6dd68c30bc8bd1860a27bc6f0c485bd7f3b6",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nixos",
+ "ref": "nixpkgs-unstable",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "flake-utils": "flake-utils",
+ "nixpkgs": "nixpkgs",
+ "nixpkgs-unstable": "nixpkgs-unstable"
+ }
+ },
+ "systems": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000000000..0cd4a1bfb1701
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,45 @@
+{
+ description = "PostgreSQL development environment";
+
+ inputs = {
+ nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
+ nixpkgs-unstable.url = "github:nixos/nixpkgs/nixpkgs-unstable";
+ flake-utils.url = "github:numtide/flake-utils";
+ };
+
+ outputs = {
+ self,
+ nixpkgs,
+ nixpkgs-unstable,
+ flake-utils,
+ }:
+ flake-utils.lib.eachDefaultSystem (
+ system: let
+ pkgs = import nixpkgs {
+ inherit system;
+ config.allowUnfree = true;
+ };
+ pkgs-unstable = import nixpkgs-unstable {
+ inherit system;
+ config.allowUnfree = true;
+ };
+
+ shellConfig = import ./shell.nix {inherit pkgs pkgs-unstable system;};
+ in {
+ formatter = pkgs.alejandra;
+ devShells = {
+ default = shellConfig.devShell;
+ gcc = shellConfig.devShell;
+ clang = shellConfig.clangDevShell;
+ gcc-musl = shellConfig.muslDevShell;
+ clang-musl = shellConfig.clangMuslDevShell;
+ };
+
+ packages = {
+ inherit (shellConfig) gdbConfig flameGraphScript pgbenchScript;
+ };
+
+ environment.localBinInPath = true;
+ }
+ );
+}
diff --git a/glibc-no-fortify-warning.patch b/glibc-no-fortify-warning.patch
new file mode 100644
index 0000000000000..4657a12adbcc5
--- /dev/null
+++ b/glibc-no-fortify-warning.patch
@@ -0,0 +1,24 @@
+From 130c231020f97e5eb878cc9fdb2bd9b186a5aa04 Mon Sep 17 00:00:00 2001
+From: Greg Burd
+Date: Fri, 24 Oct 2025 11:58:24 -0400
+Subject: [PATCH] no warnings with -O0 and fortify source please
+
+---
+ include/features.h | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/include/features.h b/include/features.h
+index 673c4036..a02c8a3f 100644
+--- a/include/features.h
++++ b/include/features.h
+@@ -432,7 +432,6 @@
+
+ #if defined _FORTIFY_SOURCE && _FORTIFY_SOURCE > 0
+ # if !defined __OPTIMIZE__ || __OPTIMIZE__ <= 0
+-# warning _FORTIFY_SOURCE requires compiling with optimization (-O)
+ # elif !__GNUC_PREREQ (4, 1)
+ # warning _FORTIFY_SOURCE requires GCC 4.1 or later
+ # elif _FORTIFY_SOURCE > 2 && (__glibc_clang_prereq (9, 0) \
+--
+2.50.1
+
diff --git a/pg-aliases.sh b/pg-aliases.sh
new file mode 100644
index 0000000000000..59fccd8f44a50
--- /dev/null
+++ b/pg-aliases.sh
@@ -0,0 +1,439 @@
+# PostgreSQL Development Aliases
+
+# Build system management
+pg_clean_for_compiler() {
+ local current_compiler="$(basename $CC)"
+ local build_dir="$PG_BUILD_DIR"
+
+ if [ -f "$build_dir/compile_commands.json" ]; then
+ local last_compiler=$(grep -o '/[^/]*/bin/[gc]cc\|/[^/]*/bin/clang' "$build_dir/compile_commands.json" | head -1 | xargs basename 2>/dev/null || echo "unknown")
+
+ if [ "$last_compiler" != "$current_compiler" ] && [ "$last_compiler" != "unknown" ]; then
+ echo "Detected compiler change from $last_compiler to $current_compiler"
+ echo "Cleaning build directory..."
+ rm -rf "$build_dir"
+ mkdir -p "$build_dir"
+ fi
+ fi
+
+ mkdir -p "$build_dir"
+ echo "$current_compiler" >"$build_dir/.compiler_used"
+}
+
+# Core PostgreSQL commands
+pg-setup() {
+ if [ -z "$PERL_CORE_DIR" ]; then
+ echo "Error: Could not find perl CORE directory" >&2
+ return 1
+ fi
+ # Defined as a function rather than an alias so "return 1" above is valid.
+ pg_clean_for_compiler
+
+ echo "=== PostgreSQL Build Configuration ==="
+ echo "Compiler: $CC"
+ echo "LLVM: $(llvm-config --version 2>/dev/null || echo 'disabled')"
+ echo "Source: $PG_SOURCE_DIR"
+ echo "Build: $PG_BUILD_DIR"
+ echo "Install: $PG_INSTALL_DIR"
+ echo "======================================"
+ # --fatal-meson-warnings
+ # --buildtype=debugoptimized \
+ env CFLAGS="-I$PERL_CORE_DIR $CFLAGS" \
+ LDFLAGS="-L$PERL_CORE_DIR -lperl $LDFLAGS" \
+ meson setup $MESON_EXTRA_SETUP \
+ --reconfigure \
+ -Ddebug=true \
+ -Doptimization=0 \
+ -Db_coverage=false \
+ -Db_lundef=false \
+ -Dcassert=true \
+ -Ddocs_html_style=website \
+ -Ddocs_pdf=enabled \
+ -Dicu=enabled \
+ -Dinjection_points=true \
+ -Dldap=enabled \
+ -Dlibcurl=enabled \
+ -Dlibxml=enabled \
+ -Dlibxslt=enabled \
+ -Dllvm=auto \
+ -Dlz4=enabled \
+ -Dnls=enabled \
+ -Dplperl=enabled \
+ -Dplpython=enabled \
+ -Dpltcl=enabled \
+ -Dreadline=enabled \
+ -Dssl=openssl \
+ -Dtap_tests=enabled \
+ -Duuid=e2fs \
+ -Dzstd=enabled \
+ --prefix="$PG_INSTALL_DIR" \
+ "$PG_BUILD_DIR" \
+ "$PG_SOURCE_DIR"; }
+
+alias pg-compdb='compdb -p build/ list > compile_commands.json'
+alias pg-build='meson compile -C "$PG_BUILD_DIR"'
+alias pg-install='meson install -C "$PG_BUILD_DIR"'
+alias pg-test='meson test -q --print-errorlogs -C "$PG_BUILD_DIR"'
+
+# Clean commands
+alias pg-clean='ninja -C "$PG_BUILD_DIR" clean'
+alias pg-full-clean='rm -rf "$PG_BUILD_DIR" "$PG_INSTALL_DIR" && echo "Build and install directories cleaned"'
+
+# Database management
+alias pg-init='rm -rf "$PG_DATA_DIR" && "$PG_INSTALL_DIR/bin/initdb" --debug --no-clean "$PG_DATA_DIR"'
+alias pg-start='"$PG_INSTALL_DIR/bin/postgres" -D "$PG_DATA_DIR" -k "$PG_DATA_DIR"'
+alias pg-stop='pkill -f "postgres.*-D.*$PG_DATA_DIR" || true'
+alias pg-restart='pg-stop && sleep 2 && pg-start'
+alias pg-status='pgrep -f "postgres.*-D.*$PG_DATA_DIR" && echo "PostgreSQL is running" || echo "PostgreSQL is not running"'
+
+# Client connections
+alias pg-psql='"$PG_INSTALL_DIR/bin/psql" -h "$PG_DATA_DIR" postgres'
+alias pg-createdb='"$PG_INSTALL_DIR/bin/createdb" -h "$PG_DATA_DIR"'
+alias pg-dropdb='"$PG_INSTALL_DIR/bin/dropdb" -h "$PG_DATA_DIR"'
+
+# Debugging
+alias pg-debug-gdb='gdb -x "$GDBINIT" "$PG_INSTALL_DIR/bin/postgres"'
+alias pg-debug-lldb='lldb "$PG_INSTALL_DIR/bin/postgres"'
+alias pg-debug='
+ if command -v gdb >/dev/null 2>&1; then
+ pg-debug-gdb
+ elif command -v lldb >/dev/null 2>&1; then
+ pg-debug-lldb
+ else
+ echo "No debugger available (gdb or lldb required)"
+ fi'
+
+# Attach to running process
+alias pg-attach-gdb='
+ PG_PID=$(pgrep -f "postgres.*-D.*$PG_DATA_DIR" | head -1)
+ if [ -n "$PG_PID" ]; then
+ echo "Attaching GDB to PostgreSQL process $PG_PID"
+ gdb -x "$GDBINIT" -p "$PG_PID"
+ else
+ echo "No PostgreSQL process found"
+ fi'
+
+alias pg-attach-lldb='
+ PG_PID=$(pgrep -f "postgres.*-D.*$PG_DATA_DIR" | head -1)
+ if [ -n "$PG_PID" ]; then
+ echo "Attaching LLDB to PostgreSQL process $PG_PID"
+ lldb -p "$PG_PID"
+ else
+ echo "No PostgreSQL process found"
+ fi'
+
+alias pg-attach='
+ if command -v gdb >/dev/null 2>&1; then
+ pg-attach-gdb
+ elif command -v lldb >/dev/null 2>&1; then
+ pg-attach-lldb
+ else
+ echo "No debugger available (gdb or lldb required)"
+ fi'
+
+# Performance profiling and analysis
+alias pg-valgrind='valgrind --tool=memcheck --leak-check=full --show-leak-kinds=all "$PG_INSTALL_DIR/bin/postgres" -D "$PG_DATA_DIR"'
+alias pg-strace='strace -f -o /tmp/postgres.strace "$PG_INSTALL_DIR/bin/postgres" -D "$PG_DATA_DIR"'
+
+# Flame graph generation
+alias pg-flame='pg-flame-generate'
+alias pg-flame-30='pg-flame-generate 30'
+alias pg-flame-60='pg-flame-generate 60'
+alias pg-flame-120='pg-flame-generate 120'
+
+# Custom flame graph with specific duration and output
+pg-flame-custom() {
+ local duration=${1:-30}
+ local output_dir=${2:-$PG_FLAME_DIR}
+ echo "Generating flame graph for ${duration}s, output to: $output_dir"
+ pg-flame-generate "$duration" "$output_dir"
+}
+
+# Benchmarking with pgbench
+alias pg-bench='pg-bench-run'
+alias pg-bench-quick='pg-bench-run 5 1 100 1 30 select-only'
+alias pg-bench-standard='pg-bench-run 10 2 1000 10 60 tpcb-like'
+alias pg-bench-heavy='pg-bench-run 50 4 5000 100 300 tpcb-like'
+alias pg-bench-readonly='pg-bench-run 20 4 2000 50 120 select-only'
+
+# Custom benchmark function
+pg-bench-custom() {
+ local clients=${1:-10}
+ local threads=${2:-2}
+ local transactions=${3:-1000}
+ local scale=${4:-10}
+ local duration=${5:-60}
+ local test_type=${6:-tpcb-like}
+
+ echo "Running custom benchmark:"
+ echo " Clients: $clients, Threads: $threads"
+ echo " Transactions: $transactions, Scale: $scale"
+ echo " Duration: ${duration}s, Type: $test_type"
+
+ pg-bench-run "$clients" "$threads" "$transactions" "$scale" "$duration" "$test_type"
+}
+
+# Benchmark with flame graph
+pg-bench-flame() {
+ local duration=${1:-60}
+ local clients=${2:-10}
+ local scale=${3:-10}
+
+ echo "Running benchmark with flame graph generation"
+ echo "Duration: ${duration}s, Clients: $clients, Scale: $scale"
+
+ # Start benchmark in background
+ pg-bench-run "$clients" 2 1000 "$scale" "$duration" tpcb-like &
+ local bench_pid=$!
+
+ # Wait a bit for benchmark to start
+ sleep 5
+
+ # Generate flame graph for most of the benchmark duration
+ local flame_duration=$((duration - 10))
+ if [ $flame_duration -gt 10 ]; then
+ pg-flame-generate "$flame_duration" &
+ local flame_pid=$!
+ fi
+
+ # Wait for benchmark to complete
+ wait $bench_pid
+
+ # Wait for flame graph if it was started
+ if [ -n "${flame_pid:-}" ]; then
+ wait $flame_pid
+ fi
+
+ echo "Benchmark and flame graph generation completed"
+}
+
+# Performance monitoring
+alias pg-perf='perf top -p $(pgrep -f "postgres.*-D.*$PG_DATA_DIR" | head -1)'
+alias pg-htop='htop -p $(pgrep -f "postgres.*-D.*$PG_DATA_DIR" | tr "\n" "," | sed "s/,$//")'
+
+# System performance stats during PostgreSQL operation
+pg-stats() {
+ local duration=${1:-30}
+ echo "Collecting system stats for ${duration}s..."
+
+ iostat -x 1 "$duration" >"$PG_BENCH_DIR/iostat_$(date +%Y%m%d_%H%M%S).log" &
+ vmstat 1 "$duration" >"$PG_BENCH_DIR/vmstat_$(date +%Y%m%d_%H%M%S).log" &
+
+ wait
+ echo "System stats saved to $PG_BENCH_DIR"
+}
+
+# Development helpers
+pg-format() {
+ local since=${1:-HEAD}
+
+ if [ ! -f "$PG_SOURCE_DIR/src/tools/pgindent/pgindent" ]; then
+ echo "Error: pgindent not found at $PG_SOURCE_DIR/src/tools/pgindent/pgindent"
+ else
+
+ modified_files=$(git diff --name-only "${since}" | grep -E "\.c$|\.h$")
+
+ if [ -z "$modified_files" ]; then
+ echo "No modified .c or .h files found"
+ else
+
+ echo "Formatting modified files with pgindent:"
+ for file in $modified_files; do
+ if [ -f "$file" ]; then
+ echo " Formatting: $file"
+ "$PG_SOURCE_DIR/src/tools/pgindent/pgindent" "$file"
+ else
+ echo " Warning: File not found: $file"
+ fi
+ done
+
+ echo "Checking files for whitespace:"
+ git diff --check "${since}"
+ fi
+ fi
+}
+
+alias pg-tidy='find "$PG_SOURCE_DIR" -name "*.c" | head -10 | xargs clang-tidy'
+
+# Log management
+alias pg-log='tail -f "$PG_DATA_DIR/log/postgresql-$(date +%Y-%m-%d).log" 2>/dev/null || echo "No log file found"'
+alias pg-log-errors='grep -i error "$PG_DATA_DIR/log/"*.log 2>/dev/null || echo "No error logs found"'
+
+# Build logs
+alias pg-build-log='cat "$PG_BUILD_DIR/meson-logs/meson-log.txt"'
+alias pg-build-errors='grep -i error "$PG_BUILD_DIR/meson-logs/meson-log.txt" 2>/dev/null || echo "No build errors found"'
+
+# Results viewing
+alias pg-bench-results='ls -la "$PG_BENCH_DIR" && echo "Latest results:" && tail -20 "$PG_BENCH_DIR"/results_*.txt 2>/dev/null | tail -20'
+alias pg-flame-results='ls -la "$PG_FLAME_DIR" && echo "Open flame graphs with: firefox $PG_FLAME_DIR/*.svg"'
+
+# Clean up old results
+pg-clean-results() {
+ local days=${1:-7}
+ echo "Cleaning benchmark and flame graph results older than $days days..."
+ find "$PG_BENCH_DIR" -type f -mtime +$days -delete 2>/dev/null || true
+ find "$PG_FLAME_DIR" -type f -mtime +$days -delete 2>/dev/null || true
+ echo "Cleanup completed"
+}
+
+# Information
+# Test failure analysis and debugging
+pg-retest() {
+ local testlog="$PG_BUILD_DIR/meson-logs/testlog.txt"
+ # A function (not an alias): "local" and "return" require a function scope.
+ if [ ! -f "$testlog" ]; then
+ echo "No test log found at $testlog"
+ echo "Run pg-test first to generate test results"
+ return 1
+ fi
+
+ echo "Finding failed tests..."
+ local failed_tests=$(grep "^FAIL" "$testlog" | awk "{print \$2}" | sort -u)
+
+ if [ -z "$failed_tests" ]; then
+ echo "No failed tests found!"
+ return 0
+ fi
+
+ local count=$(echo "$failed_tests" | wc -l)
+ echo "Found $count failed test(s). Re-running one at a time..."
+ echo ""
+
+ for test in $failed_tests; do
+ echo "========================================"
+ echo "Running: $test"
+ echo "========================================"
+ meson test -C "$PG_BUILD_DIR" "$test" --print-errorlogs
+ echo ""
+ done
+}
+
+pg_meld_test() {
+ local test_name="$1"
+ local testrun_dir="$PG_BUILD_DIR/testrun"
+
+ # Function to find expected and actual output files for a test
+ find_test_files() {
+ local tname="$1"
+ local expected=""
+ local actual=""
+
+ # Try to find in testrun directory structure
+ # Pattern: testrun///results/*.out vs src/test//expected/*.out
+ for suite_dir in "$testrun_dir"/*; do
+ if [ -d "$suite_dir" ]; then
+ local suite=$(basename "$suite_dir")
+ local test_dir="$suite_dir/$tname"
+
+ if [ -d "$test_dir/results" ]; then
+ local result_file=$(find "$test_dir/results" -name "*.out" -o -name "*.diff" | head -1)
+
+ if [ -n "$result_file" ]; then
+ # Found actual output, now find expected
+ local base_name=$(basename "$result_file" .out)
+ base_name=$(basename "$base_name" .diff)
+
+ # Look for expected file
+ if [ -f "$PG_SOURCE_DIR/src/test/$suite/expected/${base_name}.out" ]; then
+ expected="$PG_SOURCE_DIR/src/test/$suite/expected/${base_name}.out"
+ actual="$result_file"
+ break
+ fi
+ fi
+ fi
+ fi
+ done
+
+ if [ -n "$expected" ] && [ -n "$actual" ]; then
+ echo "$expected|$actual"
+ return 0
+ fi
+ return 1
+ }
+
+ if [ -n "$test_name" ]; then
+ # Single test specified
+ local files=$(find_test_files "$test_name")
+
+ if [ -z "$files" ]; then
+ echo "Could not find test output files for: $test_name"
+ return 1
+ fi
+
+ local expected=$(echo "$files" | cut -d"|" -f1)
+ local actual=$(echo "$files" | cut -d"|" -f2)
+
+ echo "Opening meld for test: $test_name"
+ echo "Expected: $expected"
+ echo "Actual: $actual"
+ nohup meld "$expected" "$actual" >/dev/null 2>&1 &
+ else
+ # No test specified - find all failed tests
+ local testlog="$PG_BUILD_DIR/meson-logs/testlog.txt"
+
+ if [ ! -f "$testlog" ]; then
+ echo "No test log found. Run pg-test first."
+ return 1
+ fi
+
+ local failed_tests=$(grep "^FAIL" "$testlog" | awk "{print \$2}" | sort -u)
+
+ if [ -z "$failed_tests" ]; then
+ echo "No failed tests found!"
+ return 0
+ fi
+
+ echo "Opening meld for all failed tests..."
+ local opened=0
+
+ for test in $failed_tests; do
+ local files=$(find_test_files "$test")
+
+ if [ -n "$files" ]; then
+ local expected=$(echo "$files" | cut -d"|" -f1)
+ local actual=$(echo "$files" | cut -d"|" -f2)
+
+ echo " $test: $expected vs $actual"
+ nohup meld "$expected" "$actual" >/dev/null 2>&1 &
+ opened=$((opened + 1))
+ sleep 0.5 # Small delay to avoid overwhelming the system
+ fi
+ done
+
+ if [ $opened -eq 0 ]; then
+ echo "Could not find output files for any failed tests"
+ return 1
+ fi
+
+ echo "Opened $opened meld session(s)"
+ fi
+}
+
+alias pg-meld="pg_meld_test"
+
+alias pg-info='
+ echo "=== PostgreSQL Development Environment ==="
+ echo "Source: $PG_SOURCE_DIR"
+ echo "Build: $PG_BUILD_DIR"
+ echo "Install: $PG_INSTALL_DIR"
+ echo "Data: $PG_DATA_DIR"
+ echo "Benchmarks: $PG_BENCH_DIR"
+ echo "Flame graphs: $PG_FLAME_DIR"
+ echo "Compiler: $CC"
+ echo ""
+ echo "Available commands:"
+ echo " Setup: pg-setup, pg-build, pg-install"
+ echo " Testing: pg-test, pg-retest, pg-meld"
+ echo " Database: pg-init, pg-start, pg-stop, pg-psql"
+ echo " Debug: pg-debug, pg-attach, pg-valgrind"
+ echo " Performance: pg-flame, pg-bench, pg-perf"
+ echo " Benchmarks: pg-bench-quick, pg-bench-standard, pg-bench-heavy"
+ echo " Flame graphs: pg-flame-30, pg-flame-60, pg-flame-custom"
+ echo " Combined: pg-bench-flame"
+ echo " Results: pg-bench-results, pg-flame-results"
+ echo " Logs: pg-log, pg-build-log"
+ echo " Clean: pg-clean, pg-full-clean, pg-clean-results"
+ echo " Code quality: pg-format, pg-tidy"
+ echo "=========================================="'
+
+echo "PostgreSQL aliases loaded. Run 'pg-info' for available commands."
diff --git a/shell.nix b/shell.nix
new file mode 100644
index 0000000000000..5a1c18596234c
--- /dev/null
+++ b/shell.nix
@@ -0,0 +1,820 @@
+{
+ pkgs,
+ pkgs-unstable,
+ system,
+}: let
+ # Create a patched glibc only for the dev shell
+ patchedGlibc = pkgs.glibc.overrideAttrs (oldAttrs: {
+ patches = (oldAttrs.patches or []) ++ [
+ ./glibc-no-fortify-warning.patch
+ ];
+ });
+
+ llvmPkgs = pkgs-unstable.llvmPackages_21;
+
+ # Configuration constants
+ config = {
+ pgSourceDir = "$PWD";
+ pgBuildDir = "$PWD/build";
+ pgInstallDir = "$PWD/install";
+ pgDataDir = "/tmp/test-db-$(basename $PWD)";
+ pgBenchDir = "/tmp/pgbench-results-$(basename $PWD)";
+ pgFlameDir = "/tmp/flame-graphs-$(basename $PWD)";
+ };
+
+ # Single dependency function that can be used for all environments
+ getPostgreSQLDeps = muslLibs:
+ with pkgs;
+ [
+ # Build system (always use host tools)
+ pkgs-unstable.meson
+ pkgs-unstable.ninja
+ pkg-config
+ autoconf
+ libtool
+ git
+ which
+ binutils
+ gnumake
+
+ # Parser/lexer tools
+ bison
+ flex
+
+ # Documentation
+ docbook_xml_dtd_45
+ docbook-xsl-nons
+ fop
+ gettext
+ libxslt
+ libxml2
+
+ # Development tools (always use host tools)
+ coreutils
+ shellcheck
+ ripgrep
+ valgrind
+ curl
+ uv
+ pylint
+ black
+ lcov
+ strace
+ ltrace
+ perf-tools
+ perf
+ flamegraph
+ htop
+ iotop
+ sysstat
+ ccache
+ cppcheck
+ compdb
+
+ # GCC/GDB
+# pkgs-unstable.gcc15
+ gcc
+ gdb
+
+ # LLVM toolchain
+ llvmPkgs.llvm
+ llvmPkgs.llvm.dev
+ llvmPkgs.clang-tools
+ llvmPkgs.lldb
+
+ # Language support
+ (perl.withPackages (ps: with ps; [IPCRun]))
+ (python3.withPackages (ps: with ps; [requests browser-cookie3]))
+ tcl
+ ]
+ ++ (
+ if muslLibs
+ then [
+ # Musl target libraries for cross-compilation
+ pkgs.pkgsMusl.readline
+ pkgs.pkgsMusl.zlib
+ pkgs.pkgsMusl.openssl
+ pkgs.pkgsMusl.icu
+ pkgs.pkgsMusl.lz4
+ pkgs.pkgsMusl.zstd
+ pkgs.pkgsMusl.libuuid
+ pkgs.pkgsMusl.libkrb5
+ pkgs.pkgsMusl.linux-pam
+ pkgs.pkgsMusl.libxcrypt
+ ]
+ else [
+ # Glibc target libraries
+ readline
+ zlib
+ openssl
+ icu
+ lz4
+ zstd
+ libuuid
+ libkrb5
+ linux-pam
+ libxcrypt
+ numactl
+ openldap
+ liburing
+ libselinux
+ patchedGlibc
+ glibcInfo
+ glibc.dev
+ ]
+ );
+
+ # GDB configuration for PostgreSQL debugging
+ gdbConfig = pkgs.writeText "gdbinit-postgres" ''
+ # PostgreSQL-specific GDB configuration
+
+ # Pretty-print PostgreSQL data structures
+ define print_node
+ if $arg0
+ printf "Node type: %s\n", nodeTagNames[$arg0->type]
+ print *$arg0
+ else
+ printf "NULL node\n"
+ end
+ end
+ document print_node
+ Print a PostgreSQL Node with type information
+ Usage: print_node
+ end
+
+ define print_list
+ set $list = (List*)$arg0
+ if $list
+ printf "List length: %d\n", $list->length
+ set $cell = $list->head
+ set $i = 0
+ while $cell && $i < $list->length
+ printf " [%d]: ", $i
+ print_node $cell->data.ptr_value
+ set $cell = $cell->next
+ set $i = $i + 1
+ end
+ else
+ printf "NULL list\n"
+ end
+ end
+ document print_list
+ Print a PostgreSQL List structure
+ Usage: print_list
+ end
+
+ define print_query
+ set $query = (Query*)$arg0
+ if $query
+ printf "Query type: %d, command type: %d\n", $query->querySource, $query->commandType
+ print *$query
+ else
+ printf "NULL query\n"
+ end
+ end
+ document print_query
+ Print a PostgreSQL Query structure
+ Usage: print_query
+ end
+
+ define print_relcache
+ set $rel = (Relation)$arg0
+ if $rel
+ printf "Relation: %s.%s (OID: %u)\n", $rel->rd_rel->relnamespace, $rel->rd_rel->relname.data, $rel->rd_id
+ printf " natts: %d, relkind: %c\n", $rel->rd_rel->relnatts, $rel->rd_rel->relkind
+ else
+ printf "NULL relation\n"
+ end
+ end
+ document print_relcache
+ Print relation cache entry information
+ Usage: print_relcache
+ end
+
+ define print_tupdesc
+ set $desc = (TupleDesc)$arg0
+ if $desc
+ printf "TupleDesc: %d attributes\n", $desc->natts
+ set $i = 0
+ while $i < $desc->natts
+ set $attr = $desc->attrs[$i]
+ printf " [%d]: %s (type: %u, len: %d)\n", $i, $attr->attname.data, $attr->atttypid, $attr->attlen
+ set $i = $i + 1
+ end
+ else
+ printf "NULL tuple descriptor\n"
+ end
+ end
+ document print_tupdesc
+ Print tuple descriptor information
+ Usage: print_tupdesc
+ end
+
+ define print_slot
+ set $slot = (TupleTableSlot*)$arg0
+ if $slot
+ printf "TupleTableSlot: %s\n", $slot->tts_ops->name
+ printf " empty: %d, shouldFree: %d\n", $slot->tts_empty, $slot->tts_shouldFree
+ if $slot->tts_tupleDescriptor
+ print_tupdesc $slot->tts_tupleDescriptor
+ end
+ else
+ printf "NULL slot\n"
+ end
+ end
+ document print_slot
+ Print tuple table slot information
+ Usage: print_slot
+ end
+
+ # Memory context debugging
+ define print_mcxt
+ set $context = (MemoryContext)$arg0
+ if $context
+ printf "MemoryContext: %s\n", $context->name
+ printf " type: %s, parent: %p\n", $context->methods->name, $context->parent
+ printf " total: %zu, free: %zu\n", $context->mem_allocated, $context->freep - $context->freeptr
+ else
+ printf "NULL memory context\n"
+ end
+ end
+ document print_mcxt
+ Print memory context information
+ Usage: print_mcxt
+ end
+
+ # Process debugging
+ define print_proc
+ set $proc = (PGPROC*)$arg0
+ if $proc
+ printf "PGPROC: pid=%d, database=%u\n", $proc->pid, $proc->databaseId
+ printf " waiting: %d, waitStatus: %d\n", $proc->waiting, $proc->waitStatus
+ else
+ printf "NULL process\n"
+ end
+ end
+ document print_proc
+ Print process information
+ Usage: print_proc
+ end
+
+ # Set useful defaults
+ set print pretty on
+ set print object on
+ set print static-members off
+ set print vtbl on
+ set print demangle on
+ set demangle-style gnu-v3
+ set print sevenbit-strings off
+ set history save on
+ set history size 1000
+ set history filename ~/.gdb_history_postgres
+
+ # Common breakpoints for PostgreSQL debugging
+ define pg_break_common
+ break elog
+ break errfinish
+ break ExceptionalCondition
+ break ProcessInterrupts
+ end
+ document pg_break_common
+ Set common PostgreSQL debugging breakpoints
+ end
+
+ printf "PostgreSQL GDB configuration loaded.\n"
+ printf "Available commands: print_node, print_list, print_query, print_relcache,\n"
+ printf " print_tupdesc, print_slot, print_mcxt, print_proc, pg_break_common\n"
+ '';
+
+ # Flame graph generation script
+ flameGraphScript = pkgs.writeScriptBin "pg-flame-generate" ''
+ #!${pkgs.bash}/bin/bash
+ set -euo pipefail
+
+ DURATION=''${1:-30}
+ OUTPUT_DIR=''${2:-${config.pgFlameDir}}
+ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+
+ mkdir -p "$OUTPUT_DIR"
+
+ echo "Generating flame graph for PostgreSQL (duration: ''${DURATION}s)"
+
+ # Find PostgreSQL processes
+ PG_PIDS=$(pgrep -f "postgres.*-D.*${config.pgDataDir}" || true)
+
+ if [ -z "$PG_PIDS" ]; then
+ echo "Error: No PostgreSQL processes found"
+ exit 1
+ fi
+
+ echo "Found PostgreSQL processes: $PG_PIDS"
+
+ # Record perf data
+ PERF_DATA="$OUTPUT_DIR/perf_$TIMESTAMP.data"
+ echo "Recording perf data to $PERF_DATA"
+
+ ${pkgs.perf}/bin/perf record \
+ -F 997 \
+ -g \
+ --call-graph dwarf \
+ -p "$(echo $PG_PIDS | tr ' ' ',')" \
+ -o "$PERF_DATA" \
+ sleep "$DURATION"
+
+ # Generate flame graph
+ FLAME_SVG="$OUTPUT_DIR/postgres_flame_$TIMESTAMP.svg"
+ echo "Generating flame graph: $FLAME_SVG"
+
+ ${pkgs.perf}/bin/perf script -i "$PERF_DATA" | \
+ ${pkgs.flamegraph}/bin/stackcollapse-perf.pl | \
+ ${pkgs.flamegraph}/bin/flamegraph.pl \
+ --title "PostgreSQL Flame Graph ($TIMESTAMP)" \
+ --width 1200 \
+ --height 800 \
+ > "$FLAME_SVG"
+
+ echo "Flame graph generated: $FLAME_SVG"
+ echo "Perf data saved: $PERF_DATA"
+
+ # Generate summary report
+ REPORT="$OUTPUT_DIR/report_$TIMESTAMP.txt"
+ echo "Generating performance report: $REPORT"
+
+ {
+ echo "PostgreSQL Performance Analysis Report"
+ echo "Generated: $(date)"
+ echo "Duration: ''${DURATION}s"
+ echo "Processes: $PG_PIDS"
+ echo ""
+ echo "=== Top Functions ==="
+ ${pkgs.perf}/bin/perf report -i "$PERF_DATA" --stdio --sort comm,dso,symbol | head -50
+ echo ""
+ echo "=== Call Graph ==="
+ ${pkgs.perf}/bin/perf report -i "$PERF_DATA" --stdio -g --sort comm,dso,symbol | head -100
+ } > "$REPORT"
+
+ echo "Report generated: $REPORT"
+ echo ""
+ echo "Files created:"
+ echo " Flame graph: $FLAME_SVG"
+ echo " Perf data: $PERF_DATA"
+ echo " Report: $REPORT"
+ '';
+
+ # pgbench wrapper script
+ pgbenchScript = pkgs.writeScriptBin "pg-bench-run" ''
+ #!${pkgs.bash}/bin/bash
+ set -euo pipefail
+
+ # Default parameters
+ CLIENTS=''${1:-10}
+ THREADS=''${2:-2}
+ TRANSACTIONS=''${3:-1000}
+ SCALE=''${4:-10}
+ DURATION=''${5:-60}
+ TEST_TYPE=''${6:-tpcb-like}
+
+ OUTPUT_DIR="${config.pgBenchDir}"
+ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+
+ mkdir -p "$OUTPUT_DIR"
+
+ echo "=== PostgreSQL Benchmark Configuration ==="
+ echo "Clients: $CLIENTS"
+ echo "Threads: $THREADS"
+ echo "Transactions: $TRANSACTIONS"
+ echo "Scale factor: $SCALE"
+ echo "Duration: ''${DURATION}s"
+ echo "Test type: $TEST_TYPE"
+ echo "Output directory: $OUTPUT_DIR"
+ echo "============================================"
+
+ # Check if PostgreSQL is running
+ if ! pgrep -f "postgres.*-D.*${config.pgDataDir}" >/dev/null; then
+ echo "Error: PostgreSQL is not running. Start it with 'pg-start'"
+ exit 1
+ fi
+
+ PGBENCH="${config.pgInstallDir}/bin/pgbench"
+ PSQL="${config.pgInstallDir}/bin/psql"
+ CREATEDB="${config.pgInstallDir}/bin/createdb"
+ DROPDB="${config.pgInstallDir}/bin/dropdb"
+
+ DB_NAME="pgbench_test_$TIMESTAMP"
+ RESULTS_FILE="$OUTPUT_DIR/results_$TIMESTAMP.txt"
+ LOG_FILE="$OUTPUT_DIR/pgbench_$TIMESTAMP.log"
+
+ echo "Creating test database: $DB_NAME"
+ "$CREATEDB" -h "${config.pgDataDir}" "$DB_NAME" || {
+ echo "Failed to create database"
+ exit 1
+ }
+
+ # Initialize pgbench tables
+ echo "Initializing pgbench tables (scale factor: $SCALE)"
+ "$PGBENCH" -h "${config.pgDataDir}" -i -s "$SCALE" "$DB_NAME" || {
+ echo "Failed to initialize pgbench tables"
+ "$DROPDB" -h "${config.pgDataDir}" "$DB_NAME" 2>/dev/null || true
+ exit 1
+ }
+
+ # Run benchmark based on test type
+ echo "Running benchmark..."
+
+ case "$TEST_TYPE" in
+ "tpcb-like"|"default")
+ BENCH_ARGS=""
+ ;;
+ "select-only")
+ BENCH_ARGS="-S"
+ ;;
+ "simple-update")
+ BENCH_ARGS="-N"
+ ;;
+ "read-write")
+ BENCH_ARGS="-b select-only@70 -b tpcb-like@30"
+ ;;
+ *)
+ echo "Unknown test type: $TEST_TYPE"
+ echo "Available types: tpcb-like, select-only, simple-update, read-write"
+ "$DROPDB" -h "${config.pgDataDir}" "$DB_NAME" 2>/dev/null || true
+ exit 1
+ ;;
+ esac
+
+ {
+ echo "PostgreSQL Benchmark Results"
+ echo "Generated: $(date)"
+ echo "Test type: $TEST_TYPE"
+ echo "Clients: $CLIENTS, Threads: $THREADS"
+ echo "Transactions: $TRANSACTIONS, Duration: ''${DURATION}s"
+ echo "Scale factor: $SCALE"
+ echo "Database: $DB_NAME"
+ echo ""
+ echo "=== System Information ==="
+ echo "CPU: $(nproc) cores"
+ echo "Memory: $(free -h | grep '^Mem:' | awk '{print $2}')"
+ echo "Compiler: $CC"
+ echo "PostgreSQL version: $("$PSQL" --no-psqlrc -h "${config.pgDataDir}" -d "$DB_NAME" -t -c "SELECT version();" | head -1)"
+ echo ""
+ echo "=== Benchmark Results ==="
+ } > "$RESULTS_FILE"
+
+ # Run the actual benchmark
+ "$PGBENCH" \
+ -h "${config.pgDataDir}" \
+ -c "$CLIENTS" \
+ -j "$THREADS" \
+ -T "$DURATION" \
+ -P 5 \
+ --log \
+ --log-prefix="$OUTPUT_DIR/pgbench_$TIMESTAMP" \
+ $BENCH_ARGS \
+ "$DB_NAME" 2>&1 | tee -a "$RESULTS_FILE"
+
+ # Collect additional statistics
+ {
+ echo ""
+ echo "=== Database Statistics ==="
+ "$PSQL" --no-psqlrc -h "${config.pgDataDir}" -d "$DB_NAME" -c "
+ SELECT
+ schemaname,
+ relname,
+ n_tup_ins as inserts,
+ n_tup_upd as updates,
+ n_tup_del as deletes,
+ n_live_tup as live_tuples,
+ n_dead_tup as dead_tuples
+ FROM pg_stat_user_tables;
+ "
+
+ echo ""
+ echo "=== Index Statistics ==="
+ "$PSQL" --no-psqlrc -h "${config.pgDataDir}" -d "$DB_NAME" -c "
+ SELECT
+ schemaname,
+ relname,
+ indexrelname,
+ idx_scan,
+ idx_tup_read,
+ idx_tup_fetch
+ FROM pg_stat_user_indexes;
+ "
+ } >> "$RESULTS_FILE"
+
+ # Clean up
+ echo "Cleaning up test database: $DB_NAME"
+ "$DROPDB" -h "${config.pgDataDir}" "$DB_NAME" 2>/dev/null || true
+
+ echo ""
+ echo "Benchmark completed!"
+ echo "Results saved to: $RESULTS_FILE"
+ echo "Transaction logs: $OUTPUT_DIR/pgbench_$TIMESTAMP*"
+
+ # Show summary
+ echo ""
+ echo "=== Quick Summary ==="
+ grep -E "(tps|latency)" "$RESULTS_FILE" | tail -5
+ '';
+
+ # Development shell (GCC + glibc)
+ devShell = pkgs.mkShell {
+ name = "postgresql-dev";
+ buildInputs =
+ (getPostgreSQLDeps false)
+ ++ [
+ flameGraphScript
+ pgbenchScript
+ ];
+
+ shellHook = let
+ icon = "f121";
+ in ''
+ # History configuration
+ export HISTFILE=.history
+ export HISTSIZE=1000000
+ export HISTFILESIZE=1000000
+
+ # Clean environment
+ unset LD_LIBRARY_PATH LD_PRELOAD LIBRARY_PATH C_INCLUDE_PATH CPLUS_INCLUDE_PATH
+
+ # Essential tools in PATH
+ export PATH="${pkgs.which}/bin:${pkgs.coreutils}/bin:$PATH"
+ export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} ($(git rev-parse --abbrev-ref HEAD)) \\$ \[$(tput sgr0)\]"
+
+ # Ccache configuration
+ export PATH=${pkgs.ccache}/bin:$PATH
+ export CCACHE_COMPILERCHECK=content
+ export CCACHE_DIR=$HOME/.ccache/pg/$(basename $PWD)
+ mkdir -p "$CCACHE_DIR"
+
+ # LLVM configuration
+ export LLVM_CONFIG="${llvmPkgs.llvm}/bin/llvm-config"
+ export PATH="${llvmPkgs.llvm}/bin:$PATH"
+ export PKG_CONFIG_PATH="${llvmPkgs.llvm.dev}/lib/pkgconfig:$PKG_CONFIG_PATH"
+ export LLVM_DIR="${llvmPkgs.llvm.dev}/lib/cmake/llvm"
+ export LLVM_ROOT="${llvmPkgs.llvm}"
+
+ # Development tools in PATH
+ export PATH=${pkgs.clang-tools}/bin:$PATH
+ export PATH=${pkgs.cppcheck}/bin:$PATH
+
+ # PostgreSQL Development CFLAGS
+ # -DRELCACHE_FORCE_RELEASE -DCATCACHE_FORCE_RELEASE -fno-omit-frame-pointer -fno-stack-protector -DUSE_VALGRIND
+ export CFLAGS=""
+ export CXXFLAGS=""
+
+ # Python UV
+ UV_PYTHON_DOWNLOADS=never
+
+ # GCC configuration (default compiler)
+ export CC="${pkgs.gcc}/bin/gcc"
+ export CXX="${pkgs.gcc}/bin/g++"
+
+ # PostgreSQL environment
+ export PG_SOURCE_DIR="${config.pgSourceDir}"
+ export PG_BUILD_DIR="${config.pgBuildDir}"
+ export PG_INSTALL_DIR="${config.pgInstallDir}"
+ export PG_DATA_DIR="${config.pgDataDir}"
+ export PG_BENCH_DIR="${config.pgBenchDir}"
+ export PG_FLAME_DIR="${config.pgFlameDir}"
+ export PERL_CORE_DIR=$(find ${pkgs.perl} -maxdepth 5 -path "*/CORE" -type d)
+
+ # GDB configuration
+ export GDBINIT="${gdbConfig}"
+
+ # Performance tools in PATH
+ export PATH="${flameGraphScript}/bin:${pgbenchScript}/bin:$PATH"
+
+ # Create output directories
+ mkdir -p "$PG_BENCH_DIR" "$PG_FLAME_DIR"
+
+ # Compiler verification
+ echo "Environment configured:"
+ echo " Compiler: $CC"
+ echo " libc: glibc"
+ echo " LLVM: $(llvm-config --version 2>/dev/null || echo 'not available')"
+
+ # Load PostgreSQL development aliases
+ if [ -f ./pg-aliases.sh ]; then
+ source ./pg-aliases.sh
+ else
+ echo "Warning: pg-aliases.sh not found in current directory"
+ fi
+
+ echo ""
+ echo "PostgreSQL Development Environment Ready (GCC + glibc)"
+ echo "Run 'pg-info' for available commands"
+ '';
+ };
+
+ # Clang + glibc variant
+ clangDevShell = pkgs.mkShell {
+ name = "postgresql-clang-glibc";
+ buildInputs =
+ (getPostgreSQLDeps false)
+ ++ [
+ llvmPkgs.clang
+ llvmPkgs.lld
+ llvmPkgs.compiler-rt
+ flameGraphScript
+ pgbenchScript
+ ];
+
+ shellHook = let
+ icon = "f121";
+ in ''
+ # History configuration
+ export HISTFILE=.history
+ export HISTSIZE=1000000
+ export HISTFILESIZE=1000000
+
+ # Clean environment
+ unset LD_LIBRARY_PATH LD_PRELOAD LIBRARY_PATH C_INCLUDE_PATH CPLUS_INCLUDE_PATH
+
+ # Essential tools in PATH
+ export PATH="${pkgs.which}/bin:${pkgs.coreutils}/bin:$PATH"
+ export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} ($(git rev-parse --abbrev-ref HEAD)) \\$ \[$(tput sgr0)\]"
+
+ # Ccache configuration
+ export PATH=${pkgs.ccache}/bin:$PATH
+ export CCACHE_COMPILERCHECK=content
+ export CCACHE_DIR=$HOME/.ccache_pg_dev_clang
+ mkdir -p "$CCACHE_DIR"
+
+ # LLVM configuration
+ export LLVM_CONFIG="${llvmPkgs.llvm}/bin/llvm-config"
+ export PATH="${llvmPkgs.llvm}/bin:$PATH"
+ export PKG_CONFIG_PATH="${llvmPkgs.llvm.dev}/lib/pkgconfig:$PKG_CONFIG_PATH"
+ export LLVM_DIR="${llvmPkgs.llvm.dev}/lib/cmake/llvm"
+ export LLVM_ROOT="${llvmPkgs.llvm}"
+
+ # Development tools in PATH
+ export PATH=${pkgs.clang-tools}/bin:$PATH
+ export PATH=${pkgs.cppcheck}/bin:$PATH
+
+ # Clang + glibc configuration - use system linker instead of LLD for compatibility
+ export CC="${llvmPkgs.clang}/bin/clang"
+ export CXX="${llvmPkgs.clang}/bin/clang++"
+
+ # Use system linker and standard runtime
+ #export CFLAGS=""
+ #export CXXFLAGS=""
+ #export LDFLAGS=""
+
+ # PostgreSQL environment
+ export PG_SOURCE_DIR="${config.pgSourceDir}"
+ export PG_BUILD_DIR="${config.pgBuildDir}"
+ export PG_INSTALL_DIR="${config.pgInstallDir}"
+ export PG_DATA_DIR="${config.pgDataDir}"
+ export PG_BENCH_DIR="${config.pgBenchDir}"
+ export PG_FLAME_DIR="${config.pgFlameDir}"
+ export PERL_CORE_DIR=$(find ${pkgs.perl} -maxdepth 5 -path "*/CORE" -type d)
+
+ # GDB configuration
+ export GDBINIT="${gdbConfig}"
+
+ # Performance tools in PATH
+ export PATH="${flameGraphScript}/bin:${pgbenchScript}/bin:$PATH"
+
+ # Create output directories
+ mkdir -p "$PG_BENCH_DIR" "$PG_FLAME_DIR"
+
+ # Compiler verification
+ echo "Environment configured:"
+ echo " Compiler: $CC"
+ echo " libc: glibc"
+ echo " LLVM: $(llvm-config --version 2>/dev/null || echo 'not available')"
+
+ # Load PostgreSQL development aliases
+ if [ -f ./pg-aliases.sh ]; then
+ source ./pg-aliases.sh
+ else
+ echo "Warning: pg-aliases.sh not found in current directory"
+ fi
+
+ echo ""
+ echo "PostgreSQL Development Environment Ready (Clang + glibc)"
+ echo "Run 'pg-info' for available commands"
+ '';
+ };
+
+ # GCC + musl variant (cross-compilation)
+ muslDevShell = pkgs.mkShell {
+ name = "postgresql-gcc-musl";
+ buildInputs =
+ (getPostgreSQLDeps true)
+ ++ [
+ pkgs.gcc
+ flameGraphScript
+ pgbenchScript
+ ];
+
+ shellHook = ''
+ # Same base configuration as main shell
+ export HISTFILE=.history
+ export HISTSIZE=1000000
+ export HISTFILESIZE=1000000
+
+ unset LD_LIBRARY_PATH LD_PRELOAD LIBRARY_PATH C_INCLUDE_PATH CPLUS_INCLUDE_PATH
+
+ export PATH="${pkgs.which}/bin:${pkgs.coreutils}/bin:$PATH"
+
+ # Cross-compilation to musl
+ export CC="${pkgs.gcc}/bin/gcc"
+ export CXX="${pkgs.gcc}/bin/g++"
+
+ # Point to musl libraries for linking
+ export PKG_CONFIG_PATH="${pkgs.pkgsMusl.openssl.dev}/lib/pkgconfig:${pkgs.pkgsMusl.zlib.dev}/lib/pkgconfig:${pkgs.pkgsMusl.icu.dev}/lib/pkgconfig"
+ export CFLAGS="-ggdb -Og -fno-omit-frame-pointer -DUSE_VALGRIND -D_FORTIFY_SOURCE=1 -I${pkgs.pkgsMusl.stdenv.cc.libc}/include"
+ export CXXFLAGS="-ggdb -Og -fno-omit-frame-pointer -DUSE_VALGRIND -D_FORTIFY_SOURCE=1 -I${pkgs.pkgsMusl.stdenv.cc.libc}/include"
+ export LDFLAGS="-L${pkgs.pkgsMusl.stdenv.cc.libc}/lib -static-libgcc"
+
+ # PostgreSQL environment
+ export PG_SOURCE_DIR="${config.pgSourceDir}"
+ export PG_BUILD_DIR="${config.pgBuildDir}"
+ export PG_INSTALL_DIR="${config.pgInstallDir}"
+ export PG_DATA_DIR="${config.pgDataDir}"
+ export PG_BENCH_DIR="${config.pgBenchDir}"
+ export PG_FLAME_DIR="${config.pgFlameDir}"
+ export PERL_CORE_DIR=$(find ${pkgs.perl} -maxdepth 5 -path "*/CORE" -type d)
+
+ export GDBINIT="${gdbConfig}"
+ export PATH="${flameGraphScript}/bin:${pgbenchScript}/bin:$PATH"
+
+ mkdir -p "$PG_BENCH_DIR" "$PG_FLAME_DIR"
+
+ echo "GCC + musl environment configured"
+ echo " Compiler: $CC"
+ echo " LibC: musl (cross-compilation)"
+
+ if [ -f ./pg-aliases.sh ]; then
+ source ./pg-aliases.sh
+ fi
+
+ echo "PostgreSQL Development Environment Ready (GCC + musl)"
+ '';
+ };
+
+ # Clang + musl variant (cross-compilation)
+ clangMuslDevShell = pkgs.mkShell {
+ name = "postgresql-clang-musl";
+ buildInputs =
+ (getPostgreSQLDeps true)
+ ++ [
+ llvmPkgs.clang
+ llvmPkgs.lld
+ flameGraphScript
+ pgbenchScript
+ ];
+
+ shellHook = let
+ icon = "f121";
+ in ''
+ export HISTFILE=.history
+ export HISTSIZE=1000000
+ export HISTFILESIZE=1000000
+
+ unset LD_LIBRARY_PATH LD_PRELOAD LIBRARY_PATH C_INCLUDE_PATH CPLUS_INCLUDE_PATH
+
+ export PATH="${pkgs.which}/bin:${pkgs.coreutils}/bin:$PATH"
+ export PS1="$(echo -e '\u${icon}') {\[$(tput sgr0)\]\[\033[38;5;228m\]\w\[$(tput sgr0)\]\[\033[38;5;15m\]} ($(git rev-parse --abbrev-ref HEAD)) \\$ \[$(tput sgr0)\]"
+
+ # Cross-compilation to musl with clang
+ export CC="${llvmPkgs.clang}/bin/clang"
+ export CXX="${llvmPkgs.clang}/bin/clang++"
+
+ # Point to musl libraries for linking
+ export PKG_CONFIG_PATH="${pkgs.pkgsMusl.openssl.dev}/lib/pkgconfig:${pkgs.pkgsMusl.zlib.dev}/lib/pkgconfig:${pkgs.pkgsMusl.icu.dev}/lib/pkgconfig"
+ export CFLAGS="--target=x86_64-linux-musl -ggdb -Og -fno-omit-frame-pointer -DUSE_VALGRIND -D_FORTIFY_SOURCE=1 -I${pkgs.pkgsMusl.stdenv.cc.libc}/include"
+ export CXXFLAGS="--target=x86_64-linux-musl -ggdb -Og -fno-omit-frame-pointer -DUSE_VALGRIND -D_FORTIFY_SOURCE=1 -I${pkgs.pkgsMusl.stdenv.cc.libc}/include"
+ export LDFLAGS="--target=x86_64-linux-musl -L${pkgs.pkgsMusl.stdenv.cc.libc}/lib -fuse-ld=lld"
+
+ # PostgreSQL environment
+ export PG_SOURCE_DIR="${config.pgSourceDir}"
+ export PG_BUILD_DIR="${config.pgBuildDir}"
+ export PG_INSTALL_DIR="${config.pgInstallDir}"
+ export PG_DATA_DIR="${config.pgDataDir}"
+ export PG_BENCH_DIR="${config.pgBenchDir}"
+ export PG_FLAME_DIR="${config.pgFlameDir}"
+ export PERL_CORE_DIR=$(find ${pkgs.perl} -maxdepth 5 -path "*/CORE" -type d)
+
+ export GDBINIT="${gdbConfig}"
+ export PATH="${flameGraphScript}/bin:${pgbenchScript}/bin:$PATH"
+
+ mkdir -p "$PG_BENCH_DIR" "$PG_FLAME_DIR"
+
+ echo "Clang + musl environment configured"
+ echo " Compiler: $CC"
+ echo " LibC: musl (cross-compilation)"
+
+ if [ -f ./pg-aliases.sh ]; then
+ source ./pg-aliases.sh
+ fi
+
+ echo "PostgreSQL Development Environment Ready (Clang + musl)"
+ '';
+ };
+in {
+ inherit devShell clangDevShell muslDevShell clangMuslDevShell gdbConfig flameGraphScript pgbenchScript;
+}
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 1909c3254b5ba..768b65592046a 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -305,6 +305,7 @@ brinhandler(PG_FUNCTION_ARGS)
.amparallelrescan = NULL,
.amtranslatestrategy = NULL,
.amtranslatecmptype = NULL,
+ .amcomparedatums = NULL,
};
PG_RETURN_POINTER(&amroutine);
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index ff927279cc39a..d787460bb4171 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -26,6 +26,7 @@
#include "storage/indexfsm.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/typcache.h"
@@ -89,6 +90,7 @@ ginhandler(PG_FUNCTION_ARGS)
.amestimateparallelscan = NULL,
.aminitparallelscan = NULL,
.amparallelrescan = NULL,
+ .amcomparedatums = gincomparedatums,
};
PG_RETURN_POINTER(&amroutine);
@@ -692,3 +694,84 @@ ginbuildphasename(int64 phasenum)
return NULL;
}
}
+
+/*
+ * gincomparedatums - Compare datums to determine if they produce identical keys
+ *
+ * This function extracts keys from both old_datum and new_datum using the
+ * opclass's extractValue function, then compares the extracted key arrays.
+ * Returns true if the key sets are identical (same keys, same counts).
+ *
+ * This enables HOT updates for GIN indexes when the indexed portions of a
+ * value haven't changed, even if the value itself has changed.
+ *
+ * Example: JSONB column with GIN index. If an update changes a non-indexed
+ * key in the JSONB document, the extracted keys are identical and we can
+ * do a HOT update.
+ */
+bool
+gincomparedatums(Relation index, int attnum,
+ Datum old_datum, bool old_isnull,
+ Datum new_datum, bool new_isnull)
+{
+ GinState ginstate;
+ Datum *old_keys;
+ Datum *new_keys;
+ GinNullCategory *old_categories;
+ GinNullCategory *new_categories;
+ int32 old_nkeys;
+ int32 new_nkeys;
+ MemoryContext tmpcontext;
+ MemoryContext oldcontext;
+ bool result = true;
+
+ /* Handle NULL cases */
+ if (old_isnull != new_isnull)
+ return false;
+ if (old_isnull)
+ return true;
+
+ /* Create temporary context for extraction work */
+ tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "GIN datum comparison",
+ ALLOCSET_DEFAULT_SIZES);
+ oldcontext = MemoryContextSwitchTo(tmpcontext);
+
+ initGinState(&ginstate, index);
+
+ /* Extract keys from both datums using existing GIN infrastructure */
+ old_keys = ginExtractEntries(&ginstate, attnum, old_datum, old_isnull,
+ &old_nkeys, &old_categories);
+ new_keys = ginExtractEntries(&ginstate, attnum, new_datum, new_isnull,
+ &new_nkeys, &new_categories);
+
+ /* Different number of keys means definitely different */
+ if (old_nkeys != new_nkeys)
+ {
+ result = false;
+ goto cleanup;
+ }
+
+ /*
+ * Compare the sorted key arrays element-by-element. Since both arrays
+ * are already sorted by ginExtractEntries, we can do a simple O(n)
+ * comparison.
+ */
+ for (int i = 0; i < old_nkeys; i++)
+ {
+ if (ginCompareEntries(&ginstate, attnum,
+ old_keys[i], old_categories[i],
+ new_keys[i], new_categories[i]) != 0)
+ {
+ result = false;
+ break;
+ }
+ }
+
+cleanup:
+ /* Clean up */
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(tmpcontext);
+
+ return result;
+}
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index dfffce3e39660..b231009490d68 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -112,6 +112,7 @@ gisthandler(PG_FUNCTION_ARGS)
.amparallelrescan = NULL,
.amtranslatestrategy = NULL,
.amtranslatecmptype = gisttranslatecmptype,
+ .amcomparedatums = NULL,
};
PG_RETURN_POINTER(&amroutine);
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index e88ddb32a054c..65111b72d9818 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -111,6 +111,7 @@ hashhandler(PG_FUNCTION_ARGS)
.amparallelrescan = NULL,
.amtranslatestrategy = hashtranslatestrategy,
.amtranslatecmptype = hashtranslatecmptype,
+ .amcomparedatums = NULL,
};
PG_RETURN_POINTER(&amroutine);
diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
index 74e407f375aad..d306b709c797a 100644
--- a/src/backend/access/heap/README.HOT
+++ b/src/backend/access/heap/README.HOT
@@ -156,6 +156,117 @@ all summarizing indexes. (Realistically, we only need to propagate the
update to the indexes that contain the updated values, but that is yet to
be implemented.)
+
+Expression Index Sub-Attribute Tracking
+----------------------------------
+
+For expression indexes on structured types (JSONB, XML), PostgreSQL can
+track modifications at a finer granularity than whole-column changes. When
+an indexed column contains structured data and indexes reference specific
+sub-attributes (e.g., JSONB paths like data->'status' or XML XPath
+expressions like xpath('/doc/title', data)), the system can determine if
+only non-indexed sub-attributes were modified.
+
+This enables HOT updates even when the column's binary representation
+changes, as long as no indexed sub-attributes were modified. For example:
+
+ CREATE TABLE t (id int PRIMARY KEY, data jsonb);
+ CREATE INDEX idx ON t((data->'status'));
+
+ -- This is HOT-eligible even though 'data' column changes:
+ UPDATE t SET data = jsonb_set(data, '{count}', '42') WHERE id = 1;
+
+ -- Because only the non-indexed 'count' field was modified.
+
+Types implement sub-attribute tracking via three catalog mechanisms:
+
+1. typidxextract (pg_type column): Function to extract indexed sub-attribute
+ descriptors from expression index definitions. Called at relcache build
+ time to identify which sub-attributes are indexed.
+
+2. typidxcompare (pg_type column): Function to compare old and new values at
+ specific indexed sub-attributes, returning true if any indexed sub-attribute
+ changed. This is the fallback comparison path.
+
+3. prosubattrmutator (pg_proc column): Marks mutation functions (like
+ jsonb_set) that can report modifications via slot_add_modified_idx_attr()
+ when provided a SubpathTrackingContext. This is the instrumented fast path
+ that avoids re-comparing entire values.
+
+The executor creates a SubpathTrackingContext when processing UPDATE
+operations on tables with expression indexes on types that support sub-attribute
+tracking. Mutation functions mark which indexed sub-attributes they modified,
+and the executor uses this information to determine HOT eligibility.
+
+If instrumented tracking is unavailable (e.g., direct assignment rather than
+function call), the system falls back to calling typidxcompare on each
+indexed expression.
+
+This optimization is controlled by the enable_subattr_hot GUC (default on).
+When disabled, sub-attribute granularity tracking is not performed and the
+system falls back to whole-column comparison.
+
+
+Determining Modified Indexed Attributes
+----------------------------------------
+
+Prior to PostgreSQL 19, the determination of which indexed attributes were
+modified during an UPDATE was performed inside heap_update() under buffer
+lock by HeapDetermineColumnsInfo(). This had two limitations:
+
+1. The work was done while holding an exclusive buffer lock, increasing
+ contention.
+2. The logic was heap-specific, making it difficult to share with other
+ table access methods.
+
+Now, this determination is performed in the executor by
+ExecUpdateModifiedIdxAttrs() before calling table_tuple_update(). This
+function:
+
+1. Compares old and new tuple slots to identify which attributes changed
+ (using ExecCompareSlotAttrs)
+2. Intersects changed attributes with indexed attributes to determine
+ modified_idx_attrs
+3. For attributes with expression indexes on subattr-tracked types, applies
+ fine-grained comparison using the type's tracking mechanisms
+
+This moves the work outside the buffer lock and makes it table-AM-agnostic.
+The heap AM receives the modified_idx_attrs bitmapset and uses it to
+determine HOT eligibility.
+
+For non-executor paths (e.g., catalog updates via simple_heap_update), the
+heap AM still performs this determination internally using
+HeapUpdateModifiedIdxAttrs(), which provides equivalent functionality.
+
+
+Per-Index Update Tracking
+-------------------------
+
+After the table AM performs the update, the executor determines which
+indexes need new entries using per-index tracking rather than a single
+global enum.
+
+The table AM communicates whether a HOT update occurred by setting (or not)
+the MODIFIED_IDX_ATTRS_ALL_IDX sentinel bit (bit 0) in the modified_idx_attrs
+bitmapset. When this bit is set, the update was non-HOT and all indexes
+require new entries (because the tuple has a new TID). When the bit is not
+set, the update was HOT and only summarizing indexes whose columns changed
+need new entries.
+
+The executor then calls ExecSetIndexUnchanged() to populate the per-index
+ii_IndexUnchanged flag on each IndexInfo. This flag indicates whether each
+index's key values are unchanged by the update. For non-HOT updates, even
+"unchanged" indexes must get new entries (new TID), but the indexUnchanged
+hint is passed to the index AM's aminsert callback to enable optimizations
+such as bottom-up deletion of logically-equivalent duplicate entries.
+
+The EIIT_ALL_INDEXES flag is passed to ExecInsertIndexTuples() to indicate
+whether all indexes need entries (non-HOT) or only summarizing indexes (HOT).
+This replaces the previous TU_UpdateIndexes enum (TU_None/TU_All/TU_Summarizing)
+with a cleaner separation between the table AM (which determines HOT
+eligibility) and the executor (which determines per-index behavior).
+
+
Abort Cases
-----------
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 8f1c11a93500d..19c64ba7b5d18 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -37,14 +37,21 @@
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/syncscan.h"
+#include "access/sysattr.h"
+#include "access/tableam.h"
#include "access/valid.h"
#include "access/visibilitymap.h"
#include "access/xloginsert.h"
#include "catalog/pg_database.h"
#include "catalog/pg_database_d.h"
#include "commands/vacuum.h"
+#include "executor/execMutation.h"
+#include "executor/tuptable.h"
+#include "optimizer/cost.h"
+#include "nodes/lockoptions.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
+#include "storage/buf.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/proc.h"
@@ -52,6 +59,7 @@
#include "utils/datum.h"
#include "utils/injection_point.h"
#include "utils/inval.h"
+#include "utils/relcache.h"
#include "utils/spccache.h"
#include "utils/syscache.h"
@@ -68,11 +76,8 @@ static void check_lock_if_inplace_updateable_rel(Relation relation,
HeapTuple newtup);
static void check_inplace_rel_lock(HeapTuple oldtup);
#endif
-static Bitmapset *HeapDetermineColumnsInfo(Relation relation,
- Bitmapset *interesting_cols,
- Bitmapset *external_cols,
- HeapTuple oldtup, HeapTuple newtup,
- bool *has_external);
+static Bitmapset *HeapUpdateModifiedIdxAttrs(Relation relation,
+ HeapTuple oldtup, HeapTuple newtup);
static bool heap_acquire_tuplock(Relation relation, const ItemPointerData *tid,
LockTupleMode mode, LockWaitPolicy wait_policy,
bool *have_tuple_lock);
@@ -3302,7 +3307,7 @@ simple_heap_delete(Relation relation, const ItemPointerData *tid)
* heap_update - replace a tuple
*
* See table_tuple_update() for an explanation of the parameters, except that
- * this routine directly takes a tuple rather than a slot.
+ * this routine directly takes a heap tuple rather than a slot.
*
* In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
* t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
@@ -3312,17 +3317,13 @@ simple_heap_delete(Relation relation, const ItemPointerData *tid)
TM_Result
heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
CommandId cid, Snapshot crosscheck, bool wait,
- TM_FailureData *tmfd, LockTupleMode *lockmode,
- TU_UpdateIndexes *update_indexes)
+ TM_FailureData *tmfd, const LockTupleMode lockmode,
+ const Bitmapset *modified_idx_attrs, const bool hot_allowed)
{
TM_Result result;
TransactionId xid = GetCurrentTransactionId();
- Bitmapset *hot_attrs;
- Bitmapset *sum_attrs;
- Bitmapset *key_attrs;
- Bitmapset *id_attrs;
- Bitmapset *interesting_attrs;
- Bitmapset *modified_attrs;
+ Bitmapset *idx_attrs,
+ *rid_attrs;
ItemId lp;
HeapTupleData oldtup;
HeapTuple heaptup;
@@ -3341,13 +3342,12 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
bool have_tuple_lock = false;
bool iscombo;
bool use_hot_update = false;
- bool summarized_update = false;
bool key_intact;
bool all_visible_cleared = false;
bool all_visible_cleared_new = false;
bool checked_lockers;
bool locker_remains;
- bool id_has_external = false;
+ bool rep_id_key_required = false;
TransactionId xmax_new_tuple,
xmax_old_tuple;
uint16 infomask_old_tuple,
@@ -3378,33 +3378,14 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
#endif
/*
- * Fetch the list of attributes to be checked for various operations.
- *
- * For HOT considerations, this is wasted effort if we fail to update or
- * have to put the new tuple on a different page. But we must compute the
- * list before obtaining buffer lock --- in the worst case, if we are
- * doing an update on one of the relevant system catalogs, we could
- * deadlock if we try to fetch the list later. In any case, the relcache
- * caches the data so this is usually pretty cheap.
- *
- * We also need columns used by the replica identity and columns that are
- * considered the "key" of rows in the table.
+ * Fetch the attributes used across all indexes on this relation as well
+ * as the replica identity columns.
*
- * Note that we get copies of each bitmap, so we need not worry about
- * relcache flush happening midway through.
- */
- hot_attrs = RelationGetIndexAttrBitmap(relation,
- INDEX_ATTR_BITMAP_HOT_BLOCKING);
- sum_attrs = RelationGetIndexAttrBitmap(relation,
- INDEX_ATTR_BITMAP_SUMMARIZED);
- key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
- id_attrs = RelationGetIndexAttrBitmap(relation,
- INDEX_ATTR_BITMAP_IDENTITY_KEY);
- interesting_attrs = NULL;
- interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
- interesting_attrs = bms_add_members(interesting_attrs, sum_attrs);
- interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
- interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
+ * NOTE: relcache returns copies of each bitmap, so we need not worry
+ * about relcache flush happening midway through.
+ */
+ idx_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_INDEXED);
+ rid_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_IDENTITY_KEY);
block = ItemPointerGetBlockNumber(otid);
INJECTION_POINT("heap_update-before-pin", NULL);
@@ -3458,20 +3439,17 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
tmfd->ctid = *otid;
tmfd->xmax = InvalidTransactionId;
tmfd->cmax = InvalidCommandId;
- *update_indexes = TU_None;
- bms_free(hot_attrs);
- bms_free(sum_attrs);
- bms_free(key_attrs);
- bms_free(id_attrs);
- /* modified_attrs not yet initialized */
- bms_free(interesting_attrs);
+ bms_free(rid_attrs);
+ bms_free(idx_attrs);
+ /* modified_idx_attrs is owned by the caller, don't free it */
+
return TM_Deleted;
}
/*
- * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
- * properly.
+ * Fill in enough data in oldtup to determine replica identity attribute
+ * requirements.
*/
oldtup.t_tableOid = RelationGetRelid(relation);
oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
@@ -3482,16 +3460,59 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
newtup->t_tableOid = RelationGetRelid(relation);
/*
- * Determine columns modified by the update. Additionally, identify
- * whether any of the unmodified replica identity key attributes in the
- * old tuple is externally stored or not. This is required because for
- * such attributes the flattened value won't be WAL logged as part of the
- * new tuple so we must include it as part of the old_key_tuple. See
- * ExtractReplicaIdentity.
+ * ExtractReplicaIdentity() needs to know if a modified indexed attribute
+ * is used as a replica identity or if any of the replica identity
+ * attributes are referenced in an index, unmodified, and are stored
+ * externally in the old tuple being replaced. In those cases it may be
+ * necessary to WAL log them so that they are available to replicas.
*/
- modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs,
- id_attrs, &oldtup,
- newtup, &id_has_external);
+ rep_id_key_required = bms_overlap(modified_idx_attrs, rid_attrs);
+ if (!rep_id_key_required)
+ {
+ Bitmapset *attrs;
+ TupleDesc tupdesc = RelationGetDescr(relation);
+ int attidx = -1;
+
+ /*
+ * Reduce the set under review to only the unmodified indexed replica
+ * identity key attributes. idx_attrs is copied (by bms_difference())
+ * not modified here.
+ */
+ attrs = bms_difference(idx_attrs, modified_idx_attrs);
+ attrs = bms_int_members(attrs, rid_attrs);
+
+ while ((attidx = bms_next_member(attrs, attidx)) >= 0)
+ {
+ /*
+ * attidx is zero-based, attrnum is the normal attribute number
+ */
+ AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
+ Datum value;
+ bool isnull;
+
+ /*
+ * System attributes are not added into INDEX_ATTR_BITMAP_INDEXED
+ * bitmap by relcache.
+ */
+ Assert(attrnum > 0);
+
+ value = heap_getattr(&oldtup, attrnum, tupdesc, &isnull);
+
+ /* No need to check attributes that can't be stored externally */
+ if (isnull ||
+ TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
+ continue;
+
+ /* Check if the old tuple's attribute is stored externally */
+ if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value)))
+ {
+ rep_id_key_required = true;
+ break;
+ }
+ }
+
+ bms_free(attrs);
+ }
/*
* If we're not updating any "key" column, we can grab a weaker lock type.
@@ -3504,9 +3525,8 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
* is updates that don't manipulate key columns, not those that
* serendipitously arrive at the same key values.
*/
- if (!bms_overlap(modified_attrs, key_attrs))
+ if (lockmode == LockTupleNoKeyExclusive)
{
- *lockmode = LockTupleNoKeyExclusive;
mxact_status = MultiXactStatusNoKeyUpdate;
key_intact = true;
@@ -3523,7 +3543,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
}
else
{
- *lockmode = LockTupleExclusive;
+ Assert(lockmode == LockTupleExclusive);
mxact_status = MultiXactStatusUpdate;
key_intact = false;
}
@@ -3534,7 +3554,6 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
* with the new tuple's location, so there's great risk of confusion if we
* use otid anymore.
*/
-
l2:
checked_lockers = false;
locker_remains = false;
@@ -3602,7 +3621,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
bool current_is_member = false;
if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
- *lockmode, ¤t_is_member))
+ lockmode, ¤t_is_member))
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -3611,7 +3630,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
* requesting a lock and already have one; avoids deadlock).
*/
if (!current_is_member)
- heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
+ heap_acquire_tuplock(relation, &(oldtup.t_self), lockmode,
LockWaitBlock, &have_tuple_lock);
/* wait for multixact */
@@ -3696,7 +3715,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
* lock.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
- heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
+ heap_acquire_tuplock(relation, &(oldtup.t_self), lockmode,
LockWaitBlock, &have_tuple_lock);
XactLockTableWait(xwait, relation, &oldtup.t_self,
XLTW_Update);
@@ -3756,17 +3775,14 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
tmfd->cmax = InvalidCommandId;
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
- UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
+ UnlockTupleTuplock(relation, &(oldtup.t_self), lockmode);
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
- *update_indexes = TU_None;
- bms_free(hot_attrs);
- bms_free(sum_attrs);
- bms_free(key_attrs);
- bms_free(id_attrs);
- bms_free(modified_attrs);
- bms_free(interesting_attrs);
+ bms_free(rid_attrs);
+ bms_free(idx_attrs);
+ /* modified_idx_attrs is owned by the caller, don't free it */
+
return result;
}
@@ -3796,7 +3812,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
oldtup.t_data->t_infomask,
oldtup.t_data->t_infomask2,
- xid, *lockmode, true,
+ xid, lockmode, true,
&xmax_old_tuple, &infomask_old_tuple,
&infomask2_old_tuple);
@@ -3913,7 +3929,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
oldtup.t_data->t_infomask,
oldtup.t_data->t_infomask2,
- xid, *lockmode, false,
+ xid, lockmode, false,
&xmax_lock_old_tuple, &infomask_lock_old_tuple,
&infomask2_lock_old_tuple);
@@ -4073,37 +4089,19 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
/*
* At this point newbuf and buffer are both pinned and locked, and newbuf
- * has enough space for the new tuple. If they are the same buffer, only
- * one pin is held.
+ * has enough space for the new tuple so we can use the HOT update path if
+ * the caller determined that it is allowable.
+ *
+ * NOTE: If newbuf == buffer then only one pin is held.
*/
-
if (newbuf == buffer)
{
- /*
- * Since the new tuple is going into the same page, we might be able
- * to do a HOT update. Check if any of the index columns have been
- * changed.
- */
- if (!bms_overlap(modified_attrs, hot_attrs))
- {
+ if (hot_allowed)
use_hot_update = true;
-
- /*
- * If none of the columns that are used in hot-blocking indexes
- * were updated, we can apply HOT, but we do still need to check
- * if we need to update the summarizing indexes, and update those
- * indexes if the columns were updated, or we may fail to detect
- * e.g. value bound changes in BRIN minmax indexes.
- */
- if (bms_overlap(modified_attrs, sum_attrs))
- summarized_update = true;
- }
}
else
- {
/* Set a hint that the old page could use prune/defrag */
PageSetFull(page);
- }
/*
* Compute replica identity tuple before entering the critical section so
@@ -4113,8 +4111,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
* columns are modified or it has external data.
*/
old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
- bms_overlap(modified_attrs, id_attrs) ||
- id_has_external,
+ rep_id_key_required,
&old_key_copied);
/* NO EREPORT(ERROR) from here till changes are logged */
@@ -4243,7 +4240,7 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
* Release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
- UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
+ UnlockTupleTuplock(relation, &(oldtup.t_self), lockmode);
pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
@@ -4257,31 +4254,12 @@ heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
heap_freetuple(heaptup);
}
- /*
- * If it is a HOT update, the update may still need to update summarized
- * indexes, lest we fail to update those summaries and get incorrect
- * results (for example, minmax bounds of the block may change with this
- * update).
- */
- if (use_hot_update)
- {
- if (summarized_update)
- *update_indexes = TU_Summarizing;
- else
- *update_indexes = TU_None;
- }
- else
- *update_indexes = TU_All;
-
if (old_key_tuple != NULL && old_key_copied)
heap_freetuple(old_key_tuple);
- bms_free(hot_attrs);
- bms_free(sum_attrs);
- bms_free(key_attrs);
- bms_free(id_attrs);
- bms_free(modified_attrs);
- bms_free(interesting_attrs);
+ bms_free(rid_attrs);
+ bms_free(idx_attrs);
+ /* modified_idx_attrs is owned by the caller, don't free it */
return TM_Ok;
}
@@ -4454,28 +4432,110 @@ heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
}
/*
- * Check which columns are being updated.
- *
- * Given an updated tuple, determine (and return into the output bitmapset),
- * from those listed as interesting, the set of columns that changed.
- *
- * has_external indicates if any of the unmodified attributes (from those
- * listed as interesting) of the old tuple is a member of external_cols and is
- * stored externally.
+ * HOT updates are possible when either: a) there are no modified indexed
+ * attributes, or b) the modified attributes are all on summarizing indexes.
+ * Later, in heap_update(), we can choose to perform a HOT update if there is
+ * space on the page for the new tuple and the following code has determined
+ * that HOT is allowed.
+ */
+bool
+HeapUpdateHotAllowable(Relation relation, const Bitmapset *modified_idx_attrs)
+{
+ bool hot_allowed;
+
+ /*
+ * Let's be optimistic and start off by assuming the best case: no indexes
+ * need updating and HOT is allowable.
+ */
+ hot_allowed = true;
+
+ /*
+ * Check for case (a); when there are no modified index attributes HOT is
+ * allowed.
+ */
+ if (bms_is_empty(modified_idx_attrs))
+ hot_allowed = true;
+ else
+ {
+ Bitmapset *sum_attrs = RelationGetIndexAttrBitmap(relation,
+ INDEX_ATTR_BITMAP_SUMMARIZED);
+
+ /*
+ * At least one index attribute was modified, but is this case (b)
+ * where all the modified index attributes are only used by
+ * summarizing indexes? If that's the case we need to update those
+ * indexes, but this can be a HOT update.
+ */
+ if (bms_is_subset(modified_idx_attrs, sum_attrs))
+ {
+ hot_allowed = true;
+ }
+ else
+ {
+ /*
+ * Now we know that one or more indexed attributes were updated and
+ * that at least one of those attributes is referenced by a
+ * non-summarizing index. HOT is not allowed.
+ */
+ hot_allowed = false;
+ }
+
+ bms_free(sum_attrs);
+ }
+
+ return hot_allowed;
+}
+
+/*
+ * If we're not updating any "key" attributes, we can grab a weaker lock type.
+ * This allows for more concurrency when we are running simultaneously with
+ * foreign key checks.
+ */
+LockTupleMode
+HeapUpdateDetermineLockmode(Relation relation, const Bitmapset *modified_idx_attrs)
+{
+ LockTupleMode lockmode = LockTupleExclusive;
+
+ Bitmapset *key_attrs = RelationGetIndexAttrBitmap(relation,
+ INDEX_ATTR_BITMAP_KEY);
+
+ if (!bms_overlap(modified_idx_attrs, key_attrs))
+ lockmode = LockTupleNoKeyExclusive;
+
+ bms_free(key_attrs);
+
+ return lockmode;
+}
+
+/*
+ * Return a Bitmapset that contains the set of modified (changed) indexed
+ * attributes between oldtup and newtup.
*/
static Bitmapset *
-HeapDetermineColumnsInfo(Relation relation,
- Bitmapset *interesting_cols,
- Bitmapset *external_cols,
- HeapTuple oldtup, HeapTuple newtup,
- bool *has_external)
+HeapUpdateModifiedIdxAttrs(Relation relation, HeapTuple oldtup, HeapTuple newtup)
{
int attidx;
- Bitmapset *modified = NULL;
+ Bitmapset *attrs,
+ *modified_idx_attrs = NULL;
TupleDesc tupdesc = RelationGetDescr(relation);
+ /* Get the set of all attributes across all indexes for this relation */
+ attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_INDEXED);
+
+ /* No indexed attributes, we're done */
+ if (bms_is_empty(attrs))
+ return NULL;
+
+ /*
+ * This heap update function is used outside the executor, so unlike
+ * heapam_tuple_update() — where ResultRelInfo and EState provide the
+ * concise set of attributes that might have been modified (via
+ * ExecGetAllUpdatedCols()) — we simply check all indexed attributes to
+ * find the subset that changed value. That's the "modified indexed
+ * attributes" or "modified_idx_attrs".
+ */
attidx = -1;
- while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
+ while ((attidx = bms_next_member(attrs, attidx)) >= 0)
{
/* attidx is zero-based, attrnum is the normal attribute number */
AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
@@ -4491,7 +4551,7 @@ HeapDetermineColumnsInfo(Relation relation,
*/
if (attrnum == 0)
{
- modified = bms_add_member(modified, attidx);
+ modified_idx_attrs = bms_add_member(modified_idx_attrs, attidx);
continue;
}
@@ -4504,7 +4564,7 @@ HeapDetermineColumnsInfo(Relation relation,
{
if (attrnum != TableOidAttributeNumber)
{
- modified = bms_add_member(modified, attidx);
+ modified_idx_attrs = bms_add_member(modified_idx_attrs, attidx);
continue;
}
}
@@ -4520,29 +4580,12 @@ HeapDetermineColumnsInfo(Relation relation,
if (!heap_attr_equals(tupdesc, attrnum, value1,
value2, isnull1, isnull2))
- {
- modified = bms_add_member(modified, attidx);
- continue;
- }
-
- /*
- * No need to check attributes that can't be stored externally. Note
- * that system attributes can't be stored externally.
- */
- if (attrnum < 0 || isnull1 ||
- TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
- continue;
-
- /*
- * Check if the old tuple's attribute is stored externally and is a
- * member of external_cols.
- */
- if (VARATT_IS_EXTERNAL((varlena *) DatumGetPointer(value1)) &&
- bms_is_member(attidx, external_cols))
- *has_external = true;
+ modified_idx_attrs = bms_add_member(modified_idx_attrs, attidx);
}
- return modified;
+ bms_free(attrs);
+
+ return modified_idx_attrs;
}
/*
@@ -4554,17 +4597,98 @@ HeapDetermineColumnsInfo(Relation relation,
* via ereport().
*/
void
-simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup,
- TU_UpdateIndexes *update_indexes)
+simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tuple,
+ Bitmapset **modified_idx_attrs)
{
TM_Result result;
TM_FailureData tmfd;
LockTupleMode lockmode;
+ TupleTableSlot *slot;
+ BufferHeapTupleTableSlot *bslot;
+ HeapTuple oldtup;
+ bool shouldFree = true;
+ Bitmapset *idx_attrs;
+ Bitmapset *local_modified_idx_attrs;
+ bool hot_allowed;
+ Buffer buffer;
+
+ Assert(ItemPointerIsValid(otid));
+
+ /*
+ * Fetch this bitmap of interesting attributes from relcache before
+ * obtaining a buffer lock because if we are doing an update on one of the
+ * relevant system catalogs we could deadlock if we try to fetch them
+ * later on. Relcache will return a copy of the bitmap, so we need not
+ * worry about a relcache flush happening midway through this operation.
+ */
+ idx_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_INDEXED);
+
+ INJECTION_POINT("heap_update-before-pin", NULL);
+
+ /*
+ * To update a heap tuple we need to find the set of modified indexed
+ * attributes ("modified_idx_attrs") so as to see if a HOT update is
+ * allowable or not. When updating heap tuples via execution of UPDATE
+ * statements this set is constructed before calling into the table AM's
+ * tuple_update() function by the function ExecUpdateModifiedIdxAttrs()
+ * which compares the old/new TupleTableSlots. However, here we have the
+ * old TID and the new tuple, not two TupleTableSlots, but we still need
+ * to construct a similar bitmap so as to be able to know if HOT updates
+ * are allowed or not. To do that we first have to fetch the old tuple
+ * itself. Because heapam_fetch_row_version() is static, we have to
+ * replicate that code here. This is a bit repetitive because
+ * heap_update() will again find and form the old HeapTuple from the old
+ * TID and in most cases the callers (ignoring extensions, always catalog
+ * tuple updates) already had the set of changed attributes (e.g. the
+ * "replaces" array), but for now this minor repetition of work is
+ * necessary.
+ */
+
+ slot = MakeTupleTableSlot(RelationGetDescr(relation), &TTSOpsBufferHeapTuple);
+ bslot = (BufferHeapTupleTableSlot *) slot;
+
+ /*
+ * Set the TID in the slot and then fetch the old tuple so we can examine
+ * it
+ */
+ bslot->base.tupdata.t_self = *otid;
+ if (!heap_fetch(relation, SnapshotAny, &bslot->base.tupdata, &buffer, false))
+ {
+ /*
+ * heap_update() checks for !ItemIdIsNormal(lp) and will return false
+ * in those cases.
+ */
+ Assert(RelationSupportsSysCache(RelationGetRelid(relation)));
+
+ /* modified_idx_attrs not yet initialized */
+ bms_free(idx_attrs);
+ ExecDropSingleTupleTableSlot(slot);
+
+ elog(ERROR, "tuple concurrently deleted");
+
+ return;
+ }
+
+ Assert(buffer != InvalidBuffer);
+
+ /* Store in slot, transferring existing pin */
+ ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
+ oldtup = ExecFetchSlotHeapTuple(slot, false, &shouldFree);
+
+ local_modified_idx_attrs = HeapUpdateModifiedIdxAttrs(relation, oldtup, tuple);
+ lockmode = HeapUpdateDetermineLockmode(relation, local_modified_idx_attrs);
+ hot_allowed = HeapUpdateHotAllowable(relation, local_modified_idx_attrs);
+
+ result = heap_update(relation, otid, tuple, GetCurrentCommandId(true),
+ InvalidSnapshot, true /* wait for commit */ ,
+ &tmfd, lockmode, local_modified_idx_attrs, hot_allowed);
+
+ if (shouldFree)
+ heap_freetuple(oldtup);
+
+ ExecDropSingleTupleTableSlot(slot);
+ bms_free(idx_attrs);
- result = heap_update(relation, otid, tup,
- GetCurrentCommandId(true), InvalidSnapshot,
- true /* wait for commit */ ,
- &tmfd, &lockmode, update_indexes);
switch (result)
{
case TM_SelfModified:
@@ -4573,7 +4697,15 @@ simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup
break;
case TM_Ok:
- /* done successfully */
+ /*
+ * If the tuple returned from heap_update() is marked heap-only,
+ * this was a HOT update and no non-summarizing indexes need
+ * updating. Otherwise, set the sentinel bit so the caller knows
+ * all indexes need updating.
+ */
+ if (!HeapTupleIsHeapOnly(tuple))
+ local_modified_idx_attrs = bms_add_member(local_modified_idx_attrs,
+ MODIFIED_IDX_ATTRS_ALL_IDX);
break;
case TM_Updated:
@@ -4588,8 +4720,9 @@ simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup
elog(ERROR, "unrecognized heap_update status: %u", result);
break;
}
-}
+ *modified_idx_attrs = local_modified_idx_attrs;
+}
/*
* Return the MultiXactStatus corresponding to the given tuple lock mode.
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 5137d2510ea4c..5f7fa6a77d7dc 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -27,7 +27,6 @@
#include "access/syncscan.h"
#include "access/tableam.h"
#include "access/tsmapi.h"
-#include "access/visibilitymap.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/index.h"
@@ -44,6 +43,7 @@
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
+#include "utils/injection_point.h"
#include "utils/rel.h"
static void reform_and_rewrite_tuple(HeapTuple tuple,
@@ -316,41 +316,41 @@ heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
static TM_Result
heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
CommandId cid, Snapshot snapshot, Snapshot crosscheck,
- bool wait, TM_FailureData *tmfd,
- LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
+ bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode,
+ Bitmapset **modified_idx_attrs)
{
bool shouldFree = true;
HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
+ bool hot_allowed;
TM_Result result;
+ Assert(ItemPointerIsValid(otid));
+
+ hot_allowed = HeapUpdateHotAllowable(relation, *modified_idx_attrs);
+ *lockmode = HeapUpdateDetermineLockmode(relation, *modified_idx_attrs);
+
/* Update the tuple with table oid */
slot->tts_tableOid = RelationGetRelid(relation);
tuple->t_tableOid = slot->tts_tableOid;
result = heap_update(relation, otid, tuple, cid, crosscheck, wait,
- tmfd, lockmode, update_indexes);
+ tmfd, *lockmode, *modified_idx_attrs, hot_allowed);
ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
/*
- * Decide whether new index entries are needed for the tuple
+ * Decide whether new index entries are needed for the tuple.
*
* Note: heap_update returns the tid (location) of the new tuple in the
* t_self field.
*
- * If the update is not HOT, we must update all indexes. If the update is
- * HOT, it could be that we updated summarized columns, so we either
- * update only summarized indexes, or none at all.
+ * If the tuple returned from heap_update() is marked heap-only, this was
+ * a HOT update and no non-summarizing indexes need updating. Otherwise,
+ * set the MODIFIED_IDX_ATTRS_ALL_IDX sentinel bit so the executor knows
+ * all indexes need updating.
*/
- if (result != TM_Ok)
- {
- Assert(*update_indexes == TU_None);
- *update_indexes = TU_None;
- }
- else if (!HeapTupleIsHeapOnly(tuple))
- Assert(*update_indexes == TU_All);
- else
- Assert((*update_indexes == TU_Summarizing) ||
- (*update_indexes == TU_None));
+ if (result == TM_Ok && !HeapTupleIsHeapOnly(tuple))
+ *modified_idx_attrs = bms_add_member(*modified_idx_attrs,
+ MODIFIED_IDX_ATTRS_ALL_IDX);
if (shouldFree)
pfree(tuple);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 6d0a6f27f3f2e..54db4c68c36a0 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -170,6 +170,7 @@ bthandler(PG_FUNCTION_ARGS)
.amparallelrescan = btparallelrescan,
.amtranslatestrategy = bttranslatestrategy,
.amtranslatecmptype = bttranslatecmptype,
+ .amcomparedatums = NULL,
};
PG_RETURN_POINTER(&amroutine);
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 9f5379b87acbf..c2bb8d063c9f3 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -97,6 +97,7 @@ spghandler(PG_FUNCTION_ARGS)
.amparallelrescan = NULL,
.amtranslatestrategy = NULL,
.amtranslatecmptype = NULL,
+ .amcomparedatums = NULL,
};
PG_RETURN_POINTER(&amroutine);
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index dfda1af412ec3..695a232b9f12c 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -359,7 +359,7 @@ void
simple_table_tuple_update(Relation rel, ItemPointer otid,
TupleTableSlot *slot,
Snapshot snapshot,
- TU_UpdateIndexes *update_indexes)
+ Bitmapset **modified_idx_attrs)
{
TM_Result result;
TM_FailureData tmfd;
@@ -369,7 +369,8 @@ simple_table_tuple_update(Relation rel, ItemPointer otid,
GetCurrentCommandId(true),
snapshot, InvalidSnapshot,
true /* wait for commit */ ,
- &tmfd, &lockmode, update_indexes);
+ &tmfd, &lockmode,
+ modified_idx_attrs);
switch (result)
{
diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c
index 0a1a68e064481..4cd394d8e6c85 100644
--- a/src/backend/catalog/indexing.c
+++ b/src/backend/catalog/indexing.c
@@ -18,6 +18,7 @@
#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
+#include "access/tableam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
@@ -73,7 +74,7 @@ CatalogCloseIndexes(CatalogIndexState indstate)
*/
static void
CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple,
- TU_UpdateIndexes updateIndexes)
+ const Bitmapset *modified_idx_attrs)
{
int i;
int numIndexes;
@@ -83,7 +84,16 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple,
IndexInfo **indexInfoArray;
Datum values[INDEX_MAX_KEYS];
bool isnull[INDEX_MAX_KEYS];
- bool onlySummarized = (updateIndexes == TU_Summarizing);
+ bool allIndexes;
+ bool onlySummarized;
+
+ /*
+ * Determine whether all indexes need updating (non-HOT) or only
+ * summarizing indexes (HOT with summarized column changes).
+ */
+ allIndexes = (modified_idx_attrs == NULL) ||
+ bms_is_member(MODIFIED_IDX_ATTRS_ALL_IDX, modified_idx_attrs);
+ onlySummarized = !allIndexes && !bms_is_empty(modified_idx_attrs);
/*
* HOT update does not require index inserts. But with asserts enabled we
@@ -240,7 +250,7 @@ CatalogTupleInsert(Relation heapRel, HeapTuple tup)
simple_heap_insert(heapRel, tup);
- CatalogIndexInsert(indstate, tup, TU_All);
+ CatalogIndexInsert(indstate, tup, NULL);
CatalogCloseIndexes(indstate);
}
@@ -260,7 +270,7 @@ CatalogTupleInsertWithInfo(Relation heapRel, HeapTuple tup,
simple_heap_insert(heapRel, tup);
- CatalogIndexInsert(indstate, tup, TU_All);
+ CatalogIndexInsert(indstate, tup, NULL);
}
/*
@@ -291,7 +301,7 @@ CatalogTuplesMultiInsertWithInfo(Relation heapRel, TupleTableSlot **slot,
tuple = ExecFetchSlotHeapTuple(slot[i], true, &should_free);
tuple->t_tableOid = slot[i]->tts_tableOid;
- CatalogIndexInsert(indstate, tuple, TU_All);
+ CatalogIndexInsert(indstate, tuple, NULL);
if (should_free)
heap_freetuple(tuple);
@@ -313,15 +323,16 @@ void
CatalogTupleUpdate(Relation heapRel, const ItemPointerData *otid, HeapTuple tup)
{
CatalogIndexState indstate;
- TU_UpdateIndexes updateIndexes = TU_All;
+ Bitmapset *modified_idx_attrs = NULL;
CatalogTupleCheckConstraints(heapRel, tup);
indstate = CatalogOpenIndexes(heapRel);
- simple_heap_update(heapRel, otid, tup, &updateIndexes);
+ simple_heap_update(heapRel, otid, tup, &modified_idx_attrs);
- CatalogIndexInsert(indstate, tup, updateIndexes);
+ CatalogIndexInsert(indstate, tup, modified_idx_attrs);
+ bms_free(modified_idx_attrs);
CatalogCloseIndexes(indstate);
}
@@ -337,13 +348,14 @@ void
CatalogTupleUpdateWithInfo(Relation heapRel, const ItemPointerData *otid, HeapTuple tup,
CatalogIndexState indstate)
{
- TU_UpdateIndexes updateIndexes = TU_All;
+ Bitmapset *modified_idx_attrs = NULL;
CatalogTupleCheckConstraints(heapRel, tup);
- simple_heap_update(heapRel, otid, tup, &updateIndexes);
+ simple_heap_update(heapRel, otid, tup, &modified_idx_attrs);
- CatalogIndexInsert(indstate, tup, updateIndexes);
+ CatalogIndexInsert(indstate, tup, modified_idx_attrs);
+ bms_free(modified_idx_attrs);
}
/*
diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c
index c78dcea98c1f8..1f3560b7f86ea 100644
--- a/src/backend/catalog/toasting.c
+++ b/src/backend/catalog/toasting.c
@@ -300,8 +300,6 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid,
indexInfo->ii_Unique = true;
indexInfo->ii_NullsNotDistinct = false;
indexInfo->ii_ReadyForInserts = true;
- indexInfo->ii_CheckedUnchanged = false;
- indexInfo->ii_IndexUnchanged = false;
indexInfo->ii_Concurrent = false;
indexInfo->ii_BrokenHotChain = false;
indexInfo->ii_ParallelWorkers = 0;
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 98d402c0a3be7..bbe077a9ca900 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -2978,6 +2978,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
bool is_merge_update)
{
TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
+ TupleDesc tupdesc = RelationGetDescr(relinfo->ri_RelationDesc);
TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo);
HeapTuple newtuple = NULL;
HeapTuple trigtuple;
@@ -2985,7 +2986,9 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
bool should_free_new = false;
TriggerData LocTriggerData = {0};
int i;
- Bitmapset *updatedCols;
+ Bitmapset *updatedCols = NULL;
+ Bitmapset *remainingCols = NULL;
+ Bitmapset *modifiedCols;
LockTupleMode lockmode;
/* Determine lock mode to use */
@@ -3127,6 +3130,21 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
if (should_free_trig)
heap_freetuple(trigtuple);
+ /*
+ * Before UPDATE triggers may have updated attributes not known to
+ * ExecGetAllUpdatedCols() using heap_modify_tuple() or
+ * heap_modify_tuple_by_cols(). Find and record those now.
+ */
+ remainingCols = bms_add_range(NULL, 1 - FirstLowInvalidHeapAttributeNumber,
+ tupdesc->natts - FirstLowInvalidHeapAttributeNumber);
+ remainingCols = bms_del_members(remainingCols, updatedCols);
+ modifiedCols = ExecCompareSlotAttrs(tupdesc, remainingCols, oldslot, newslot);
+ relinfo->ri_extraUpdatedCols =
+ bms_add_members(relinfo->ri_extraUpdatedCols, modifiedCols);
+
+ bms_free(remainingCols);
+ bms_free(modifiedCols);
+
return true;
}
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index 11118d0ce0250..de469626f6600 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -22,6 +22,7 @@ OBJS = \
execIndexing.o \
execJunk.o \
execMain.o \
+ execMutation.o \
execParallel.o \
execPartition.o \
execProcnode.o \
diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c
index 088eca24021dd..7e22c745194c4 100644
--- a/src/backend/executor/execExpr.c
+++ b/src/backend/executor/execExpr.c
@@ -30,11 +30,13 @@
*/
#include "postgres.h"
+#include "access/htup_details.h"
#include "access/nbtree.h"
#include "catalog/objectaccess.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "executor/execExpr.h"
+#include "executor/execMutation.h"
#include "executor/nodeSubplan.h"
#include "funcapi.h"
#include "jit/jit.h"
@@ -50,6 +52,7 @@
#include "utils/jsonfuncs.h"
#include "utils/jsonpath.h"
#include "utils/lsyscache.h"
+#include "utils/syscache.h"
#include "utils/typcache.h"
@@ -386,6 +389,72 @@ ExecBuildProjectionInfo(List *targetList,
state->parent = parent;
state->ext_params = NULL;
+ /*
+ * If there's a pending SubattrTrackingContext in the EState (set up by
+ * ExecInitModifyTable for UPDATE operations), inject it now so that
+ * JSONB/XML mutation functions can report which indexed subpaths they
+ * modify. This enables HOT updates when only non-indexed subpaths are
+ * modified.
+ */
+ if (parent != NULL && parent->state != NULL &&
+ parent->state->es_pending_subpath_context != NULL)
+ {
+ SubattrTrackingContext *ctx;
+
+ state->es_subattr_context = parent->state->es_pending_subpath_context;
+ ctx = state->es_subattr_context;
+
+ /*
+ * Build resno->attnum mapping. The subplan's targetlist has entries
+ * with resno positions (1, 2, 3...), and we need to map them to the
+ * actual table column numbers (attnums) from updateColnos.
+ *
+ * For a query like "UPDATE t SET col2 = expr", updateColnos contains
+ * [2] and the subplan's targetlist has one non-junk entry with
+ * resno=1. So we map resno 1 -> attnum 2.
+ */
+ if (ctx->updateColnos != NULL && ctx->resno_to_attnum == NULL)
+ {
+ ListCell *lc_tle;
+ int max_resno = 0;
+ int updatecol_idx = 0;
+
+ /* First pass: find max resno */
+ foreach(lc_tle, targetList)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc_tle);
+
+ if (!tle->resjunk && tle->resno > max_resno)
+ max_resno = tle->resno;
+ }
+
+ if (max_resno > 0)
+ {
+ /* Allocate array (indexed by resno-1, so size is max_resno) */
+ ctx->resno_to_attnum = palloc0(max_resno * sizeof(AttrNumber));
+ ctx->max_resno = max_resno;
+
+ /* Second pass: populate mapping */
+ foreach(lc_tle, targetList)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc_tle);
+ AttrNumber attnum;
+
+ if (tle->resjunk)
+ continue;
+
+ /* Get corresponding attnum from updateColnos */
+ if (updatecol_idx < list_length(ctx->updateColnos))
+ {
+ attnum = (AttrNumber) list_nth_int(ctx->updateColnos, updatecol_idx);
+ ctx->resno_to_attnum[tle->resno - 1] = attnum;
+ updatecol_idx++;
+ }
+ }
+ }
+ }
+ }
+
state->resultslot = slot;
/* Insert setup steps as needed */
@@ -479,6 +548,8 @@ ExecBuildProjectionInfo(List *targetList,
}
else
{
+ AttrNumber saved_attnum;
+
/*
* Otherwise, compile the column expression normally.
*
@@ -487,9 +558,20 @@ ExecBuildProjectionInfo(List *targetList,
* matter) can change between executions. We instead evaluate
* into the ExprState's resvalue/resnull and then move.
*/
+
+ /*
+ * Track the target column number during expression compilation so
+ * that instrumented mutation functions (prosubattrmutator=true)
+ * know which column they're modifying.
+ */
+ saved_attnum = state->es_current_target_attnum;
+ state->es_current_target_attnum = tle->resno;
+
ExecInitExprRec(tle->expr, state,
&state->resvalue, &state->resnull);
+ state->es_current_target_attnum = saved_attnum;
+
/*
* Column might be referenced multiple times in upper nodes, so
* force value to R/O - but only if it could be an expanded datum.
@@ -574,6 +656,72 @@ ExecBuildUpdateProjection(List *targetList,
state->parent = parent;
state->ext_params = NULL;
+ /*
+ * If there's a pending SubattrTrackingContext in the EState (set up by
+ * ExecInitModifyTable for UPDATE operations), inject it now so that
+ * JSONB/XML mutation functions can report which indexed subpaths they
+ * modify. This enables HOT updates when only non-indexed subpaths are
+ * modified.
+ */
+ if (parent != NULL && parent->state != NULL &&
+ parent->state->es_pending_subpath_context != NULL)
+ {
+ SubattrTrackingContext *ctx;
+
+ state->es_subattr_context = parent->state->es_pending_subpath_context;
+ ctx = state->es_subattr_context;
+
+ /*
+ * Build resno->attnum mapping. The subplan's targetlist has entries
+ * with resno positions (1, 2, 3...), and we need to map them to the
+ * actual table column numbers (attnums) from targetColnos (which is
+ * the same as updateColnos for UPDATE operations).
+ */
+ if (ctx->updateColnos != NULL && ctx->resno_to_attnum == NULL)
+ {
+ ListCell *lc_tle;
+ int max_resno = 0;
+ int updatecol_idx = 0;
+
+ /* First pass: find max resno */
+ foreach(lc_tle, targetList)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc_tle);
+
+ if (!tle->resjunk && tle->resno > max_resno)
+ max_resno = tle->resno;
+ }
+
+ if (max_resno > 0)
+ {
+ /* Allocate array (indexed by resno-1, so size is max_resno) */
+ ctx->resno_to_attnum = palloc0(max_resno * sizeof(AttrNumber));
+ ctx->max_resno = max_resno;
+
+ /* Second pass: populate mapping */
+ foreach(lc_tle, targetList)
+ {
+ TargetEntry *tle = lfirst_node(TargetEntry, lc_tle);
+ AttrNumber attnum;
+
+ if (tle->resjunk)
+ continue;
+
+ /*
+ * Get corresponding attnum from targetColnos (same as
+ * updateColnos)
+ */
+ if (updatecol_idx < list_length(targetColnos))
+ {
+ attnum = (AttrNumber) list_nth_int(targetColnos, updatecol_idx);
+ ctx->resno_to_attnum[tle->resno - 1] = attnum;
+ updatecol_idx++;
+ }
+ }
+ }
+ }
+ }
+
state->resultslot = slot;
/*
@@ -686,14 +834,30 @@ ExecBuildUpdateProjection(List *targetList,
/* OK, generate code to perform the assignment. */
if (evalTargetList)
{
+ AttrNumber saved_attnum;
+
/*
* We must evaluate the TLE's expression and assign it. We do not
* bother jumping through hoops for "safe" Vars like
* ExecBuildProjectionInfo does; this is a relatively less-used
* path and it doesn't seem worth expending code for that.
*/
+
+ /*
+ * Track the target column number during expression compilation so
+ * that instrumented mutation functions (prosubattrmutator=true)
+ * know which column they're modifying.
+ */
+ saved_attnum = state->es_current_target_attnum;
+ state->es_current_target_attnum = targetattnum;
+ fprintf(stderr, "DEBUG: ExecBuildUpdateProjection: setting es_current_target_attnum=%d for target column\n",
+ targetattnum);
+ fflush(stderr);
+
ExecInitExprRec(tle->expr, state,
&state->resvalue, &state->resnull);
+
+ state->es_current_target_attnum = saved_attnum;
/* Needn't worry about read-only-ness here, either. */
scratch.opcode = EEOP_ASSIGN_TMP;
scratch.d.assign_tmp.resultnum = targetattnum - 1;
@@ -2777,6 +2941,52 @@ ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, Oid funcid,
argno++;
}
+ /*
+ * Check if this function is an instrumented sub-attribute mutator. Only
+ * relevant when the ExprState has a SubattrTrackingContext (i.e., this is
+ * the UPDATE projection for a relation with subpath-eligible indexes).
+ */
+ scratch->d.func.fn_tracks_subpaths = false;
+ scratch->d.func.fn_target_attnum = InvalidAttrNumber;
+
+ if (state->es_subattr_context != NULL)
+ {
+ HeapTuple procTup;
+
+ procTup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid));
+ if (HeapTupleIsValid(procTup))
+ {
+ Form_pg_proc procForm = (Form_pg_proc) GETSTRUCT(procTup);
+
+ if (procForm->prosubattrmutator)
+ {
+ SubattrTrackingContext *ctx = state->es_subattr_context;
+ AttrNumber table_attnum = InvalidAttrNumber;
+
+ /*
+ * Map resno (subplan result position) to table attnum using
+ * the resno_to_attnum mapping populated in
+ * ExecBuildProjectionInfo.
+ *
+ * es_current_target_attnum contains the resno (1-indexed
+ * position in the result tuple), not the actual table column
+ * number.
+ */
+ if (ctx->resno_to_attnum != NULL &&
+ AttributeNumberIsValid(state->es_current_target_attnum) &&
+ state->es_current_target_attnum > 0 &&
+ state->es_current_target_attnum <= ctx->max_resno)
+ {
+ table_attnum = ctx->resno_to_attnum[state->es_current_target_attnum - 1];
+ }
+
+ scratch->d.func.fn_tracks_subpaths = true;
+ scratch->d.func.fn_target_attnum = table_attnum;
+ }
+ ReleaseSysCache(procTup);
+ }
+ }
+
/* Insert appropriate opcode depending on strictness and stats level */
if (pgstat_track_functions <= flinfo->fn_stats)
{
diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c
index 61ff5ddc74c24..f3d35cdf3418e 100644
--- a/src/backend/executor/execExprInterp.c
+++ b/src/backend/executor/execExprInterp.c
@@ -60,6 +60,7 @@
#include "catalog/pg_type.h"
#include "commands/sequence.h"
#include "executor/execExpr.h"
+#include "executor/execMutation.h"
#include "executor/nodeSubplan.h"
#include "funcapi.h"
#include "miscadmin.h"
@@ -921,12 +922,30 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
{
FunctionCallInfo fcinfo = op->d.func.fcinfo_data;
Datum d;
+ Node *saved_context = NULL;
+ bool injected = false;
+
+ /*
+ * For instrumented sub-attribute mutators, inject
+ * SubattrTrackingContext so the function can report which indexed
+ * subpaths it affects.
+ */
+ if (op->d.func.fn_tracks_subpaths && state->es_subattr_context)
+ {
+ saved_context = fcinfo->context;
+ state->es_subattr_context->target_attnum = op->d.func.fn_target_attnum;
+ fcinfo->context = (Node *) state->es_subattr_context;
+ injected = true;
+ }
fcinfo->isnull = false;
d = op->d.func.fn_addr(fcinfo);
*op->resvalue = d;
*op->resnull = fcinfo->isnull;
+ if (injected)
+ fcinfo->context = saved_context;
+
EEO_NEXT();
}
@@ -937,6 +956,8 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
NullableDatum *args = fcinfo->args;
int nargs = op->d.func.nargs;
Datum d;
+ Node *saved_context = NULL;
+ bool injected = false;
Assert(nargs > 2);
@@ -949,11 +970,28 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
goto strictfail;
}
}
+
+ /*
+ * For instrumented sub-attribute mutators, inject
+ * SubattrTrackingContext so the function can report which indexed
+ * subpaths it affects.
+ */
+ if (op->d.func.fn_tracks_subpaths && state->es_subattr_context)
+ {
+ saved_context = fcinfo->context;
+ state->es_subattr_context->target_attnum = op->d.func.fn_target_attnum;
+ fcinfo->context = (Node *) state->es_subattr_context;
+ injected = true;
+ }
+
fcinfo->isnull = false;
d = op->d.func.fn_addr(fcinfo);
*op->resvalue = d;
*op->resnull = fcinfo->isnull;
+ if (injected)
+ fcinfo->context = saved_context;
+
strictfail:
EEO_NEXT();
}
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 9d071e495c64e..205c0dc4eae14 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -106,13 +106,14 @@
*/
#include "postgres.h"
+#include "access/amapi.h"
#include "access/genam.h"
#include "access/relscan.h"
+#include "access/sysattr.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "executor/executor.h"
-#include "nodes/nodeFuncs.h"
#include "storage/lmgr.h"
#include "utils/injection_point.h"
#include "utils/multirangetypes.h"
@@ -139,11 +140,6 @@ static bool check_exclusion_or_unique_constraint(Relation heap, Relation index,
static bool index_recheck_constraint(Relation index, const Oid *constr_procs,
const Datum *existing_values, const bool *existing_isnull,
const Datum *new_values);
-static bool index_unchanged_by_update(ResultRelInfo *resultRelInfo,
- EState *estate, IndexInfo *indexInfo,
- Relation indexRelation);
-static bool index_expression_changed_walker(Node *node,
- Bitmapset *allUpdatedCols);
static void ExecWithoutOverlapsNotEmpty(Relation rel, NameData attname, Datum attval,
char typtype, Oid atttypid);
@@ -269,6 +265,96 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
*/
}
+/* ----------------------------------------------------------------
+ * ExecSetIndexUnchanged
+ *
+ * For each index on the result relation, determine whether the
+ * index values are unchanged by this UPDATE and set the per-index
+ * ii_IndexUnchanged flag accordingly.
+ *
+ * The modified_idx_attrs bitmapset contains the set of indexed
+ * attributes that changed value, using the
+ * FirstLowInvalidHeapAttributeNumber offset convention. The
+ * MODIFIED_IDX_ATTRS_ALL_IDX sentinel bit may be set to indicate
+ * a non-HOT update (the tuple got a new TID), meaning all indexes
+ * must be updated -- but we can still set ii_IndexUnchanged=true
+ * for indexes whose key values didn't change, as a hint to the
+ * index AM for bottom-up deletion optimization.
+ *
+ * For non-summarizing indexes during a HOT update (sentinel bit
+ * not set), the index doesn't need new entries at all, so we
+ * skip them entirely in ExecInsertIndexTuples().
+ * ----------------------------------------------------------------
+ */
+void
+ExecSetIndexUnchanged(ResultRelInfo *resultRelInfo,
+ const Bitmapset *modified_idx_attrs)
+{
+ int i;
+ int numIndices = resultRelInfo->ri_NumIndices;
+ RelationPtr relationDescs = resultRelInfo->ri_IndexRelationDescs;
+ IndexInfo **indexInfoArray = resultRelInfo->ri_IndexRelationInfo;
+
+ for (i = 0; i < numIndices; i++)
+ {
+ Relation indexRelation = relationDescs[i];
+ IndexInfo *indexInfo;
+ bool indexUnchanged;
+ int j;
+
+ if (indexRelation == NULL)
+ continue;
+
+ indexInfo = indexInfoArray[i];
+
+ /*
+ * Assume the index is unchanged until we find evidence to the
+ * contrary.
+ */
+ indexUnchanged = true;
+
+ for (j = 0; j < indexInfo->ii_NumIndexKeyAttrs; j++)
+ {
+ AttrNumber attnum = indexInfo->ii_IndexAttrNumbers[j];
+
+ if (attnum == 0)
+ {
+ /*
+ * Expression index column. We can't easily determine which
+ * table columns it references from IndexInfo alone, so be
+ * conservative: if any indexed column was modified, assume
+ * this expression may have changed too.
+ *
+ * We check for non-empty modified_idx_attrs (ignoring the
+ * sentinel bit) as a proxy.
+ */
+ Bitmapset *attrs_only = bms_del_member(bms_copy(modified_idx_attrs),
+ MODIFIED_IDX_ATTRS_ALL_IDX);
+
+ if (!bms_is_empty(attrs_only))
+ indexUnchanged = false;
+
+ bms_free(attrs_only);
+
+ if (!indexUnchanged)
+ break;
+ }
+ else
+ {
+ int bms_idx = attnum - FirstLowInvalidHeapAttributeNumber;
+
+ if (bms_is_member(bms_idx, modified_idx_attrs))
+ {
+ indexUnchanged = false;
+ break;
+ }
+ }
+ }
+
+ indexInfo->ii_IndexUnchanged = indexUnchanged;
+ }
+}
+
/* ----------------------------------------------------------------
* ExecInsertIndexTuples
*
@@ -276,24 +362,12 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
* into all the relations indexing the result relation
* when a heap tuple is inserted into the result relation.
*
- * When EIIT_IS_UPDATE is set and EIIT_ONLY_SUMMARIZING isn't,
- * executor is performing an UPDATE that could not use an
- * optimization like heapam's HOT (in more general terms a
- * call to table_tuple_update() took place and set
- * 'update_indexes' to TU_All). Receiving this hint makes
- * us consider if we should pass down the 'indexUnchanged'
- * hint in turn. That's something that we figure out for
- * each index_insert() call iff EIIT_IS_UPDATE is set.
- * (When that flag is not set we already know not to pass the
- * hint to any index.)
- *
- * If EIIT_ONLY_SUMMARIZING is set, an equivalent optimization to
- * HOT has been applied and any updated columns are indexed
- * only by summarizing indexes (or in more general terms a
- * call to table_tuple_update() took place and set
- * 'update_indexes' to TU_Summarizing). We can (and must)
- * therefore only update the indexes that have
- * 'amsummarizing' = true.
+ * When EIIT_IS_UPDATE is set, the executor is performing an
+ * UPDATE. The per-index ii_IndexUnchanged flag (populated by
+ * ExecSetIndexUnchanged()) indicates whether each index's key
+ * values are unchanged by this update. When ii_IndexUnchanged
+ * is true, we pass indexUnchanged=true to index_insert() as a
+ * hint for bottom-up deletion optimization.
*
* Unique and exclusion constraints are enforced at the same
* time. This returns a list of index OIDs for any unique or
@@ -358,21 +432,35 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo,
IndexUniqueCheck checkUnique;
bool indexUnchanged;
bool satisfiesConstraint;
+ RelSubattrInfo *subattrinfo;
if (indexRelation == NULL)
continue;
indexInfo = indexInfoArray[i];
+ /* XXX leftover debug scaffolding -- remove the tautological Assert below */
+ subattrinfo = RelationGetIdxSubattrs(indexRelation);
+ Assert(subattrinfo == subattrinfo);
+
/* If the index is marked as read-only, ignore it */
if (!indexInfo->ii_ReadyForInserts)
continue;
/*
- * Skip processing of non-summarizing indexes if we only update
- * summarizing indexes
+ * For UPDATE operations, use the per-index ii_IndexUnchanged flag
+ * (populated by ExecSetIndexUnchanged) to determine behavior.
+ *
+ * For HOT updates (EIIT_IS_UPDATE set, EIIT_ALL_INDEXES not set):
+ * skip non-summarizing indexes entirely since the heap-only tuple
+ * doesn't need new entries in them. Only summarizing indexes with
+ * modified columns get new entries.
+ *
+ * For non-HOT updates (EIIT_ALL_INDEXES set): all indexes get new
+ * entries because the tuple has a new TID.
*/
- if ((flags & EIIT_ONLY_SUMMARIZING) && !indexInfo->ii_Summarizing)
+ if ((flags & EIIT_IS_UPDATE) && !(flags & EIIT_ALL_INDEXES) &&
+ !indexInfo->ii_Summarizing)
continue;
/* Check for partial index */
@@ -437,13 +525,13 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo,
/*
* There's definitely going to be an index_insert() call for this
* index. If we're being called as part of an UPDATE statement,
- * consider if the 'indexUnchanged' = true hint should be passed.
+ * use the per-index ii_IndexUnchanged flag (populated by
+ * ExecSetIndexUnchanged) to hint whether the index values are
+ * unchanged. This helps the index AM optimize for bottom-up
+ * deletion of duplicate index entries.
*/
- indexUnchanged = ((flags & EIIT_IS_UPDATE) &&
- index_unchanged_by_update(resultRelInfo,
- estate,
- indexInfo,
- indexRelation));
+ indexUnchanged = (flags & EIIT_IS_UPDATE) ?
+ indexInfo->ii_IndexUnchanged : false;
satisfiesConstraint =
index_insert(indexRelation, /* index relation */
@@ -998,152 +1086,6 @@ index_recheck_constraint(Relation index, const Oid *constr_procs,
return true;
}
-/*
- * Check if ExecInsertIndexTuples() should pass indexUnchanged hint.
- *
- * When the executor performs an UPDATE that requires a new round of index
- * tuples, determine if we should pass 'indexUnchanged' = true hint for one
- * single index.
- */
-static bool
-index_unchanged_by_update(ResultRelInfo *resultRelInfo, EState *estate,
- IndexInfo *indexInfo, Relation indexRelation)
-{
- Bitmapset *updatedCols;
- Bitmapset *extraUpdatedCols;
- Bitmapset *allUpdatedCols;
- bool hasexpression = false;
- List *idxExprs;
-
- /*
- * Check cache first
- */
- if (indexInfo->ii_CheckedUnchanged)
- return indexInfo->ii_IndexUnchanged;
- indexInfo->ii_CheckedUnchanged = true;
-
- /*
- * Check for indexed attribute overlap with updated columns.
- *
- * Only do this for key columns. A change to a non-key column within an
- * INCLUDE index should not be counted here. Non-key column values are
- * opaque payload state to the index AM, a little like an extra table TID.
- *
- * Note that row-level BEFORE triggers won't affect our behavior, since
- * they don't affect the updatedCols bitmaps generally. It doesn't seem
- * worth the trouble of checking which attributes were changed directly.
- */
- updatedCols = ExecGetUpdatedCols(resultRelInfo, estate);
- extraUpdatedCols = ExecGetExtraUpdatedCols(resultRelInfo, estate);
- for (int attr = 0; attr < indexInfo->ii_NumIndexKeyAttrs; attr++)
- {
- int keycol = indexInfo->ii_IndexAttrNumbers[attr];
-
- if (keycol <= 0)
- {
- /*
- * Skip expressions for now, but remember to deal with them later
- * on
- */
- hasexpression = true;
- continue;
- }
-
- if (bms_is_member(keycol - FirstLowInvalidHeapAttributeNumber,
- updatedCols) ||
- bms_is_member(keycol - FirstLowInvalidHeapAttributeNumber,
- extraUpdatedCols))
- {
- /* Changed key column -- don't hint for this index */
- indexInfo->ii_IndexUnchanged = false;
- return false;
- }
- }
-
- /*
- * When we get this far and index has no expressions, return true so that
- * index_insert() call will go on to pass 'indexUnchanged' = true hint.
- *
- * The _absence_ of an indexed key attribute that overlaps with updated
- * attributes (in addition to the total absence of indexed expressions)
- * shows that the index as a whole is logically unchanged by UPDATE.
- */
- if (!hasexpression)
- {
- indexInfo->ii_IndexUnchanged = true;
- return true;
- }
-
- /*
- * Need to pass only one bms to expression_tree_walker helper function.
- * Avoid allocating memory in common case where there are no extra cols.
- */
- if (!extraUpdatedCols)
- allUpdatedCols = updatedCols;
- else
- allUpdatedCols = bms_union(updatedCols, extraUpdatedCols);
-
- /*
- * We have to work slightly harder in the event of indexed expressions,
- * but the principle is the same as before: try to find columns (Vars,
- * actually) that overlap with known-updated columns.
- *
- * If we find any matching Vars, don't pass hint for index. Otherwise
- * pass hint.
- */
- idxExprs = RelationGetIndexExpressions(indexRelation);
- hasexpression = index_expression_changed_walker((Node *) idxExprs,
- allUpdatedCols);
- list_free(idxExprs);
- if (extraUpdatedCols)
- bms_free(allUpdatedCols);
-
- if (hasexpression)
- {
- indexInfo->ii_IndexUnchanged = false;
- return false;
- }
-
- /*
- * Deliberately don't consider index predicates. We should even give the
- * hint when result rel's "updated tuple" has no corresponding index
- * tuple, which is possible with a partial index (provided the usual
- * conditions are met).
- */
- indexInfo->ii_IndexUnchanged = true;
- return true;
-}
-
-/*
- * Indexed expression helper for index_unchanged_by_update().
- *
- * Returns true when Var that appears within allUpdatedCols located.
- */
-static bool
-index_expression_changed_walker(Node *node, Bitmapset *allUpdatedCols)
-{
- if (node == NULL)
- return false;
-
- if (IsA(node, Var))
- {
- Var *var = (Var *) node;
-
- if (bms_is_member(var->varattno - FirstLowInvalidHeapAttributeNumber,
- allUpdatedCols))
- {
- /* Var was updated -- indicates that we should not hint */
- return true;
- }
-
- /* Still haven't found a reason to not pass the hint */
- return false;
- }
-
- return expression_tree_walker(node, index_expression_changed_walker,
- allUpdatedCols);
-}
-
/*
* ExecWithoutOverlapsNotEmpty - raise an error if the tuple has an empty
* range or multirange in the given attribute.
diff --git a/src/backend/executor/execMutation.c b/src/backend/executor/execMutation.c
new file mode 100644
index 0000000000000..f875c6827c18b
--- /dev/null
+++ b/src/backend/executor/execMutation.c
@@ -0,0 +1,216 @@
+/*-------------------------------------------------------------------------
+ *
+ * execMutation.c
+ * Sub-attribute mutation tracking for UPDATE HOT optimization.
+ *
+ * src/backend/executor/execMutation.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "executor/execMutation.h"
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "access/tupdesc.h"
+#include "fmgr.h"
+#include "nodes/bitmapset.h"
+#include "optimizer/cost.h"
+#include "utils/idxsubattr.h"
+#include "utils/memutils.h"
+#include "varatt.h"
+
+void
+slot_add_modified_idx_attr(TupleTableSlot *slot, AttrNumber attnum)
+{
+ MemoryContext oldcxt;
+ int attidx;
+
+ Assert(slot != NULL);
+ Assert(AttributeNumberIsValid(attnum));
+
+ attidx = attnum - FirstLowInvalidHeapAttributeNumber;
+
+ /*
+ * Allocate in the slot's memory context (typically the per-query
+ * context), not in the per-tuple expression context. This ensures the
+ * Bitmapset survives expression context resets between ExecProcNode and
+ * ExecCheckIndexedAttrsForChanges.
+ */
+ oldcxt = MemoryContextSwitchTo(slot->tts_mcxt);
+ slot->tts_modified_idx_attrs = bms_add_member(slot->tts_modified_idx_attrs, attidx);
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*----------
+ * HeapCheckSubattrChanges - refine modified index attributes via sub-attribute comparison
+ *
+ * For each attribute number in 'check_attrs' (encoded with
+ * FirstLowInvalidHeapAttributeNumber offset as used by the bitmapset
+ * conventions in heapam.c), check whether the indexed sub-attributes
+ * actually changed between oldtup and newtup.
+ *
+ * Returns a Bitmapset of attribute numbers (same encoding) where
+ * the indexed sub-attributes did NOT change -- these can be removed from
+ * the modified index attributes set.
+ *
+ * Dual-path architecture
+ * ----------------------
+ * Sub-attribute modification tracking uses two complementary strategies:
+ *
+ * 1. Instrumented path (executor only): Mutation functions
+ * (jsonb_set, jsonb_delete, xpath, etc.) that modify portions of
+ * an attribute receive a SubattrTrackingContext via fcinfo->context.
+ * When these functions modify a sub-attribute that is used in forming
+ * an index key, they call slot_add_modified_idx_attr() to record that
+ * the attribute was modified in a way that affects the index.
+ * ExecUpdateModifiedIdxAttrs reads the accumulated tts_modified_idx_attrs
+ * from the slot. This is the fast path -- it avoids re-reading and
+ * re-comparing the old/new values entirely.
+ *
+ * 2. Fallback path (this function): For non-executor callers
+ * (simple_heap_update, catalog operations) where instrumentation
+ * is unavailable, and for executor updates with uninstrumented
+ * mutation functions (direct assignment, opaque functions, etc.).
+ * Extracts old and new column values, then calls the type-specific
+ * comparator (e.g. jsonb_idx_compare, xml_idx_compare) to check
+ * each indexed sub-attribute individually.
+ *
+ * For typical JSONB workloads with expression indexes, the instrumented
+ * path avoids the full-value comparison, yielding significant speedups
+ * (9-126x in benchmarks depending on document size and update pattern).
+ *
+ * TOAST safety
+ * ------------
+ * This function handles TOAST values correctly:
+ * - Inline-compressed values: decompressed in-memory (safe).
+ * - Externally-TOASTed values: skipped conservatively. Detoasting
+ * external values would read TOAST relation pages, risking
+ * lock-ordering issues when the caller holds a buffer lock.
+ * Skipping means we treat the column as changed, which is safe
+ * (it never misses a real change, but may force unneeded index updates).
+ *----------
+ */
+Bitmapset *
+HeapCheckSubattrChanges(Relation relation,
+ HeapTuple oldtup,
+ HeapTuple newtup,
+ Bitmapset *check_attrs)
+{
+ RelSubattrInfo *subattr_info;
+ TupleDesc tupdesc;
+ Bitmapset *safe_attrs = NULL;
+ int bms_idx;
+
+ if (!enable_subpath_hot)
+ return NULL;
+
+ subattr_info = RelationGetIdxSubattrs(relation);
+ if (subattr_info == NULL)
+ return NULL;
+
+ tupdesc = RelationGetDescr(relation);
+
+ bms_idx = -1;
+ while ((bms_idx = bms_next_member(check_attrs, bms_idx)) >= 0)
+ {
+ AttrNumber realattnum;
+ AttrSubattrInfo *attr_info;
+ bool old_isnull;
+ bool new_isnull;
+ Datum old_val;
+ Datum new_val;
+ bool subpath_changed;
+
+ realattnum = bms_idx + FirstLowInvalidHeapAttributeNumber;
+
+ elog(LOG, "HeapCheckSubattrChanges: checking column %d (bms_idx %d)", realattnum, bms_idx);
+
+ /* Only user-defined attributes can have subpath info */
+ if (realattnum < 1 || realattnum > tupdesc->natts)
+ continue;
+
+ /*
+ * Skip attributes that are also referenced by a simple (whole-column)
+ * index. For those, any byte change requires an index update
+ * regardless of subpath analysis.
+ */
+ if (bms_is_member(bms_idx, subattr_info->simple_indexed_attrs))
+ continue;
+
+ /* Quick membership test before linear scan */
+ if (!bms_is_member(bms_idx, subattr_info->subattr_attrs))
+ continue;
+
+ /* Look up subpath info for this attribute */
+ attr_info = NULL;
+ for (int i = 0; i < subattr_info->nattrs; i++)
+ {
+ if (subattr_info->attrs[i].attnum == realattnum)
+ {
+ attr_info = &subattr_info->attrs[i];
+ break;
+ }
+ }
+
+ if (attr_info == NULL || !attr_info->has_comparefn)
+ continue;
+
+ /* Extract old and new values */
+ old_val = heap_getattr(oldtup, realattnum, tupdesc, &old_isnull);
+ new_val = heap_getattr(newtup, realattnum, tupdesc, &new_isnull);
+
+ /* NULL transitions always count as changed */
+ if (old_isnull != new_isnull)
+ continue;
+
+ /* Both NULL: effectively unchanged for index purposes */
+ if (old_isnull)
+ {
+ safe_attrs = bms_add_member(safe_attrs, bms_idx);
+ continue;
+ }
+
+ /*
+ * For varlena types, skip externally-TOASTed values. We cannot
+ * safely detoast while the caller holds a buffer lock because
+ * detoasting reads from the TOAST relation (acquires buffer pins on
+ * different pages, risking lock-ordering issues).
+ *
+ * Inline-compressed values are fine -- decompression is purely
+ * in-memory.
+ */
+ if (TupleDescAttr(tupdesc, realattnum - 1)->attlen == -1)
+ {
+ struct varlena *old_ptr = (struct varlena *) DatumGetPointer(old_val);
+ struct varlena *new_ptr = (struct varlena *) DatumGetPointer(new_val);
+
+ if (VARATT_IS_EXTERNAL(old_ptr) || VARATT_IS_EXTERNAL(new_ptr))
+ continue; /* conservative: treat as changed */
+ }
+
+ /*
+ * Call the type-specific subpath comparator. The function receives
+ * the old value, new value, descriptor array, and descriptor count.
+ * Returns true if any indexed subpath value differs between old and
+ * new.
+ */
+ subpath_changed = DatumGetBool(
+ FunctionCall4(&attr_info->comparefn,
+ old_val,
+ new_val,
+ PointerGetDatum(attr_info->descriptors),
+ Int32GetDatum(attr_info->ndescriptors)));
+
+ elog(LOG, "HeapCheckSubattrChanges: jsonb_idx_compare returned %s for column %d",
+ subpath_changed ? "true (changed)" : "false (unchanged)", realattnum);
+
+ if (!subpath_changed)
+ {
+ elog(LOG, "HeapCheckSubattrChanges: adding column %d to safe_attrs", realattnum);
+ safe_attrs = bms_add_member(safe_attrs, bms_idx);
+ }
+ }
+
+ return safe_attrs;
+}
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index 2497ee7edc510..88fbbf1cb4b26 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -33,6 +33,7 @@
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/typcache.h"
@@ -906,6 +907,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo,
bool skip_tuple = false;
Relation rel = resultRelInfo->ri_RelationDesc;
ItemPointer tid = &(searchslot->tts_tid);
+ Bitmapset *modified_idx_attrs;
/*
* We support only non-system tables, with
@@ -928,7 +930,6 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo,
if (!skip_tuple)
{
List *recheckIndexes = NIL;
- TU_UpdateIndexes update_indexes;
List *conflictindexes;
bool conflict = false;
@@ -944,25 +945,35 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo,
if (rel->rd_rel->relispartition)
ExecPartitionCheck(resultRelInfo, slot, estate, true);
+ modified_idx_attrs = ExecUpdateModifiedIdxAttrs(resultRelInfo,
+ estate, searchslot, slot);
+
simple_table_tuple_update(rel, tid, slot, estate->es_snapshot,
- &update_indexes);
+ &modified_idx_attrs);
conflictindexes = resultRelInfo->ri_onConflictArbiterIndexes;
- if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None))
+ if (resultRelInfo->ri_NumIndices > 0 &&
+ !bms_is_empty(modified_idx_attrs))
{
bits32 flags = EIIT_IS_UPDATE;
if (conflictindexes != NIL)
flags |= EIIT_NO_DUPE_ERROR;
- if (update_indexes == TU_Summarizing)
- flags |= EIIT_ONLY_SUMMARIZING;
+ if (bms_is_member(MODIFIED_IDX_ATTRS_ALL_IDX,
+ modified_idx_attrs))
+ flags |= EIIT_ALL_INDEXES;
+
+ ExecSetIndexUnchanged(resultRelInfo, modified_idx_attrs);
+
recheckIndexes = ExecInsertIndexTuples(resultRelInfo,
estate, flags,
slot, conflictindexes,
&conflict);
}
+ bms_free(modified_idx_attrs);
+
/*
* Refer to the comments above the call to CheckAndReportConflict() in
* ExecSimpleRelationInsert to understand why this check is done at
diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index b768eae9e53d4..9ff69994c81a9 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -66,6 +66,7 @@
#include "nodes/nodeFuncs.h"
#include "storage/bufmgr.h"
#include "utils/builtins.h"
+#include "utils/datum.h"
#include "utils/expandeddatum.h"
#include "utils/lsyscache.h"
#include "utils/typcache.h"
@@ -1342,6 +1343,8 @@ MakeTupleTableSlot(TupleDesc tupleDesc,
PinTupleDesc(tupleDesc);
}
+ slot->tts_modified_idx_attrs = NULL;
+
/*
* And allow slot type specific initialization.
*/
@@ -1929,6 +1932,83 @@ ExecFetchSlotHeapTupleDatum(TupleTableSlot *slot)
return ret;
}
+/*
+ * ExecCompareSlotAttrs
+ *
+ * Compare the subset of attributes in attrs between TupleTableSlots to detect
+ * which attributes have changed.
+ *
+ * Returns a Bitmapset of attribute indices (using
+ * FirstLowInvalidHeapAttributeNumber convention) that differ between the two
+ * slots.
+ */
+Bitmapset *
+ExecCompareSlotAttrs(TupleDesc tupdesc, const Bitmapset *attrs,
+ TupleTableSlot *s1, TupleTableSlot *s2)
+{
+ int attidx = -1;
+ Bitmapset *modified = NULL;
+
+ /* XXX what if slots don't share the same tupleDescriptor... */
+ /* Assert(s1->tts_tupleDescriptor == s2->tts_tupleDescriptor); */
+
+ while ((attidx = bms_next_member(attrs, attidx)) >= 0)
+ {
+ /* attidx is zero-based, attrnum is the normal attribute number */
+ AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
+ Datum value1,
+ value2;
+ bool null1,
+ null2;
+ CompactAttribute *att;
+
+ /*
+ * If it's a whole-tuple reference, say "not equal". It's not really
+ * worth supporting this case, since it could only succeed after a
+ * no-op update, which is hardly a case worth optimizing for.
+ */
+ if (attrnum == 0)
+ {
+ modified = bms_add_member(modified, attidx);
+ continue;
+ }
+
+ /*
+ * Likewise, automatically say "not equal" for any system attribute
+ * other than tableOID; we cannot expect these to be consistent in a
+ * HOT chain, or even to be set correctly yet in the new tuple.
+ */
+ if (attrnum < 0)
+ {
+ if (attrnum != TableOidAttributeNumber)
+ {
+ modified = bms_add_member(modified, attidx);
+ continue;
+ }
+ }
+
+ att = TupleDescCompactAttr(tupdesc, attrnum - 1);
+ value1 = slot_getattr(s1, attrnum, &null1);
+ value2 = slot_getattr(s2, attrnum, &null2);
+
+ /* A change to/from NULL, so not equal */
+ if (null1 != null2)
+ {
+ modified = bms_add_member(modified, attidx);
+ continue;
+ }
+
+ /* Both NULL, no change/unmodified */
+ if (null2)
+ continue;
+
+ if (!datum_image_eq(value1, value2, att->attbyval, att->attlen))
+ modified = bms_add_member(modified, attidx);
+ }
+
+ return modified;
+}
+
/* ----------------------------------------------------------------
* convenience initialization routines
* ----------------------------------------------------------------
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index a7955e476f903..da592f4cd37a5 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -132,6 +132,8 @@ CreateExecutorState(void)
estate->es_insert_pending_result_relations = NIL;
estate->es_insert_pending_modifytables = NIL;
+ estate->es_pending_subpath_context = NULL;
+
estate->es_param_list_info = NULL;
estate->es_param_exec_vals = NULL;
diff --git a/src/backend/executor/meson.build b/src/backend/executor/meson.build
index dc45be0b2ce97..2c0c292f2b74e 100644
--- a/src/backend/executor/meson.build
+++ b/src/backend/executor/meson.build
@@ -10,6 +10,7 @@ backend_sources += files(
'execIndexing.c',
'execJunk.c',
'execMain.c',
+ 'execMutation.c',
'execParallel.c',
'execPartition.c',
'execProcnode.c',
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 327c27abff9c8..e4c99b8eebc17 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -17,6 +17,7 @@
* ExecModifyTable - retrieve the next tuple from the node
* ExecEndModifyTable - shut down the ModifyTable node
* ExecReScanModifyTable - rescan the ModifyTable node
+ * ExecUpdateModifiedIdxAttrs - find set of updated indexed columns
*
* NOTES
* The ModifyTable node receives input from its outerPlan, which is
@@ -54,23 +55,31 @@
#include "access/htup_details.h"
#include "access/tableam.h"
+#include "access/tupdesc.h"
#include "access/xact.h"
#include "commands/trigger.h"
+#include "catalog/pg_proc.h"
+#include "executor/execExpr.h"
+#include "executor/execMutation.h"
#include "executor/execPartition.h"
#include "executor/executor.h"
#include "executor/nodeModifyTable.h"
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
+#include "optimizer/cost.h"
#include "optimizer/optimizer.h"
#include "rewrite/rewriteHandler.h"
#include "rewrite/rewriteManip.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/datum.h"
+#include "utils/idxsubattr.h"
#include "utils/injection_point.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+
typedef struct MTTargetRelLookup
@@ -123,7 +132,14 @@ typedef struct ModifyTableContext
typedef struct UpdateContext
{
bool crossPartUpdate; /* was it a cross-partition update? */
- TU_UpdateIndexes updateIndexes; /* Which index updates are required? */
+
+ /*
+ * Modified indexed attributes bitmapset, set by ExecUpdateAct().
+ * After table_tuple_update(), the MODIFIED_IDX_ATTRS_ALL_IDX sentinel
+ * bit may be set to indicate a non-HOT update requiring all indexes
+ * to be updated.
+ */
+ Bitmapset *modifiedIdxAttrs;
/*
* Lock mode to acquire on the latest tuple version before performing
@@ -187,7 +203,271 @@ static TupleTableSlot *ExecMergeMatched(ModifyTableContext *context,
static TupleTableSlot *ExecMergeNotMatched(ModifyTableContext *context,
ResultRelInfo *resultRelInfo,
bool canSetTag);
+static bool ExecSubattributeCompare(Relation rel, AttrNumber attnum,
+ Datum old_val, Datum new_val);
+static void InitModifiedIdxTracking(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ PlanState *subplanstate,
+ List *updateColnos);
+static bool HasCompleteModificationTracking(Node *expr, AttrNumber target_attnum);
+
+/*
+ * ExecSubattributeCompare
+ *
+ * Call the type's typidxcompare function to check whether any indexed
+ * subpath on this attribute has a different value between old and new.
+ *
+ * Returns true if any indexed subpath value changed.
+ */
+static bool
+ExecSubattributeCompare(Relation rel, AttrNumber attnum,
+ Datum old_val, Datum new_val)
+{
+ AttrSubattrInfo *attrinfo;
+ attrinfo = RelationGetAttrSubattrInfo(rel, attnum);
+
+ /* No compare function; conservatively assume changed */
+ if (attrinfo == NULL || !attrinfo->has_comparefn)
+ return true;
+
+ /*
+ * typidxcompare(old, new, descriptors_array, ndescriptors) -> bool
+ *
+ * The descriptors are passed as an internal pointer + count. The function
+ * returns true if any indexed subpath value differs.
+ */
+ return DatumGetBool(FunctionCall4(&attrinfo->comparefn,
+ old_val,
+ new_val,
+ PointerGetDatum(attrinfo->descriptors),
+ Int32GetDatum(attrinfo->ndescriptors)));
+}
+
+/*
+ * ExecUpdateModifiedIdxAttrs
+ *
+ * Find the set of attributes referenced by this relation and used in this
+ * UPDATE that now differ in value. This is done by reviewing slot datums that
+ * are in the UPDATE statement and are known to be referenced by at least one
+ * index in some way. This set is called the "modified indexed attributes" or
+ * "modified_idx_attrs". An overlap of a single index's attributes and this
+ * set signals that the attributes in the new_tts used to form the index datum
+ * have changed.
+ *
+ * Returns a Bitmapset that contains the set of modified (changed) indexed
+ * attributes between oldtup and newtup.
+ *
+ * We byte-compare (datum_is_equal) most non-sub-attribute indexed
+ * columns. For sub-attribute-aware columns the logic is:
+ *
+ * (a) Fully instrumented (mutation fns tracked all changes):
+ * - attnum IN modified_idx_attrs -> changed
+ * - attnum NOT IN modified_idx_attrs -> unchanged
+ *
+ * (b) Not fully instrumented (direct assignment, opaque fns, etc.):
+ * - attnum IN modified_idx_attrs -> changed
+ * - attnum NOT IN modified_idx_attrs:
+ * bytes equal -> unchanged
+ * bytes differ -> call typidxcompare:
+ * true -> changed
+ * false -> unchanged (sub-attributes same despite byte diff)
+ *
+ * NOTE: There is a similar function called HeapUpdateModifiedIdxAttrs() that
+ * operates on the old TID and new HeapTuple rather than the old/new
+ * TupleTableSlots as this function does. These two functions should mirror
+ * one another until someday when catalog tuple updates track their changes
+ * avoiding the need to re-discover them in simple_heap_update().
+ */
+Bitmapset *
+ExecUpdateModifiedIdxAttrs(ResultRelInfo *resultRelInfo,
+ EState *estate,
+ TupleTableSlot *old_tts,
+ TupleTableSlot *new_tts)
+{
+ Relation relation = resultRelInfo->ri_RelationDesc;
+ TupleDesc tupdesc = RelationGetDescr(relation);
+ RelSubattrInfo *subattrinfo;
+ Bitmapset *instrumented = resultRelInfo->ri_InstrumentedIdxAttrs;
+ Bitmapset *idx_attrs;
+ Bitmapset *acc_attrs = NULL;
+ Bitmapset *com_attrs = NULL;
+ Bitmapset *sub_attrs = NULL;
+ Bitmapset *result = NULL;
+ int attidx;
+
+ /* If no indexes, we're done */
+ if (resultRelInfo->ri_NumIndices == 0)
+ return NULL;
+
+ /*
+ * Skip subpath optimization for system catalog tables.
+ * RelationGetIdxSubattrs() triggers syscache lookups which can see
+ * inconsistent catalog state during catalog updates (e.g., ALTER TYPE
+ * RENAME). System catalogs never have JSONB/XML expression indexes
+ * anyway.
+ */
+ if (IsSystemRelation(relation))
+ subattrinfo = NULL;
+ else
+ subattrinfo = RelationGetIdxSubattrs(relation);
+
+ /*
+ * Build the union of all "interesting" attribute sets. This must cover
+ * every column that heap_update()'s HeapSatisfiesHOTandKeyUpdate will
+ * check, otherwise we risk incorrect satisfies_key or satisfies_id
+ * decisions. In particular, REPLICA IDENTITY FULL includes non-indexed
+ * columns in IDENTITY_KEY; we must detect changes to those columns for
+ * correct logical decoding.
+ */
+ idx_attrs = RelationGetIndexAttrBitmap(relation,
+ INDEX_ATTR_BITMAP_INDEXED);
+ idx_attrs = bms_add_members(idx_attrs,
+ RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY));
+ idx_attrs = bms_add_members(idx_attrs,
+ RelationGetIndexAttrBitmap(relation,
+ INDEX_ATTR_BITMAP_IDENTITY_KEY));
+
+ /*
+ * Fetch the set of attributes explicitly SET in the UPDATE statement or
+ * set by a before row trigger (even if not mentioned in the SQL) from the
+ * executor state and then find the intersection with the indexed
+ * attributes. Attributes that are SET might not change value, so we have
+ * to examine them for changes.
+ */
+ idx_attrs = bms_int_members(idx_attrs, ExecGetAllUpdatedCols(resultRelInfo, estate));
+
+ /*
+ * Read the accumulated mix tracking bitmapset from the slot. NULL means
+ * "no mutation function reported any change" but that doesn't mean there
+ * are no modified indexed attributes, so we still need to check here.
+ */
+ if (resultRelInfo->ri_MixSlot != NULL)
+ acc_attrs = resultRelInfo->ri_MixSlot->tts_modified_idx_attrs;
+
+ /*----------
+ * Split SET/indexed attributes into two groups:
+ *
+ * com_attrs - standard byte compare (no subpath info)
+ * sub_attrs - eligible for subpath comparison
+ *
+ * An attribute is "subpath only" when it has subpath descriptors
+ * AND is not referenced by any simple (whole-column) index.
+ *
+ * XXX cache (relcache?) these?
+ *----------
+ */
+ attidx = -1;
+ while ((attidx = bms_next_member(idx_attrs, attidx)) >= 0)
+ {
+ AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
+
+ if (subattrinfo != NULL &&
+ attrnum > 0 &&
+ bms_is_member(attidx, subattrinfo->subattr_attrs) &&
+ !bms_is_member(attidx, subattrinfo->simple_indexed_attrs))
+ sub_attrs = bms_add_member(sub_attrs, attidx);
+ else
+ com_attrs = bms_add_member(com_attrs, attidx);
+ }
+
+ /* Simple attributes */
+ if (!bms_is_empty(com_attrs))
+ {
+ Bitmapset *changed = ExecCompareSlotAttrs(tupdesc, com_attrs,
+ old_tts, new_tts);
+
+ result = bms_union(result, changed);
+ bms_free(changed);
+ }
+
+ /* sub-attribute-aware attributes */
+ if (!bms_is_empty(sub_attrs))
+ {
+ /* First compare ALL subpath-only attrs */
+ Bitmapset *changed = ExecCompareSlotAttrs(tupdesc, sub_attrs,
+ old_tts, new_tts);
+
+ attidx = -1;
+ while ((attidx = bms_next_member(sub_attrs, attidx)) >= 0)
+ {
+ AttrNumber attrnum;
+ bool in_mix;
+ bool is_instrumented;
+ bool bytes_differ;
+
+ attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
+ in_mix = bms_is_member(attidx, acc_attrs);
+ is_instrumented = bms_is_member(attidx, instrumented);
+ bytes_differ = bms_is_member(attidx, changed);
+
+ /* A mutation function already recorded a change */
+ if (in_mix)
+ {
+ result = bms_add_member(result, attidx);
+ continue;
+ }
+
+ /*
+ * Fully instrumented, but mutation functions did NOT report a
+ * change. They checked all indexed subpaths and found none
+ * changed. Safe to skip, even if the column's bytes differ
+ * (non-indexed subpaths changed).
+ */
+ if (is_instrumented)
+ continue;
+
+ /*----------
+ * Not fully instrumented and not in modified_idx_attrs.
+ * This covers:
+ * - Direct assignment (SET data = '...'::jsonb)
+ * - Opaque/uninstrumented functions (e.g. XML,
+ * or JSONB methods without mutation tracking)
+ *
+ * Byte compare as fast path, then type-specific
+ * subpath compare for ambiguous cases.
+ *----------
+ */
+ if (bytes_differ)
+ {
+ Datum old_val,
+ new_val;
+ bool old_null,
+ new_null;
+
+ /*
+ * Bytes differ, so call the type's comparison function to
+ * check if any indexed subpath value actually changed.
+ */
+ old_val = slot_getattr(old_tts, attrnum, &old_null);
+ new_val = slot_getattr(new_tts, attrnum, &new_null);
+
+ /*
+ * A NULL transition (NULL->non-NULL or non-NULL->NULL) always
+ * counts as a change. We cannot call the type-specific
+ * subpath comparator on NULL values.
+ */
+ if (old_null || new_null)
+ {
+ result = bms_add_member(result, attidx);
+ continue;
+ }
+
+ if (ExecSubattributeCompare(relation, attrnum, old_val, new_val))
+ result = bms_add_member(result, attidx);
+ /* else: bytes differ but indexed subpaths unchanged, so skip */
+ }
+ }
+
+ bms_free(changed);
+ }
+
+ bms_free(idx_attrs);
+ bms_free(com_attrs);
+ bms_free(sub_attrs);
+
+ return result;
+}
/*
* Verify that the tuples to be produced by INSERT match the
@@ -766,6 +1046,85 @@ ExecInitUpdateProjection(ModifyTableState *mtstate,
&mtstate->ps);
resultRelInfo->ri_projectNewInfoValid = true;
+
+ /*
+ * Initialize SubattrTrackingContext for sub-attribute mutation tracking
+ * if this relation has subpath-eligible indexes.
+ *
+ * Skip for system catalog tables to avoid syscache lookups during catalog
+ * updates which can see inconsistent state.
+ */
+ resultRelInfo->ri_InstrumentedIdxAttrs = NULL;
+ resultRelInfo->ri_MixSlot = resultRelInfo->ri_newTupleSlot;
+
+ if (!IsSystemRelation(resultRelInfo->ri_RelationDesc) &&
+ RelationGetIdxSubattrs(resultRelInfo->ri_RelationDesc) != NULL)
+ {
+ RelSubattrInfo *sainfo = RelationGetIdxSubattrs(resultRelInfo->ri_RelationDesc);
+ SubattrTrackingContext *subattr_ctx;
+ ListCell *lc;
+ ListCell *lc2;
+
+ /*
+ * Create a SubattrTrackingContext that will be shared by all
+ * instrumented function calls in this relation's UPDATE projection.
+ * target_attnum is set per-step during expression evaluation.
+ */
+ subattr_ctx = makeNode(SubattrTrackingContext);
+ subattr_ctx->rel = resultRelInfo->ri_RelationDesc;
+ subattr_ctx->target_attnum = InvalidAttrNumber; /* set per-step */
+ subattr_ctx->modified_idx_slot = resultRelInfo->ri_newTupleSlot;
+
+ /*
+ * Walk targetlist and updateColnos in parallel to find
+ * fully-instrumented columns. We must use updateColnos to get the
+ * actual table attnum for each target entry, because tle->resno is
+ * the subplan output position, which may differ from the table column
+ * number.
+ */
+ forboth(lc, subplan->targetlist, lc2, updateColnos)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+ AttrNumber attnum = lfirst_int(lc2);
+ bool has_subpath;
+ int i;
+
+ Assert(!tle->resjunk);
+
+ /* Check if this column has subpath descriptors */
+ has_subpath = false;
+ for (i = 0; i < sainfo->nattrs; i++)
+ {
+ if (sainfo->attrs[i].attnum == attnum)
+ {
+ has_subpath = true;
+ break;
+ }
+ }
+
+ if (!has_subpath)
+ continue;
+
+ /*
+ * Check if the SET expression for this column is fully covered by
+ * instrumented mutation functions.
+ */
+ if (HasCompleteModificationTracking((Node *) tle->expr, attnum))
+ resultRelInfo->ri_InstrumentedIdxAttrs =
+ bms_add_member(resultRelInfo->ri_InstrumentedIdxAttrs,
+ attnum - FirstLowInvalidHeapAttributeNumber);
+ }
+
+ /*
+ * Attach SubattrTrackingContext to the projection's ExprState so
+ * EEOP_FUNCEXPR steps can find it.
+ */
+ if (resultRelInfo->ri_InstrumentedIdxAttrs != NULL &&
+ resultRelInfo->ri_projectNew != NULL)
+ {
+ resultRelInfo->ri_projectNew->pi_state.es_subattr_context = subattr_ctx;
+ }
+ }
}
/*
@@ -825,6 +1184,7 @@ ExecGetUpdateNewTuple(ResultRelInfo *relinfo,
{
ProjectionInfo *newProj = relinfo->ri_projectNew;
ExprContext *econtext;
+ TupleTableSlot *result;
/* Use a few extra Asserts to protect against outside callers */
Assert(relinfo->ri_projectNewInfoValid);
@@ -834,7 +1194,24 @@ ExecGetUpdateNewTuple(ResultRelInfo *relinfo,
econtext = newProj->pi_exprContext;
econtext->ecxt_outertuple = planSlot;
econtext->ecxt_scantuple = oldSlot;
- return ExecProject(newProj);
+ result = ExecProject(newProj);
+
+ /*
+ * Copy the modified indexed attributes bitmap from the plan slot to the
+ * result slot. This bitmap was populated during SET expression evaluation
+ * (in planSlot) by instrumented mutation functions, and needs to be
+ * propagated to the result slot so ExecUpdateModifiedIdxAttrs can read
+ * it.
+ */
+ if (planSlot->tts_modified_idx_attrs != NULL)
+ {
+ MemoryContext oldcxt = MemoryContextSwitchTo(result->tts_mcxt);
+
+ result->tts_modified_idx_attrs = bms_copy(planSlot->tts_modified_idx_attrs);
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ return result;
}
/* ----------------------------------------------------------------
@@ -2195,14 +2572,17 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo,
*/
static TM_Result
ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
- ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot,
- bool canSetTag, UpdateContext *updateCxt)
+ ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *oldSlot,
+ TupleTableSlot *slot, bool canSetTag, UpdateContext *updateCxt)
{
EState *estate = context->estate;
Relation resultRelationDesc = resultRelInfo->ri_RelationDesc;
bool partition_constraint_failed;
TM_Result result;
+ /* The set of modified indexed attributes that trigger new index entries */
+ Bitmapset *modified_idx_attrs = NULL;
+
updateCxt->crossPartUpdate = false;
/*
@@ -2319,13 +2699,25 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
ExecConstraints(resultRelInfo, slot, estate);
/*
- * replace the heap tuple
+ * Next up we need to find out the set of indexed attributes that have
+ * changed in value and should trigger a new index tuple. We could start
+ * with the set of updated columns via ExecGetUpdatedCols(), but if we do
+ * we will overlook attributes directly modified by heap_modify_tuple()
+ * which are not known to ExecGetUpdatedCols().
+ */
+ modified_idx_attrs = ExecUpdateModifiedIdxAttrs(resultRelInfo, estate, oldSlot, slot);
+
+ /*
+ * Call into the table AM to update the heap tuple.
*
* Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check that
* the row to be updated is visible to that snapshot, and throw a
* can't-serialize error if not. This is a special-case behavior needed
* for referential integrity updates in transaction-snapshot mode
* transactions.
+ *
+ * The table AM may set the MODIFIED_IDX_ATTRS_ALL_IDX sentinel bit in
+ * modified_idx_attrs to signal that this was a non-HOT update.
*/
result = table_tuple_update(resultRelationDesc, tupleid, slot,
estate->es_output_cid,
@@ -2333,7 +2725,10 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
estate->es_crosscheck_snapshot,
true /* wait for commit */ ,
&context->tmfd, &updateCxt->lockmode,
- &updateCxt->updateIndexes);
+ &modified_idx_attrs);
+
+ /* Save modified_idx_attrs for use by ExecUpdateEpilogue */
+ updateCxt->modifiedIdxAttrs = modified_idx_attrs;
return result;
}
@@ -2353,17 +2748,35 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt,
List *recheckIndexes = NIL;
/* insert index entries for tuple if necessary */
- if (resultRelInfo->ri_NumIndices > 0 && (updateCxt->updateIndexes != TU_None))
+ if (resultRelInfo->ri_NumIndices > 0 &&
+ !bms_is_empty(updateCxt->modifiedIdxAttrs))
{
bits32 flags = EIIT_IS_UPDATE;
- if (updateCxt->updateIndexes == TU_Summarizing)
- flags |= EIIT_ONLY_SUMMARIZING;
+ /*
+ * Check the MODIFIED_IDX_ATTRS_ALL_IDX sentinel bit to determine if
+ * this is a non-HOT update (all indexes need entries) or a HOT update
+ * (only summarizing indexes with modified columns need entries).
+ */
+ if (bms_is_member(MODIFIED_IDX_ATTRS_ALL_IDX,
+ updateCxt->modifiedIdxAttrs))
+ flags |= EIIT_ALL_INDEXES;
+
+ /*
+ * Determine per-index unchanged status. This populates
+ * ii_IndexUnchanged on each IndexInfo, which ExecInsertIndexTuples()
+ * uses to determine per-index behavior.
+ */
+ ExecSetIndexUnchanged(resultRelInfo, updateCxt->modifiedIdxAttrs);
+
recheckIndexes = ExecInsertIndexTuples(resultRelInfo, context->estate,
flags, slot, NIL,
NULL);
}
+ bms_free(updateCxt->modifiedIdxAttrs);
+ updateCxt->modifiedIdxAttrs = NULL;
+
/* AFTER ROW UPDATE Triggers */
ExecARUpdateTriggers(context->estate, resultRelInfo,
NULL, NULL,
@@ -2555,8 +2968,8 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
*/
redo_act:
lockedtid = *tupleid;
- result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, slot,
- canSetTag, &updateCxt);
+ result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, oldSlot,
+ slot, canSetTag, &updateCxt);
/*
* If ExecUpdateAct reports that a cross-partition update was done,
@@ -3406,8 +3819,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
Assert(oldtuple == NULL);
result = ExecUpdateAct(context, resultRelInfo, tupleid,
- NULL, newslot, canSetTag,
- &updateCxt);
+ NULL, resultRelInfo->ri_oldTupleSlot,
+ newslot, canSetTag, &updateCxt);
/*
* As in ExecUpdate(), if ExecUpdateAct() reports that a
@@ -4450,6 +4863,22 @@ ExecModifyTable(PlanState *pstate)
continue; /* continue with the next tuple */
}
+ /* Reset the mix accumulator before SET expression evaluation */
+ if (resultRelInfo->ri_MixSlot != NULL)
+ {
+ TupleTableSlot *modified_idx_slot = resultRelInfo->ri_MixSlot;
+
+ if (modified_idx_slot->tts_modified_idx_attrs != NULL)
+ {
+ /*
+ * Free in the slot's memory context, where it was allocated
+ * by slot_add_modified_idx_attr.
+ */
+ pfree(modified_idx_slot->tts_modified_idx_attrs);
+ modified_idx_slot->tts_modified_idx_attrs = NULL;
+ }
+ }
+
/* Fetch the next row from subplan */
context.planSlot = ExecProcNode(subplanstate);
context.cpDeletedSlot = NULL;
@@ -4544,7 +4973,7 @@ ExecModifyTable(PlanState *pstate)
* For UPDATE/DELETE/MERGE, fetch the row identity info for the tuple
* to be updated/deleted/merged. For a heap relation, that's a TID;
* otherwise we may have a wholerow junk attr that carries the old
- * tuple in toto. Keep this in step with the part of
+ * tuple in toto. Keep this in step with the part of
* ExecInitModifyTable that sets up ri_RowIdAttNo.
*/
if (operation == CMD_UPDATE || operation == CMD_DELETE ||
@@ -4968,6 +5397,9 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
mtstate->rootResultRelInfo = makeNode(ResultRelInfo);
ExecInitResultRelation(estate, mtstate->rootResultRelInfo,
node->rootRelation);
+ /* Initialize new struct fields to prevent garbage reads */
+ mtstate->rootResultRelInfo->ri_MixSlot = NULL;
+ mtstate->rootResultRelInfo->ri_InstrumentedIdxAttrs = NULL;
}
else
{
@@ -4976,6 +5408,9 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
mtstate->rootResultRelInfo = mtstate->resultRelInfo;
ExecInitResultRelation(estate, mtstate->resultRelInfo,
linitial_int(resultRelations));
+ /* Initialize new struct fields to prevent garbage reads */
+ mtstate->resultRelInfo->ri_MixSlot = NULL;
+ mtstate->resultRelInfo->ri_InstrumentedIdxAttrs = NULL;
}
/* set up epqstate with dummy subplan data for the moment */
@@ -5009,6 +5444,9 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
if (resultRelInfo != mtstate->rootResultRelInfo)
{
ExecInitResultRelation(estate, resultRelInfo, resultRelation);
+ /* Initialize new struct fields to prevent garbage reads */
+ resultRelInfo->ri_MixSlot = NULL;
+ resultRelInfo->ri_InstrumentedIdxAttrs = NULL;
/*
* For child result relations, store the root result relation
@@ -5033,11 +5471,70 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
i++;
}
+ /*
+ * For UPDATE operations, set up pending SubattrTrackingContext so that
+ * ExecBuildUpdateProjection can inject it during expression compilation.
+ * This enables HOT updates when only non-indexed JSONB/XML subpaths are
+ * modified.
+ */
+ if (operation == CMD_UPDATE && enable_subpath_hot)
+ {
+ ResultRelInfo *firstResultRelInfo = mtstate->resultRelInfo;
+ Relation resultRel = firstResultRelInfo->ri_RelationDesc;
+ RelSubattrInfo *subattrinfo;
+
+ /* Check if this relation has sub-attribute expression indexes */
+ if (!IsSystemRelation(resultRel))
+ {
+ subattrinfo = RelationGetIdxSubattrs(resultRel);
+ if (subattrinfo != NULL)
+ {
+ SubattrTrackingContext *pending_context;
+ List *updateColnos;
+
+ /* Get updateColnos for the first result relation */
+ updateColnos = (List *) linitial(mtstate->mt_updateColnosLists);
+
+ /* Create the context */
+ pending_context = makeNode(SubattrTrackingContext);
+ pending_context->rel = resultRel;
+ pending_context->modified_idx_slot = NULL; /* Will be set to
+ * subplan's result slot */
+ pending_context->target_attnum = InvalidAttrNumber; /* Set per-function
+ * during execution */
+ pending_context->resno_to_attnum = NULL; /* Will be populated in
+ * ExecBuildProjectionInfo */
+ pending_context->max_resno = 0;
+ pending_context->updateColnos = updateColnos; /* Store for
+ * resno->attnum mapping */
+
+ /* Store in EState for ExecBuildUpdateProjection to find */
+ estate->es_pending_subpath_context = pending_context;
+ }
+ }
+ }
+
/*
* Now we may initialize the subplan.
*/
outerPlanState(mtstate) = ExecInitNode(subplan, estate, eflags);
+ /*
+ * Update modified_idx_slot now that subplan initialization is complete. DON'T
+ * clear the pending context yet - it needs to remain available for
+ * ExecBuildUpdateProjection which is called lazily during execution.
+ */
+ if (estate->es_pending_subpath_context != NULL)
+ {
+ /* Update modified_idx_slot to point to the subplan's result slot */
+ if (outerPlanState(mtstate) != NULL &&
+ outerPlanState(mtstate)->ps_ResultTupleSlot != NULL)
+ {
+ estate->es_pending_subpath_context->modified_idx_slot =
+ outerPlanState(mtstate)->ps_ResultTupleSlot;
+ }
+ }
+
/*
* Do additional per-result-relation initialization.
*/
@@ -5332,6 +5829,19 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
if (mtstate->operation == CMD_MERGE)
ExecInitMerge(mtstate, estate);
+
+ if (operation == CMD_UPDATE)
+ {
+ int whichrel = resultRelInfo - mtstate->resultRelInfo;
+ List *updateColnos;
+
+ Assert(whichrel >= 0 && whichrel < mtstate->mt_nrels);
+ updateColnos = (List *) list_nth(mtstate->mt_updateColnosLists,
+ whichrel);
+ InitModifiedIdxTracking(mtstate, resultRelInfo,
+ outerPlanState(mtstate), updateColnos);
+ }
+
EvalPlanQualSetPlan(&mtstate->mt_epqstate, subplan, arowmarks);
/*
@@ -5489,3 +5999,369 @@ ExecReScanModifyTable(ModifyTableState *node)
*/
elog(ERROR, "ExecReScanModifyTable is not implemented");
}
+
+/*
+ * HasCompleteModificationTracking
+ *
+ * Returns true if 'expr' is a chain of prosubattrmutator functions whose
+ * source-datum argument (arg[0]) ultimately traces back to a Var
+ * referencing 'target_attnum'.
+ *
+ * This means every transformation of the column value is instrumented:
+ * mutation functions will detect any change to indexed subpaths.
+ *
+ * Returns false for direct assignment (Const), opaque functions,
+ * CASE/COALESCE wrappers, or any expression shape we can't verify.
+ */
+static bool
+HasCompleteModificationTracking(Node *expr, AttrNumber target_attnum)
+{
+ if (expr == NULL)
+ return false;
+
+ /* Strip implicit casts */
+ if (IsA(expr, RelabelType))
+ return HasCompleteModificationTracking(
+ (Node *) ((RelabelType *) expr)->arg, target_attnum);
+
+ if (IsA(expr, CoerceViaIO))
+ return false; /* IO coercion can change representation */
+
+ /* Base case: Var referencing the same column */
+ if (IsA(expr, Var))
+ {
+ Var *var = (Var *) expr;
+
+ return (var->varattno == target_attnum);
+ }
+
+ /* Recursive case: prosubattrmutator function */
+ if (IsA(expr, FuncExpr))
+ {
+ FuncExpr *func = (FuncExpr *) expr;
+ HeapTuple procTup;
+ bool is_mutator;
+
+ procTup = SearchSysCache1(PROCOID,
+ ObjectIdGetDatum(func->funcid));
+ if (!HeapTupleIsValid(procTup))
+ return false;
+
+ is_mutator = ((Form_pg_proc) GETSTRUCT(procTup))->prosubattrmutator;
+ ReleaseSysCache(procTup);
+
+ if (!is_mutator)
+ return false;
+
+ /* Source datum must be arg[0] */
+ if (list_length(func->args) < 1)
+ return false;
+
+ return HasCompleteModificationTracking(linitial(func->args),
+ target_attnum);
+ }
+
+ /* OpExpr (operators like ||): check underlying function */
+ if (IsA(expr, OpExpr))
+ {
+ OpExpr *op = (OpExpr *) expr;
+ HeapTuple procTup;
+ bool is_mutator;
+
+ procTup = SearchSysCache1(PROCOID,
+ ObjectIdGetDatum(op->opfuncid));
+ if (!HeapTupleIsValid(procTup))
+ return false;
+
+ is_mutator = ((Form_pg_proc) GETSTRUCT(procTup))->prosubattrmutator;
+ ReleaseSysCache(procTup);
+
+ if (!is_mutator)
+ return false;
+
+ if (list_length(op->args) < 1)
+ return false;
+
+ return HasCompleteModificationTracking(linitial(op->args),
+ target_attnum);
+ }
+
+ /* Any other node type — not verifiable */
+ return false;
+}
+
+/*
+ * InjectMixContextIntoExprState
+ *
+ * Walk the compiled ExprState steps backward. For each EEOP_FUNCEXPR*
+ * step whose function has prosubattrmutator=true, and which belongs to a
+ * SET target on a sub-attribute-aware column, inject a SubattrTrackingContext into
+ * fcinfo->context.
+ *
+ * The backward walk uses EEOP_ASSIGN_TMP* steps to determine which
+ * target column the preceding computation steps belong to:
+ *
+ * ... computation steps for column N ...
+ * EEOP_ASSIGN_TMP resultnum = (attnum - 1)
+ * ... computation steps for column N+1 ...
+ * EEOP_ASSIGN_TMP resultnum = (attnum_next - 1)
+ *
+ * Walking backward, each ASSIGN sets the "current target attnum",
+ * and all FUNCEXPR steps between two ASSIGNs belong to that target.
+ */
+static void
+InjectMixContextIntoExprState(ExprState *state,
+ Relation rel,
+ TupleTableSlot *modified_idx_slot,
+ RelSubattrInfo *subattrinfo)
+{
+ AttrNumber current_attnum = InvalidAttrNumber;
+
+ if (state == NULL || state->steps == NULL || state->steps_len == 0)
+ return;
+
+ if (subattrinfo == NULL)
+ return;
+
+ for (int i = state->steps_len - 1; i >= 0; i--)
+ {
+ ExprEvalStep *step = &state->steps[i];
+
+ switch (step->opcode)
+ {
+ /*
+ * EEOP_ASSIGN_TMP variants: expression-computed result being
+ * stored into the target slot. Update current_attnum.
+ */
+ case EEOP_ASSIGN_TMP:
+ case EEOP_ASSIGN_TMP_MAKE_RO:
+ {
+ AttrNumber attnum = step->d.assign_tmp.resultnum + 1;
+ int attidx = attnum - FirstLowInvalidHeapAttributeNumber;
+
+ if (bms_is_member(attidx, subattrinfo->subattr_attrs) &&
+ !bms_is_member(attidx, subattrinfo->simple_indexed_attrs))
+ {
+ current_attnum = attnum;
+ }
+ else
+ {
+ current_attnum = InvalidAttrNumber;
+ }
+ break;
+ }
+
+ /*
+ * EEOP_ASSIGN_*_VAR: simple slot-to-slot copy (non-SET
+ * columns). No expression computation involved.
+ */
+ case EEOP_ASSIGN_SCAN_VAR:
+ case EEOP_ASSIGN_INNER_VAR:
+ case EEOP_ASSIGN_OUTER_VAR:
+ current_attnum = InvalidAttrNumber;
+ break;
+
+ /*
+ * FUNCEXPR variants: potential mutation function.
+ */
+ case EEOP_FUNCEXPR:
+ case EEOP_FUNCEXPR_STRICT:
+ case EEOP_FUNCEXPR_STRICT_1:
+ case EEOP_FUNCEXPR_STRICT_2:
+ case EEOP_FUNCEXPR_FUSAGE:
+ case EEOP_FUNCEXPR_STRICT_FUSAGE:
+ {
+ FunctionCallInfo fcinfo;
+ HeapTuple procTup;
+ bool is_mutator;
+ SubattrTrackingContext *mc;
+
+ if (!AttributeNumberIsValid(current_attnum))
+ break;
+
+ fcinfo = step->d.func.fcinfo_data;
+
+ /* Don't overwrite existing context (SRF, aggregate) */
+ if (fcinfo->context != NULL)
+ break;
+
+ /* Check if this function is a sub-attribute mutator */
+ procTup = SearchSysCache1(PROCOID,
+ ObjectIdGetDatum(fcinfo->flinfo->fn_oid));
+ if (!HeapTupleIsValid(procTup))
+ break;
+
+ is_mutator = ((Form_pg_proc)
+ GETSTRUCT(procTup))->prosubattrmutator;
+ ReleaseSysCache(procTup);
+
+ if (!is_mutator)
+ break;
+
+ /*
+ * Allocate SubattrTrackingContext in the executor's
+ * per-query context. It lives for the entire query
+ * duration — one allocation per function step, not per
+ * row.
+ */
+ mc = makeNode(SubattrTrackingContext);
+ mc->modified_idx_slot = modified_idx_slot;
+ mc->target_attnum = current_attnum;
+ mc->rel = rel;
+
+ fcinfo->context = (Node *) mc;
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * InitModifiedIdxTracking
+ *
+ * Called from ExecInitModifyTable for UPDATE operations.
+ * Sets up ri_InstrumentedIdxAttrs, ri_MixSlot, and injects SubattrTrackingContext
+ * into compiled ExprState steps.
+ */
+static void
+InitModifiedIdxTracking(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ PlanState *subplanstate,
+ List *updateColnos)
+{
+ Relation rel = resultRelInfo->ri_RelationDesc;
+ RelSubattrInfo *subattrinfo;
+ Plan *subplan;
+ TupleTableSlot *modified_idx_slot;
+ ListCell *lc;
+ ListCell *lc2;
+
+ /* Default: no tracking */
+ resultRelInfo->ri_InstrumentedIdxAttrs = NULL;
+ resultRelInfo->ri_MixSlot = NULL;
+
+ /* Bail out early if the feature is disabled */
+ if (!enable_subpath_hot)
+ return;
+
+ /* Bail out early for system catalog tables to avoid syscache lookups */
+ if (IsSystemRelation(rel))
+ return;
+
+ /* Bail out early if no subplan state (shouldn't happen for UPDATE) */
+ if (subplanstate == NULL)
+ return;
+
+ /* Bail out early if no sub-attribute expression indexes */
+ subattrinfo = RelationGetIdxSubattrs(rel);
+ if (subattrinfo == NULL)
+ return;
+
+ subplan = subplanstate->plan;
+ if (subplan == NULL)
+ return; /* Shouldn't happen, but be defensive */
+
+ modified_idx_slot = subplanstate->ps_ResultTupleSlot;
+ if (modified_idx_slot == NULL)
+ return; /* Shouldn't happen, but be defensive */
+
+ resultRelInfo->ri_MixSlot = modified_idx_slot;
+
+ /*
+ * Determine which SET targets are fully instrumented. Iterate over
+ * updateColnos (the columns being SET) and find the corresponding
+ * TargetEntry in the subplan's targetlist. We cannot use forboth()
+ * because the two lists may have different lengths.
+ */
+ if (subplan->targetlist == NULL || updateColnos == NULL)
+ return; /* No targets to track */
+
+ foreach(lc, updateColnos)
+ {
+ AttrNumber attnum = (AttrNumber) lfirst_int(lc);
+ TargetEntry *tle;
+ int attidx;
+
+ /* Find the TargetEntry for this column in the targetlist */
+ tle = NULL;
+ foreach(lc2, subplan->targetlist)
+ {
+ TargetEntry *tmp_tle = (TargetEntry *) lfirst(lc2);
+
+ if (tmp_tle->resjunk)
+ continue;
+
+ /* Check if this TLE corresponds to our target column */
+ if (IsA(tmp_tle->expr, Var))
+ {
+ Var *var = (Var *) tmp_tle->expr;
+
+ if (var->varattno == attnum)
+ {
+ tle = tmp_tle;
+ break;
+ }
+ }
+ else
+ {
+ /*
+ * For non-Var expressions, assume the tle->resno matches
+ * position
+ */
+ /*
+ * This is a simplified check - in reality we'd need more
+ * logic
+ */
+ tle = tmp_tle;
+ break;
+ }
+ }
+
+ if (tle == NULL)
+ continue; /* Column not in targetlist? */
+
+ attidx = attnum - FirstLowInvalidHeapAttributeNumber;
+
+ /* Only check columns with subpath-only indexes */
+ if (!bms_is_member(attidx, subattrinfo->subattr_attrs))
+ continue;
+ if (bms_is_member(attidx, subattrinfo->simple_indexed_attrs))
+ continue;
+
+ /* Simple Var pass-through: column not being SET */
+ if (IsA(tle->expr, Var) &&
+ ((Var *) tle->expr)->varattno == attnum)
+ continue;
+
+ if (HasCompleteModificationTracking((Node *) tle->expr, attnum))
+ {
+ resultRelInfo->ri_InstrumentedIdxAttrs =
+ bms_add_member(resultRelInfo->ri_InstrumentedIdxAttrs, attidx);
+ }
+ }
+
+ /*
+ * Inject SubattrTrackingContext into compiled ExprState steps.
+ *
+ * Walk the subplan's projection ExprState AND ri_projectNew's ExprState.
+ * SET expression evaluation may occur in either one depending on plan
+ * shape. Injection is idempotent (only when fcinfo->context == NULL), so
+ * double-walking is safe.
+ */
+ if (subplanstate->ps_ProjInfo != NULL)
+ {
+ InjectMixContextIntoExprState(
+ &subplanstate->ps_ProjInfo->pi_state,
+ rel, modified_idx_slot, subattrinfo);
+ }
+
+ if (resultRelInfo->ri_projectNew != NULL)
+ {
+ InjectMixContextIntoExprState(
+ &resultRelInfo->ri_projectNew->pi_state,
+ rel, modified_idx_slot, subattrinfo);
+ }
+}
diff --git a/src/backend/nodes/Makefile b/src/backend/nodes/Makefile
index 77ddb9ca53f1e..aec408805fd85 100644
--- a/src/backend/nodes/Makefile
+++ b/src/backend/nodes/Makefile
@@ -61,7 +61,8 @@ node_headers = \
nodes/replnodes.h \
nodes/supportnodes.h \
nodes/value.h \
- utils/rel.h
+ utils/rel.h \
+ executor/execMutation.h
# see also catalog/Makefile for an explanation of these make rules
diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl
index 4308751f787e6..d9690f00ec766 100644
--- a/src/backend/nodes/gen_node_support.pl
+++ b/src/backend/nodes/gen_node_support.pl
@@ -74,6 +74,7 @@ sub elem
nodes/supportnodes.h
nodes/value.h
utils/rel.h
+ executor/execMutation.h
);
# Nodes from these input files are automatically treated as nodetag_only.
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 2caec621d73db..73ee4eb3ada20 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -845,8 +845,6 @@ makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid, List *expressions,
n->ii_Unique = unique;
n->ii_NullsNotDistinct = nulls_not_distinct;
n->ii_ReadyForInserts = isready;
- n->ii_CheckedUnchanged = false;
- n->ii_IndexUnchanged = false;
n->ii_Concurrent = concurrent;
n->ii_Summarizing = summarizing;
n->ii_WithoutOverlaps = withoutoverlaps;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 40990143927e7..9692ac8edad9f 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -18,6 +18,7 @@
#include "access/attnum.h"
#include "common/shortest_dec.h"
+#include "executor/execMutation.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"
#include "nodes/bitmapset.h"
@@ -745,6 +746,8 @@ outNode(StringInfo str, const void *obj)
_outString(str, (const String *) obj);
else if (IsA(obj, BitString))
_outBitString(str, (const BitString *) obj);
+ else if (IsA(obj, SubattrTrackingContext))
+ _outSubattrTrackingContext(str, (const SubattrTrackingContext *) obj);
else if (IsA(obj, Bitmapset))
outBitmapset(str, (const Bitmapset *) obj);
else
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 89ca4e08bf156..dbdc8e2cd7dc0 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -163,6 +163,7 @@ bool enable_parallel_hash = true;
bool enable_partition_pruning = true;
bool enable_presorted_aggregate = true;
bool enable_async_append = true;
+bool enable_subpath_hot = true;
typedef struct
{
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index a8fd680589f72..06a073c294602 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -51,6 +51,7 @@ OBJS = \
json.o \
jsonb.o \
jsonb_gin.o \
+ jsonb_idx.o \
jsonb_op.o \
jsonb_util.o \
jsonfuncs.o \
diff --git a/src/backend/utils/adt/jsonb_idx.c b/src/backend/utils/adt/jsonb_idx.c
new file mode 100644
index 0000000000000..07f694770be09
--- /dev/null
+++ b/src/backend/utils/adt/jsonb_idx.c
@@ -0,0 +1,565 @@
+/*-------------------------------------------------------------------------
+ *
+ * jsonb_idx.c
+ * Support functions for HOT updates with JSONB expression indexes
+ *
+ * This file implements the type-specific index support functions for JSONB:
+ * - jsonb_idx_extract: Extract indexed subpaths from index expressions
+ * - jsonb_idx_compare: Compare old/new JSONB values at indexed subpaths
+ *
+ * Copyright (c) 2014-2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/jsonb_idx.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "fmgr.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/primnodes.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/idxsubattr.h"
+#include "utils/jsonb.h"
+#include "utils/lsyscache.h"
+
+/* OIDs for JSONB operators */
+#define JSONB_OBJECT_FIELD_OID 3211 /* jsonb -> text */
+#define JSONB_OBJECT_FIELD_TEXT_OID 3477 /* jsonb ->> text */
+#define JSONB_ARRAY_ELEMENT_OID 3212 /* jsonb -> int4 */
+#define JSONB_ARRAY_ELEMENT_TEXT_OID 3481 /* jsonb ->> int4 */
+
+/* Operator OIDs for JSONB path operators */
+#define JSONB_EXTRACT_PATH_OP_OID 3213 /* jsonb #> text[] */
+#define JSONB_EXTRACT_PATH_TEXT_OP_OID 3206 /* jsonb #>> text[] */
+
+/* Function OIDs for JSONB path operators */
+#define JSONB_EXTRACT_PATH_FN_OID 3217 /* jsonb_extract_path */
+#define JSONB_EXTRACT_PATH_TEXT_FN_OID 3940 /* jsonb_extract_path_text */
+
+/* Helper function prototypes */
+static List *extract_jsonb_path_from_expr(Node *expr, AttrNumber target_attnum,
+ bool *success);
+static ArrayType *text_list_to_array(List *text_list);
+static List *array_to_text_list(ArrayType *arr);
+static JsonbValue *extract_jsonb_value_by_path(Jsonb *jb, List *path_elements);
+static bool jsonb_values_equal(JsonbValue *v1, JsonbValue *v2);
+
+/*
+ * extract_jsonb_path_from_expr
+ *
+ * Recursively walk an expression tree to extract a JSONB access path.
+ * Returns a List of text values representing the path elements, or NIL if
+ * the expression doesn't match a recognized pattern.  *success reports
+ * whether the pattern was recognized (a bare Var reference is recognized
+ * but yields an empty path).
+ *
+ * Recognized patterns:
+ *   1. Var -> 'key'               => {"key"}
+ *   2. Var -> 'a' -> 'b'          => {"a", "b"}
+ *   3. Var #> ARRAY['a', 'b']     => {"a", "b"}  (OpExpr or FuncExpr form)
+ *   4. (Var -> 'a')::text         => {"a"}       (with cast)
+ *
+ * Note on memory: the returned list elements may point directly into Const
+ * nodes of the caller's expression tree (DatumGetTextPP of an untoasted
+ * value returns the original pointer), so failure paths free only the list
+ * cells, never the pointed-to text.
+ */
+static List *
+extract_jsonb_path_from_expr(Node *expr, AttrNumber target_attnum, bool *success)
+{
+    *success = false;
+
+    if (expr == NULL)
+        return NIL;
+
+    /* Skip past any RelabelType (casts) */
+    while (IsA(expr, RelabelType))
+        expr = (Node *) ((RelabelType *) expr)->arg;
+
+    /* Case 1 & 2: Binary operator (-> or ->>) for single field access */
+    if (IsA(expr, OpExpr))
+    {
+        OpExpr     *opexpr = (OpExpr *) expr;
+        Oid         opno = opexpr->opno;
+        Node       *leftarg;
+        Node       *rightarg;
+
+        if (list_length(opexpr->args) != 2)
+            return NIL;
+
+        leftarg = (Node *) linitial(opexpr->args);
+        rightarg = (Node *) lsecond(opexpr->args);
+
+        /* Single field access: -> or ->> with text or int4 key */
+        if (opno == JSONB_OBJECT_FIELD_OID ||
+            opno == JSONB_OBJECT_FIELD_TEXT_OID ||
+            opno == JSONB_ARRAY_ELEMENT_OID ||
+            opno == JSONB_ARRAY_ELEMENT_TEXT_OID)
+        {
+            List       *prefix_path;
+            Const      *key_const;
+            text       *key_text;
+            bool        prefix_success;
+
+            /* Recursively extract path from left side */
+            prefix_path = extract_jsonb_path_from_expr(leftarg, target_attnum,
+                                                       &prefix_success);
+
+            if (!prefix_success)
+                return NIL;
+
+            /* Right side must be a Const (the key or index) */
+            if (!IsA(rightarg, Const))
+            {
+                /* shallow free: elements may live in the expression tree */
+                list_free(prefix_path);
+                return NIL;
+            }
+
+            key_const = (Const *) rightarg;
+
+            if (key_const->constisnull)
+            {
+                list_free(prefix_path);
+                return NIL;
+            }
+
+            /* Convert the key to text */
+            if (key_const->consttype == TEXTOID)
+            {
+                key_text = DatumGetTextPP(key_const->constvalue);
+            }
+            else if (key_const->consttype == INT4OID)
+            {
+                /* Convert integer array index to text */
+                int32       idx = DatumGetInt32(key_const->constvalue);
+                char        buf[32];
+
+                snprintf(buf, sizeof(buf), "%d", idx);
+                key_text = cstring_to_text(buf);
+            }
+            else
+            {
+                list_free(prefix_path);
+                return NIL;
+            }
+
+            /* Append this key to the path */
+            prefix_path = lappend(prefix_path, key_text);
+            *success = true;
+            return prefix_path;
+        }
+
+        /* Path access: #> or #>> with text[] array */
+        if (opno == JSONB_EXTRACT_PATH_OP_OID ||
+            opno == JSONB_EXTRACT_PATH_TEXT_OP_OID)
+        {
+            Const      *path_const;
+            ArrayType  *path_array;
+            List       *prefix_path;
+            List       *path_list;
+            bool        prefix_success;
+
+            /* Recursively extract path from left side */
+            prefix_path = extract_jsonb_path_from_expr(leftarg, target_attnum,
+                                                       &prefix_success);
+
+            if (!prefix_success)
+                return NIL;
+
+            /* Right side should be a Const array of path elements */
+            if (!IsA(rightarg, Const))
+            {
+                list_free(prefix_path);
+                return NIL;
+            }
+
+            path_const = (Const *) rightarg;
+            if (path_const->constisnull)
+            {
+                list_free(prefix_path);
+                return NIL;
+            }
+
+            /* Extract the text[] array */
+            path_array = DatumGetArrayTypeP(path_const->constvalue);
+            path_list = array_to_text_list(path_array);
+
+            /* Combine prefix path with extracted path elements */
+            prefix_path = list_concat(prefix_path, path_list);
+            *success = true;
+            return prefix_path;
+        }
+
+        /* Unrecognised operator */
+        return NIL;
+    }
+
+    /* Case 3: FuncExpr form of #> / #>> (jsonb_extract_path[_text]) */
+    if (IsA(expr, FuncExpr))
+    {
+        FuncExpr   *funcexpr = (FuncExpr *) expr;
+        Node       *leftarg;
+        Node       *rightarg;
+        Const      *path_const;
+        ArrayType  *path_array;
+        List       *prefix_path;
+        List       *path_list;
+        bool        prefix_success;
+
+        /* Check if this is jsonb_extract_path or jsonb_extract_path_text */
+        if (funcexpr->funcid != JSONB_EXTRACT_PATH_FN_OID &&
+            funcexpr->funcid != JSONB_EXTRACT_PATH_TEXT_FN_OID)
+            return NIL;
+
+        if (list_length(funcexpr->args) != 2)
+            return NIL;
+
+        leftarg = (Node *) linitial(funcexpr->args);
+        rightarg = (Node *) lsecond(funcexpr->args);
+
+        /*
+         * Recurse on the left argument, exactly as in the OpExpr branch,
+         * so nested forms such as (col->'a') #> '{b}' and casts are also
+         * recognized.  The recursion bottoms out at the Var check below.
+         */
+        prefix_path = extract_jsonb_path_from_expr(leftarg, target_attnum,
+                                                   &prefix_success);
+
+        if (!prefix_success)
+            return NIL;
+
+        /* Right side should be a Const array of path elements */
+        if (!IsA(rightarg, Const))
+        {
+            list_free(prefix_path);
+            return NIL;
+        }
+
+        path_const = (Const *) rightarg;
+        if (path_const->constisnull)
+        {
+            list_free(prefix_path);
+            return NIL;
+        }
+
+        /* Extract the text[] array */
+        path_array = DatumGetArrayTypeP(path_const->constvalue);
+        path_list = array_to_text_list(path_array);
+
+        /* Combine prefix path with extracted path elements */
+        prefix_path = list_concat(prefix_path, path_list);
+        *success = true;
+        return prefix_path;
+    }
+
+    /* Base case: Var node - check if it's our target attribute */
+    if (IsA(expr, Var))
+    {
+        Var        *var = (Var *) expr;
+
+        if (var->varattno == target_attnum)
+        {
+            /* This is just a bare column reference with no path */
+            *success = true;
+            return NIL;         /* Empty path = whole column */
+        }
+    }
+
+    return NIL;
+}
+
+/*
+ * text_list_to_array
+ *
+ * Build a one-dimensional text[] array from a List of text values.
+ * Returns NULL for an empty list.
+ */
+static ArrayType *
+text_list_to_array(List *text_list)
+{
+    ListCell   *cell;
+    Datum      *elems;
+    int         nelems = list_length(text_list);
+    int         pos = 0;
+
+    if (nelems == 0)
+        return NULL;
+
+    elems = (Datum *) palloc(sizeof(Datum) * nelems);
+
+    foreach(cell, text_list)
+        elems[pos++] = PointerGetDatum((text *) lfirst(cell));
+
+    return construct_array(elems, nelems, TEXTOID, -1, false, TYPALIGN_INT);
+}
+
+/*
+ * array_to_text_list
+ *
+ * Flatten a text[] array into a List of text values.  NULL array
+ * elements are rejected with an error, since a path step cannot be NULL.
+ */
+static List *
+array_to_text_list(ArrayType *arr)
+{
+    List       *items = NIL;
+    Datum      *values;
+    bool       *isnull;
+    int         count;
+    int         pos;
+
+    deconstruct_array(arr, TEXTOID, -1, false, TYPALIGN_INT,
+                      &values, &isnull, &count);
+
+    for (pos = 0; pos < count; pos++)
+    {
+        if (isnull[pos])
+            ereport(ERROR,
+                    (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+                     errmsg("path element cannot be null")));
+
+        items = lappend(items, DatumGetTextPP(values[pos]));
+    }
+
+    return items;
+}
+
+/*
+ * extract_jsonb_value_by_path
+ *
+ * Navigate through a JSONB value following a path of keys.  Each path
+ * element is interpreted as an object key when the current container is
+ * an object, or as a (non-negative, decimal) array index when it is an
+ * array.  Returns the JsonbValue at the end of the path, or NULL if not
+ * found.  An empty path denotes the whole value.
+ */
+static JsonbValue *
+extract_jsonb_value_by_path(Jsonb *jb, List *path_elements)
+{
+    JsonbContainer *container = &jb->root;
+    JsonbValue *result = NULL;
+    ListCell   *lc;
+
+    if (path_elements == NIL)
+    {
+        /* Empty path means the whole value */
+        result = palloc(sizeof(JsonbValue));
+        if (!JsonbExtractScalar(container, result))
+        {
+            /* Not a scalar, return the whole container as binary */
+            result->type = jbvBinary;
+            result->val.binary.data = container;
+            result->val.binary.len = VARSIZE_ANY_EXHDR(jb);
+        }
+        return result;
+    }
+
+    /* Walk through each path element */
+    foreach(lc, path_elements)
+    {
+        text       *key_text = (text *) lfirst(lc);
+
+        if (JsonContainerIsObject(container))
+        {
+            JsonbValue  key_val;
+
+            /* Set up the key as a JsonbValue and look it up by name */
+            key_val.type = jbvString;
+            key_val.val.string.val = VARDATA_ANY(key_text);
+            key_val.val.string.len = VARSIZE_ANY_EXHDR(key_text);
+
+            result = findJsonbValueFromContainer(container, JB_FOBJECT,
+                                                 &key_val);
+        }
+        else if (JsonContainerIsArray(container))
+        {
+            /*
+             * Array containers are addressed by position, not by value;
+             * findJsonbValueFromContainer with JB_FARRAY would search for
+             * an *element equal to* the key, which is not path semantics.
+             * Parse the path step as a decimal index instead.
+             */
+            char       *cstr = text_to_cstring(key_text);
+            char       *endptr;
+            long        idx;
+
+            errno = 0;
+            idx = strtol(cstr, &endptr, 10);
+            if (errno != 0 || endptr == cstr || *endptr != '\0' || idx < 0)
+            {
+                pfree(cstr);
+                return NULL;    /* Not a valid array subscript */
+            }
+            pfree(cstr);
+
+            /* Returns NULL when idx is past the end of the array */
+            result = getIthJsonbValueFromContainer(container, (uint32) idx);
+        }
+        else
+        {
+            /* Scalar container: cannot descend any further */
+            return NULL;
+        }
+
+        if (result == NULL)
+            return NULL;        /* Key or index not found */
+
+        /* If more path steps remain, the current value must be a container */
+        if (lnext(path_elements, lc) != NULL)
+        {
+            if (result->type != jbvBinary)
+                return NULL;
+            container = result->val.binary.data;
+        }
+    }
+
+    return result;
+}
+
+/*
+ * jsonb_values_equal
+ *
+ * Test two JsonbValue structures for equality.  NULL pointers are
+ * permitted; two NULLs compare equal, a NULL never equals a non-NULL.
+ */
+static bool
+jsonb_values_equal(JsonbValue *a, JsonbValue *b)
+{
+    /* Handle missing values first: equal only if both are missing */
+    if (a == NULL || b == NULL)
+        return a == b;
+
+    /* Differently-typed values can never match */
+    if (a->type != b->type)
+        return false;
+
+    switch (a->type)
+    {
+        case jbvNull:
+            return true;
+
+        case jbvBool:
+            return a->val.boolean == b->val.boolean;
+
+        case jbvString:
+            return a->val.string.len == b->val.string.len &&
+                memcmp(a->val.string.val, b->val.string.val,
+                       a->val.string.len) == 0;
+
+        case jbvNumeric:
+            return DatumGetBool(DirectFunctionCall2(numeric_eq,
+                                                    PointerGetDatum(a->val.numeric),
+                                                    PointerGetDatum(b->val.numeric)));
+
+        case jbvBinary:
+            {
+                /* Use full JSONB comparison for object/array values */
+                Jsonb      *ja = JsonbValueToJsonb(a);
+                Jsonb      *jb = JsonbValueToJsonb(b);
+
+                return DatumGetBool(DirectFunctionCall2(jsonb_eq,
+                                                        JsonbPGetDatum(ja),
+                                                        JsonbPGetDatum(jb)));
+            }
+
+        default:
+            elog(ERROR, "unknown jsonb value type %d", a->type);
+            return false;
+    }
+}
+
+/*
+ * jsonb_idx_extract
+ *
+ * Extract the indexed subpath from a JSONB index expression.  Called at
+ * CREATE INDEX time to identify which part of a JSONB column the index
+ * actually covers.
+ *
+ * Arguments:
+ *   arg[0]: internal - Node *expr (the index expression tree)
+ *   arg[1]: int2     - AttrNumber (which column in the relation)
+ *
+ * Returns:
+ *   internal - ArrayType* (text[]) of path elements, or NULL when the
+ *   expression pattern is not recognized or is a bare column reference.
+ *
+ * Examples:
+ *   CREATE INDEX idx ON t((data->'status'))          => {"status"}
+ *   CREATE INDEX idx ON t((data->'user'->'name'))    => {"user", "name"}
+ *   CREATE INDEX idx ON t((data #> ARRAY['a', 'b'])) => {"a", "b"}
+ */
+Datum
+jsonb_idx_extract(PG_FUNCTION_ARGS)
+{
+    Node       *indexexpr = (Node *) PG_GETARG_POINTER(0);
+    AttrNumber  attnum = PG_GETARG_INT16(1);
+    bool        recognized;
+    List       *path;
+    ArrayType  *result;
+
+    /* Try to match the expression against the recognized access patterns */
+    path = extract_jsonb_path_from_expr(indexexpr, attnum, &recognized);
+
+    /* Unrecognized shape, or a bare column reference (empty path): no descriptor */
+    if (!recognized || path == NIL)
+        PG_RETURN_POINTER(NULL);
+
+    /* Materialize the path as a text[] descriptor */
+    result = text_list_to_array(path);
+
+    /* The array holds copies; the list cells are no longer needed */
+    list_free(path);
+
+    PG_RETURN_POINTER(result);
+}
+
+/*
+ * jsonb_idx_compare
+ *
+ * Compare old and new JSONB values at specific indexed subpaths.
+ * This function is called during UPDATE operations to determine if
+ * any indexed subpath has changed.
+ *
+ * Arguments:
+ *   arg[0]: jsonb    - old value
+ *   arg[1]: jsonb    - new value
+ *   arg[2]: internal - IdxSubattrDesc* array (indexed subpath descriptors)
+ *   arg[3]: int4     - number of descriptors
+ *
+ * Returns:
+ *   bool - true if any indexed subpath has changed, false otherwise
+ *
+ * This function extracts the value at each indexed subpath from both
+ * the old and new JSONB values and compares them.  If any differ,
+ * the index needs to be updated.
+ */
+Datum
+jsonb_idx_compare(PG_FUNCTION_ARGS)
+{
+    Jsonb      *old_jb = PG_GETARG_JSONB_P(0);
+    Jsonb      *new_jb = PG_GETARG_JSONB_P(1);
+    IdxSubattrDesc *descriptors = (IdxSubattrDesc *) PG_GETARG_POINTER(2);
+    int         ndescriptors = PG_GETARG_INT32(3);
+    int         i;
+
+    /* Compare each indexed subpath */
+    for (i = 0; i < ndescriptors; i++)
+    {
+        IdxSubattrDesc *desc = &descriptors[i];
+        List       *path_elements;
+        JsonbValue *old_val;
+        JsonbValue *new_val;
+        bool        changed;
+
+        /* A NULL descriptor means the whole column is indexed */
+        if (DatumGetPointer(desc->descriptor) == NULL)
+            path_elements = NIL;
+        else
+            path_elements =
+                array_to_text_list(DatumGetArrayTypeP(desc->descriptor));
+
+        /* Extract values at this path from both old and new */
+        old_val = extract_jsonb_value_by_path(old_jb, path_elements);
+        new_val = extract_jsonb_value_by_path(new_jb, path_elements);
+
+        changed = !jsonb_values_equal(old_val, new_val);
+
+        /*
+         * Free the per-descriptor list cells before deciding; otherwise a
+         * relation with many descriptors accumulates garbage on every call.
+         * (Shallow free: the text elements belong to the descriptor array.)
+         */
+        list_free(path_elements);
+
+        if (changed)
+            PG_RETURN_BOOL(true);   /* This indexed subpath changed */
+    }
+
+    /* No indexed subpaths changed */
+    PG_RETURN_BOOL(false);
+}
diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c
index d5b64d7fca568..8f7bb08847cec 100644
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -21,6 +21,7 @@
#include "common/int.h"
#include "common/jsonapi.h"
#include "common/string.h"
+#include "executor/execMutation.h"
#include "fmgr.h"
#include "funcapi.h"
#include "lib/stringinfo.h"
@@ -32,6 +33,7 @@
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/hsearch.h"
+#include "utils/idxsubattr.h"
#include "utils/json.h"
#include "utils/jsonb.h"
#include "utils/jsonfuncs.h"
@@ -4647,6 +4649,138 @@ jsonb_concat(PG_FUNCTION_ARGS)
PG_RETURN_JSONB_P(JsonbValueToJsonb(state.result));
}
+/*
+ * ========================================================================
+ * Helper functions for JSONB mutation tracking (HOT updates)
+ * ========================================================================
+ */
+
+/*
+ * array_to_jsonb_path_list
+ *
+ * Convert a text[] array to a List of text values representing a JSONB
+ * path.  A NULL array yields NIL; NULL elements are silently skipped
+ * (rather than raising an error) so a partially-NULL path degrades to
+ * its non-NULL steps.
+ */
+static List *
+array_to_jsonb_path_list(ArrayType *path_array)
+{
+    List       *steps = NIL;
+    Datum      *values;
+    bool       *isnull;
+    int         count;
+    int         pos;
+
+    if (path_array == NULL)
+        return NIL;
+
+    deconstruct_array_builtin(path_array, TEXTOID, &values, &isnull, &count);
+
+    for (pos = 0; pos < count; pos++)
+    {
+        if (isnull[pos])
+            continue;           /* Skip NULL elements */
+
+        steps = lappend(steps, DatumGetTextPP(values[pos]));
+    }
+
+    return steps;
+}
+
+/*
+ * jsonb_paths_intersect
+ *
+ * Check if two JSONB paths intersect (one is a prefix of the other).
+ * Returns true if modifying path1 could affect an index on path2.
+ *
+ * Examples:
+ *   path1={a,b}, path2={a}     => true  (path2 is parent)
+ *   path1={a,b}, path2={a,b,c} => true  (path1 is parent)
+ *   path1={a,b}, path2={a,b}   => true  (exact match)
+ *   path1={a,b}, path2={c}     => false (disjoint)
+ */
+static bool
+jsonb_paths_intersect(List *path1, List *path2)
+{
+    ListCell   *lc1,
+               *lc2;
+
+    /* Empty paths don't match */
+    if (path1 == NIL || path2 == NIL)
+        return false;
+
+    /*
+     * Compare step by step; forboth stops at the end of the shorter list,
+     * which is exactly the prefix length we need to examine.
+     */
+    forboth(lc1, path1, lc2, path2)
+    {
+        text       *key1 = (text *) lfirst(lc1);
+        text       *key2 = (text *) lfirst(lc2);
+        int         keylen1 = VARSIZE_ANY_EXHDR(key1);
+        int         keylen2 = VARSIZE_ANY_EXHDR(key2);
+
+        /* Compare the text values */
+        if (keylen1 != keylen2 ||
+            memcmp(VARDATA_ANY(key1), VARDATA_ANY(key2), keylen1) != 0)
+            return false;       /* Keys differ, paths diverge */
+    }
+
+    /* If we got here, one path is a prefix of the other */
+    return true;
+}
+
+/*
+ * jsonb_path_intersects_indexed
+ *
+ * Check if a mutation path intersects with any indexed subpath for this
+ * attribute.  Returns true if the mutation affects an indexed subpath.
+ */
+static bool
+jsonb_path_intersects_indexed(List *mutation_path, AttrSubattrInfo *attrinfo)
+{
+    int         n;
+
+    /* Nothing to check without a mutation path or subpath info */
+    if (mutation_path == NIL || attrinfo == NULL)
+        return false;
+
+    /* Test the mutation against each indexed subpath descriptor */
+    for (n = 0; n < attrinfo->ndescriptors; n++)
+    {
+        IdxSubattrDesc *desc = &attrinfo->descriptors[n];
+        List       *indexed_path;
+        bool        hit;
+
+        /* Skip NULL descriptors */
+        if (DatumGetPointer(desc->descriptor) == NULL)
+            continue;
+
+        indexed_path =
+            array_to_jsonb_path_list(DatumGetArrayTypeP(desc->descriptor));
+
+        hit = jsonb_paths_intersect(mutation_path, indexed_path);
+
+        list_free(indexed_path);
+
+        if (hit)
+            return true;
+    }
+
+    return false;
+}
+
+/*
+ * ========================================================================
+ * End of mutation tracking helpers
+ * ========================================================================
+ */
+
/*
* SQL function jsonb_delete (jsonb, text)
@@ -4667,6 +4801,33 @@ jsonb_delete(PG_FUNCTION_ARGS)
bool skipNested = false;
JsonbIteratorToken r;
+ /*
+ * Mutation tracking for HOT updates: check if this deletion affects an
+ * indexed subpath. jsonb_delete deletes a single top-level key.
+ */
+ if (fcinfo->context != NULL && IsA(fcinfo->context, SubattrTrackingContext))
+ {
+ SubattrTrackingContext *subattr_ctx = (SubattrTrackingContext *) fcinfo->context;
+ List *mutation_path;
+ AttrSubattrInfo *attrinfo;
+
+ /* Create a single-element path with the deleted key */
+ mutation_path = list_make1(key);
+
+ /* Get indexed subpaths for this column */
+ attrinfo = RelationGetAttrSubattrInfo(subattr_ctx->rel, subattr_ctx->target_attnum);
+
+ if (attrinfo != NULL &&
+ jsonb_path_intersects_indexed(mutation_path, attrinfo))
+ {
+ /* This mutation affects an indexed subpath */
+ slot_add_modified_idx_attr(subattr_ctx->modified_idx_slot, subattr_ctx->target_attnum);
+ }
+
+ /* Clean up */
+ list_free(mutation_path);
+ }
+
if (JB_ROOT_IS_SCALAR(in))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -4863,6 +5024,37 @@ jsonb_set(PG_FUNCTION_ARGS)
JsonbIterator *it;
JsonbInState st = {0};
+ /*
+ * Mutation tracking for HOT updates: check if this modification affects
+ * an indexed subpath.
+ */
+ if (fcinfo->context != NULL && IsA(fcinfo->context, SubattrTrackingContext))
+ {
+ SubattrTrackingContext *subattr_ctx = (SubattrTrackingContext *) fcinfo->context;
+ List *mutation_path;
+ AttrSubattrInfo *attrinfo;
+ bool intersects;
+
+ /* Extract the path being modified from the function arguments */
+ mutation_path = array_to_jsonb_path_list(path);
+
+ /* Get indexed subpaths for this column */
+ attrinfo = RelationGetAttrSubattrInfo(subattr_ctx->rel, subattr_ctx->target_attnum);
+
+ intersects = (attrinfo != NULL &&
+ jsonb_path_intersects_indexed(mutation_path, attrinfo));
+
+ if (intersects)
+ {
+ /* This mutation affects an indexed subpath */
+ slot_add_modified_idx_attr(subattr_ctx->modified_idx_slot, subattr_ctx->target_attnum);
+ }
+
+ /* Clean up */
+ if (mutation_path != NIL)
+ list_free(mutation_path);
+ }
+
JsonbToJsonbValue(newjsonb, &newval);
if (ARR_NDIM(path) > 1)
@@ -4901,6 +5093,38 @@ jsonb_set_lax(PG_FUNCTION_ARGS)
text *handle_null;
char *handle_val;
+ /*
+ * Mutation tracking for HOT updates: check if this modification affects
+ * an indexed subpath. Note: jsonb_set_lax delegates to jsonb_set or
+ * jsonb_delete_path, which are also instrumented, but we track here too
+ * in case the delegation path changes.
+ */
+ if (fcinfo->context != NULL && IsA(fcinfo->context, SubattrTrackingContext) &&
+ !PG_ARGISNULL(1))
+ {
+ SubattrTrackingContext *subattr_ctx = (SubattrTrackingContext *) fcinfo->context;
+ ArrayType *path = PG_GETARG_ARRAYTYPE_P(1);
+ List *mutation_path;
+ AttrSubattrInfo *attrinfo;
+
+ /* Extract the path being modified */
+ mutation_path = array_to_jsonb_path_list(path);
+
+ /* Get indexed subpaths for this column */
+ attrinfo = RelationGetAttrSubattrInfo(subattr_ctx->rel, subattr_ctx->target_attnum);
+
+ if (attrinfo != NULL &&
+ jsonb_path_intersects_indexed(mutation_path, attrinfo))
+ {
+ /* This mutation affects an indexed subpath */
+ slot_add_modified_idx_attr(subattr_ctx->modified_idx_slot, subattr_ctx->target_attnum);
+ }
+
+ /* Clean up */
+ if (mutation_path != NIL)
+ list_free(mutation_path);
+ }
+
if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(3))
PG_RETURN_NULL();
@@ -4969,6 +5193,34 @@ jsonb_delete_path(PG_FUNCTION_ARGS)
JsonbIterator *it;
JsonbInState st = {0};
+ /*
+ * Mutation tracking for HOT updates: check if this deletion affects an
+ * indexed subpath.
+ */
+ if (fcinfo->context != NULL && IsA(fcinfo->context, SubattrTrackingContext))
+ {
+ SubattrTrackingContext *subattr_ctx = (SubattrTrackingContext *) fcinfo->context;
+ List *mutation_path;
+ AttrSubattrInfo *attrinfo;
+
+ /* Extract the path being deleted from the function arguments */
+ mutation_path = array_to_jsonb_path_list(path);
+
+ /* Get indexed subpaths for this column */
+ attrinfo = RelationGetAttrSubattrInfo(subattr_ctx->rel, subattr_ctx->target_attnum);
+
+ if (attrinfo != NULL &&
+ jsonb_path_intersects_indexed(mutation_path, attrinfo))
+ {
+ /* This mutation affects an indexed subpath */
+ slot_add_modified_idx_attr(subattr_ctx->modified_idx_slot, subattr_ctx->target_attnum);
+ }
+
+ /* Clean up */
+ if (mutation_path != NIL)
+ list_free(mutation_path);
+ }
+
if (ARR_NDIM(path) > 1)
ereport(ERROR,
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
@@ -5012,6 +5264,34 @@ jsonb_insert(PG_FUNCTION_ARGS)
JsonbIterator *it;
JsonbInState st = {0};
+ /*
+ * Mutation tracking for HOT updates: check if this insertion affects an
+ * indexed subpath.
+ */
+ if (fcinfo->context != NULL && IsA(fcinfo->context, SubattrTrackingContext))
+ {
+ SubattrTrackingContext *subattr_ctx = (SubattrTrackingContext *) fcinfo->context;
+ List *mutation_path;
+ AttrSubattrInfo *attrinfo;
+
+ /* Extract the path being inserted at from the function arguments */
+ mutation_path = array_to_jsonb_path_list(path);
+
+ /* Get indexed subpaths for this column */
+ attrinfo = RelationGetAttrSubattrInfo(subattr_ctx->rel, subattr_ctx->target_attnum);
+
+ if (attrinfo != NULL &&
+ jsonb_path_intersects_indexed(mutation_path, attrinfo))
+ {
+ /* This mutation affects an indexed subpath */
+ slot_add_modified_idx_attr(subattr_ctx->modified_idx_slot, subattr_ctx->target_attnum);
+ }
+
+ /* Clean up */
+ if (mutation_path != NIL)
+ list_free(mutation_path);
+ }
+
JsonbToJsonbValue(newjsonb, &newval);
if (ARR_NDIM(path) > 1)
diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build
index fb8294d7e4a3e..1493e4905ca32 100644
--- a/src/backend/utils/adt/meson.build
+++ b/src/backend/utils/adt/meson.build
@@ -50,6 +50,7 @@ backend_sources += files(
'json.c',
'jsonb.c',
'jsonb_gin.c',
+ 'jsonb_idx.c',
'jsonb_op.c',
'jsonb_util.c',
'jsonbsubs.c',
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 79f6cf7b4fa76..758ac9a75d40f 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -98,6 +98,7 @@
#include "utils/builtins.h"
#include "utils/date.h"
#include "utils/datetime.h"
+#include "utils/idxsubattr.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
@@ -5161,3 +5162,159 @@ XmlTableDestroyOpaque(TableFuncScanState *state)
NO_XML_SUPPORT();
#endif /* not USE_LIBXML */
}
+
+/*
+ * xml_idx_extract - Extract indexed subpath from XML expression
+ *
+ * Recognizes xpath()/xpath_exists()/xmlexists() function calls over the
+ * target column and extracts the constant XPath expression as a
+ * descriptor for subpath tracking.
+ *
+ * Signature: xml_idx_extract(expr Node, attnum int2) returns text
+ *
+ * expr:   The index expression tree (e.g., xpath('/path', xml_col))
+ * attnum: The base table column number
+ *
+ * Returns: The XPath expression as text, or NULL if the expression is
+ * not a recognized call over the target column.
+ */
+Datum
+xml_idx_extract(PG_FUNCTION_ARGS)
+{
+#ifdef USE_LIBXML
+    Node       *expr = (Node *) PG_GETARG_POINTER(0);
+    AttrNumber  attnum = PG_GETARG_INT16(1);
+    FuncExpr   *funcexpr;
+    Const      *xpath_const;
+    Node       *first_arg;
+    Node       *second_arg;
+    Var        *var;
+
+    if (expr == NULL || !IsA(expr, FuncExpr))
+        PG_RETURN_NULL();
+
+    funcexpr = (FuncExpr *) expr;
+
+    /*
+     * Check if this is xpath() or xpath_exists() function. OID 3050 =
+     * xpath(text, xml, text[]) OID 3051 = xpath_exists(text, xml, text[]) OID
+     * 4146 = xpath(text, xml) OID 3053 = xmlexists(text, xml)
+     */
+    if (funcexpr->funcid != 3050 && funcexpr->funcid != 3051 &&
+        funcexpr->funcid != 4146 && funcexpr->funcid != 3053)
+        PG_RETURN_NULL();
+
+    /*
+     * The first argument should be a Const containing the XPath expression.
+     * The second argument should be a Var referencing our target column.
+     */
+    if (list_length(funcexpr->args) < 2)
+        PG_RETURN_NULL();
+
+    first_arg = (Node *) linitial(funcexpr->args);
+    second_arg = (Node *) lsecond(funcexpr->args);
+
+    if (!IsA(first_arg, Const))
+        PG_RETURN_NULL();
+
+    if (!IsA(second_arg, Var))
+        PG_RETURN_NULL();
+
+    var = (Var *) second_arg;
+
+    if (var->varattno != attnum)
+        PG_RETURN_NULL();
+
+    xpath_const = (Const *) first_arg;
+
+    if (xpath_const->constisnull)
+        PG_RETURN_NULL();
+
+    /*
+     * Return a copy of the XPath constant.  The Const's value belongs to
+     * the index expression tree; hand back detached storage so the caller
+     * may cache or free the descriptor independently of that tree.
+     */
+    PG_RETURN_TEXT_P(DatumGetTextPCopy(xpath_const->constvalue));
+#else
+    PG_RETURN_NULL();
+#endif
+}
+
+/*
+ * xml_idx_compare - Compare XML values at indexed subpaths
+ *
+ * Evaluates XPath expressions on old and new XML values and compares
+ * the results to determine if any indexed subpath changed.
+ *
+ * Signature: xml_idx_compare(old_val xml, new_val xml,
+ *                            descriptors internal, ndescriptors int4)
+ *            returns bool
+ *
+ * Returns true if any indexed XPath result differs between old and new.
+ */
+Datum
+xml_idx_compare(PG_FUNCTION_ARGS)
+{
+#ifdef USE_LIBXML
+    xmltype    *old_xml = PG_GETARG_XML_P(0);
+    xmltype    *new_xml = PG_GETARG_XML_P(1);
+    IdxSubattrDesc *descriptors = (IdxSubattrDesc *) PG_GETARG_POINTER(2);
+    int32       ndescriptors = PG_GETARG_INT32(3);
+    int         i;
+
+    /*
+     * For each descriptor (XPath expression), evaluate it on both old and new
+     * XML values and compare the results.
+     */
+    for (i = 0; i < ndescriptors; i++)
+    {
+        text       *xpath_expr;
+        int         old_nitems,
+                    new_nitems;
+        ArrayBuildState *old_astate,
+                   *new_astate;
+        ArrayType  *old_arr;
+        ArrayType  *new_arr;
+        bool        changed;
+
+        xpath_expr = DatumGetTextPP(descriptors[i].descriptor);
+
+        /*
+         * Evaluate XPath on old value. We use xpath_internal() which is the
+         * same function used by the xpath() SQL function.
+         */
+        old_astate = initArrayResult(XMLOID, CurrentMemoryContext, true);
+        xpath_internal(xpath_expr, old_xml, NULL, &old_nitems, old_astate);
+
+        /* Evaluate XPath on new value */
+        new_astate = initArrayResult(XMLOID, CurrentMemoryContext, true);
+        xpath_internal(xpath_expr, new_xml, NULL, &new_nitems, new_astate);
+
+        /* If the number of results differs, this XPath result changed */
+        if (old_nitems != new_nitems)
+            PG_RETURN_BOOL(true);
+
+        /*
+         * Compare the materialized arrays byte-for-byte.  We cannot use
+         * DirectFunctionCall2(array_eq, ...) here: array_eq caches element
+         * type info in fcinfo->flinfo->fn_extra, and DirectFunctionCall2
+         * supplies a NULL flinfo, which would crash.  Both arrays were built
+         * the same way from XMLOID elements, so equal content implies equal
+         * bytes; any serialization difference only errs toward reporting a
+         * change, which is the safe direction for HOT-update suppression.
+         */
+        old_arr = DatumGetArrayTypeP(makeArrayResult(old_astate,
+                                                     CurrentMemoryContext));
+        new_arr = DatumGetArrayTypeP(makeArrayResult(new_astate,
+                                                     CurrentMemoryContext));
+
+        changed = (VARSIZE(old_arr) != VARSIZE(new_arr) ||
+                   memcmp(old_arr, new_arr, VARSIZE(old_arr)) != 0);
+
+        /* Release the per-descriptor arrays before moving on */
+        pfree(old_arr);
+        pfree(new_arr);
+
+        if (changed)
+            PG_RETURN_BOOL(true);   /* Indexed subpath changed */
+    }
+
+    /* No indexed XPath results changed */
+    PG_RETURN_BOOL(false);
+#else
+    /*
+     * Without libxml, conservatively assume changed to be safe. This path
+     * shouldn't be reached since xml_idx_extract returns NULL without libxml.
+     */
+    PG_RETURN_BOOL(true);
+#endif
+}
diff --git a/src/backend/utils/cache/Makefile b/src/backend/utils/cache/Makefile
index 77b3e1a037b9b..92a013660b0eb 100644
--- a/src/backend/utils/cache/Makefile
+++ b/src/backend/utils/cache/Makefile
@@ -17,6 +17,7 @@ OBJS = \
catcache.o \
evtcache.o \
funccache.o \
+ idxsubattr.o \
inval.o \
lsyscache.o \
partcache.o \
diff --git a/src/backend/utils/cache/idxsubattr.c b/src/backend/utils/cache/idxsubattr.c
new file mode 100644
index 0000000000000..849b98461211d
--- /dev/null
+++ b/src/backend/utils/cache/idxsubattr.c
@@ -0,0 +1,468 @@
+/*-------------------------------------------------------------------------
+ *
+ * idxsubattr.c
+ * Build and manage the per-relation indexed-subpath cache
+ * (RelationData.rd_idxsubattrs).
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/cache/idxsubattr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "catalog/pg_type.h"
+#include "fmgr.h"
+#include "optimizer/optimizer.h" /* pull_var_clause */
+#include "utils/datum.h"
+#include "utils/catcache.h"
+#include "utils/idxsubattr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+/*
+ * Temporary accumulator used only during RelationBuildIdxSubattrs.
+ */
+typedef struct SubpathAccumEntry
+{
+ AttrNumber attnum;
+ Oid typoid;
+ Oid comparefn_oid;
+ List *descs; /* List of IdxSubattrDesc (palloc'd) */
+} SubpathAccumEntry;
+
+/* Forward declarations */
+static SubpathAccumEntry *FindOrCreateAccumEntry(List **accum,
+ AttrNumber attnum,
+ Oid typoid,
+ Oid comparefn_oid);
+static RelSubattrInfo *FinalizeAccum(List *accum,
+ Bitmapset *simple_indexed_attrs);
+
+
+/*
+ * RelationBuildIdxSubattrs
+ *
+ * Scan all indexes on 'rel', and for each expression-index column whose
+ * base-table attribute has a type with typidxextract, call that function
+ * to extract a subpath descriptor. Accumulate descriptors per attribute
+ * and store the result in rel->rd_idxsubattrs.
+ *
+ * Results live in CacheMemoryContext and persist until relcache
+ * invalidation.
+ */
+static void
+RelationBuildIdxSubattrs(Relation rel)
+{
+ List *indexoidlist;
+ ListCell *lc;
+ List *accum = NIL; /* List of SubpathAccumEntry */
+ Bitmapset *simple_indexed_attrs = NULL;
+ MemoryContext buildcxt;
+ MemoryContext oldcxt;
+
+ Assert(!rel->rd_idxsubattrsvalid);
+
+ indexoidlist = RelationGetIndexList(rel);
+ if (indexoidlist == NIL)
+ {
+ rel->rd_idxsubattrs = NULL;
+ rel->rd_idxsubattrsvalid = true;
+ return;
+ }
+
+ /*
+ * Use a temporary context for intermediate allocations (expression trees,
+ * Var lists, etc.). Final results are copied to CacheMemoryContext by
+ * FinalizeAccum().
+ */
+ buildcxt = AllocSetContextCreate(CurrentMemoryContext,
+ "IdxSubpath build",
+ ALLOCSET_SMALL_SIZES);
+ oldcxt = MemoryContextSwitchTo(buildcxt);
+
+ foreach(lc, indexoidlist)
+ {
+ Oid indexoid = lfirst_oid(lc);
+ Relation idxrel;
+ Form_pg_index idxform;
+ List *indexprs;
+ int exprno;
+
+ idxrel = index_open(indexoid, AccessShareLock);
+ idxform = idxrel->rd_index;
+
+ /*
+ * RelationGetIndexExpressions returns a deep copy of the expression
+ * list, allocated in the current memory context.
+ */
+ indexprs = RelationGetIndexExpressions(idxrel);
+
+ /*
+ * Walk index columns. For each expression column (indkey = 0),
+ * consume the next expression from indexprs.
+ */
+ exprno = 0;
+ for (int col = 0; col < idxform->indnatts; col++)
+ {
+ AttrNumber indkey = idxform->indkey.values[col];
+ Node *expr;
+ List *vars;
+ ListCell *vc;
+
+ /* Simple column reference — record in simple_indexed_attrs */
+ if (indkey != 0)
+ {
+ int attidx = indkey - FirstLowInvalidHeapAttributeNumber;
+
+ simple_indexed_attrs = bms_add_member(simple_indexed_attrs, attidx);
+ continue;
+ }
+
+ if (exprno >= list_length(indexprs))
+ break; /* shouldn't happen, but be safe */
+
+ expr = (Node *) list_nth(indexprs, exprno);
+ exprno++;
+
+ /*
+ * Extract all Var references from the expression. Each Var
+ * references a base-table column.
+ */
+ vars = pull_var_clause(expr, 0);
+
+ foreach(vc, vars)
+ {
+ Var *var = (Var *) lfirst(vc);
+ HeapTuple typeTup;
+ Form_pg_type typeForm;
+ Oid extractfn_oid;
+ Oid comparefn_oid;
+ Datum descriptor;
+ SubpathAccumEntry *entry;
+ IdxSubattrDesc *desc;
+
+ if (!IsA(var, Var))
+ continue;
+
+ /*
+ * In index expressions, varno is always 1 (the indexed table)
+ * and varattno is the base-table column number.
+ */
+ if (var->varno != 1 || var->varattno <= 0)
+ continue;
+
+ /* Look up the type's subpath functions */
+ typeTup = SearchSysCache1(TYPEOID,
+ ObjectIdGetDatum(var->vartype));
+ if (!HeapTupleIsValid(typeTup))
+ continue;
+
+ typeForm = (Form_pg_type) GETSTRUCT(typeTup);
+ extractfn_oid = typeForm->typidxextract;
+ comparefn_oid = typeForm->typidxcompare;
+ ReleaseSysCache(typeTup);
+
+ /* Type doesn't support subpath extraction */
+ if (!OidIsValid(extractfn_oid))
+ continue;
+
+ /*
+ * Call typidxextract(expr, varattno).
+ *
+ * The function inspects the expression tree, recognizes
+ * access patterns for its type (e.g., -> and ->> for JSONB,
+ * xpath() for XML), and returns an opaque subpath descriptor.
+ * Returns NULL if the expression cannot be decomposed into a
+ * subpath access.
+ */
+ descriptor = OidFunctionCall2(extractfn_oid,
+ PointerGetDatum(expr),
+ Int16GetDatum(var->varattno));
+
+ /* Can't decompose, whole-column dependency */
+ if (descriptor == (Datum) 0)
+ continue;
+
+ /*
+ * Accumulate the descriptor for this attribute.
+ */
+ entry = FindOrCreateAccumEntry(&accum,
+ var->varattno,
+ var->vartype,
+ comparefn_oid);
+
+ desc = (IdxSubattrDesc *) palloc(sizeof(IdxSubattrDesc));
+ desc->descriptor = descriptor; /* in buildcxt for now */
+ desc->indexoid = indexoid;
+ desc->indexcol = col;
+
+ entry->descs = lappend(entry->descs, desc);
+ }
+
+ list_free(vars);
+ }
+
+ index_close(idxrel, AccessShareLock);
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+
+ /*
+ * Convert accumulator to the final RelSubattrInfo in CacheMemoryContext.
+ * This deep-copies descriptors out of buildcxt.
+ */
+ rel->rd_idxsubattrs = FinalizeAccum(accum, simple_indexed_attrs);
+ rel->rd_idxsubattrsvalid = true;
+
+ MemoryContextDelete(buildcxt);
+ list_free(indexoidlist);
+}
+
+
+/*
+ * FindOrCreateAccumEntry
+ *
+ * Find the accumulator entry for 'attnum', or create a new one.
+ * 'accum' is a List of SubpathAccumEntry pointers (modified in place).
+ */
+static SubpathAccumEntry *
+FindOrCreateAccumEntry(List **accum, AttrNumber attnum,
+ Oid typoid, Oid comparefn_oid)
+{
+ ListCell *lc;
+ SubpathAccumEntry *entry;
+
+ foreach(lc, *accum)
+ {
+ entry = (SubpathAccumEntry *) lfirst(lc);
+ if (entry->attnum == attnum)
+ return entry;
+ }
+
+ entry = (SubpathAccumEntry *) palloc0(sizeof(SubpathAccumEntry));
+ entry->attnum = attnum;
+ entry->typoid = typoid;
+ entry->comparefn_oid = comparefn_oid;
+ entry->descs = NIL;
+
+ *accum = lappend(*accum, entry);
+ return entry;
+}
+
+
+/*
+ * FinalizeAccum
+ *
+ * Convert the List-of-Lists accumulator into a compact RelSubattrInfo
+ * structure in CacheMemoryContext. Deep-copies all descriptor Datums.
+ *
+ * Returns NULL if the accumulator is empty (no subpath indexes found).
+ */
+static RelSubattrInfo *
+FinalizeAccum(List *accum, Bitmapset *simple_indexed_attrs)
+{
+ RelSubattrInfo *result;
+ MemoryContext oldcxt;
+ int nattrs;
+ int i = 0;
+ ListCell *lc;
+
+ nattrs = list_length(accum);
+ if (nattrs == 0)
+ return NULL;
+
+ oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+
+ result = (RelSubattrInfo *) palloc0(sizeof(RelSubattrInfo));
+ result->nattrs = nattrs;
+ result->attrs = (AttrSubattrInfo *) palloc0(sizeof(AttrSubattrInfo) * nattrs);
+ result->subattr_attrs = NULL;
+ result->simple_indexed_attrs = bms_copy(simple_indexed_attrs);
+
+ foreach(lc, accum)
+ {
+ SubpathAccumEntry *entry = (SubpathAccumEntry *) lfirst(lc);
+ AttrSubattrInfo *attr = &result->attrs[i];
+ int ndesc = list_length(entry->descs);
+ int j;
+ ListCell *dc;
+ int attidx;
+
+ attr->attnum = entry->attnum;
+ attr->typoid = entry->typoid;
+ attr->ndescriptors = ndesc;
+ attr->descriptors = (IdxSubattrDesc *)
+ palloc(sizeof(IdxSubattrDesc) * ndesc);
+
+ /* Cache the compare function for runtime use */
+ if (OidIsValid(entry->comparefn_oid))
+ {
+ fmgr_info_cxt(entry->comparefn_oid,
+ &attr->comparefn,
+ CacheMemoryContext);
+ attr->has_comparefn = true;
+ }
+ else
+ {
+ attr->has_comparefn = false;
+ }
+
+ /* Deep-copy each descriptor into CacheMemoryContext */
+ j = 0;
+ foreach(dc, entry->descs)
+ {
+ IdxSubattrDesc *src = (IdxSubattrDesc *) lfirst(dc);
+ IdxSubattrDesc *dst = &attr->descriptors[j];
+
+ /*
+ * Descriptors are varlena by convention. datumCopy with
+ * typByVal=false, typLen=-1 handles detoasted varlena.
+ */
+ dst->descriptor = datumCopy(src->descriptor, false, -1);
+ dst->indexoid = src->indexoid;
+ dst->indexcol = src->indexcol;
+ j++;
+ }
+
+ /* Add to the quick-lookup bitmapset */
+ attidx = entry->attnum - FirstLowInvalidHeapAttributeNumber;
+ result->subattr_attrs = bms_add_member(result->subattr_attrs, attidx);
+
+ i++;
+ }
+
+ MemoryContextSwitchTo(oldcxt);
+ return result;
+}
+
+
+/* ----------------------------------------------------------------
+ * Public accessor functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * RelationGetIdxSubattrs
+ *
+ * Return the cached subpath info, building it if necessary.
+ * Returns NULL if the relation has no sub-attribute expression indexes.
+ */
+RelSubattrInfo *
+RelationGetIdxSubattrs(Relation rel)
+{
+ if (!rel->rd_idxsubattrsvalid)
+ RelationBuildIdxSubattrs(rel);
+ return rel->rd_idxsubattrs;
+}
+
+/*
+ * attr_has_subattr_indexes
+ *
+ * Quick check: does this base-table attribute have any expression-index
+ * columns backed by subpath descriptors?
+ */
+bool
+attr_has_subattr_indexes(Relation rel, AttrNumber attnum)
+{
+ RelSubattrInfo *info = RelationGetIdxSubattrs(rel);
+ int attidx;
+
+ if (info == NULL)
+ return false;
+
+ attidx = attnum - FirstLowInvalidHeapAttributeNumber;
+ return bms_is_member(attidx, info->subattr_attrs);
+}
+
+/*
+ * attr_subattr_only
+ *
+ * Returns true if 'attnum' has subpath descriptors AND is NOT referenced
+ * by any simple (whole-column) index. Only in this case can the subpath
+ * optimization avoid an index update.
+ */
+bool
+attr_subattr_only(Relation rel, AttrNumber attnum)
+{
+ RelSubattrInfo *info = RelationGetIdxSubattrs(rel);
+ int attidx;
+
+ if (info == NULL)
+ return false;
+
+ attidx = attnum - FirstLowInvalidHeapAttributeNumber;
+ return (bms_is_member(attidx, info->subattr_attrs) &&
+ !bms_is_member(attidx, info->simple_indexed_attrs));
+}
+
+/*
+ * RelationGetAttrSubattrInfo
+ *
+ * Return the AttrSubattrInfo for a specific attribute, or NULL.
+ */
+AttrSubattrInfo *
+RelationGetAttrSubattrInfo(Relation rel, AttrNumber attnum)
+{
+ RelSubattrInfo *info = RelationGetIdxSubattrs(rel);
+
+ if (info == NULL)
+ return NULL;
+
+ for (int i = 0; i < info->nattrs; i++)
+ {
+ if (info->attrs[i].attnum == attnum)
+ return &info->attrs[i];
+ }
+ return NULL;
+}
+
+
+/* ----------------------------------------------------------------
+ * Invalidation / cleanup
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * FreeIdxSubattrs
+ *
+ * Free a RelSubattrInfo and all its contents. Called from
+ * RelationClearRelation() during relcache invalidation.
+ */
+void
+FreeIdxSubattrs(RelSubattrInfo *info)
+{
+ if (info == NULL)
+ return;
+
+ for (int i = 0; i < info->nattrs; i++)
+ {
+ AttrSubattrInfo *attr = &info->attrs[i];
+
+ for (int j = 0; j < attr->ndescriptors; j++)
+ {
+ /*
+ * Descriptors are varlena allocated in CacheMemoryContext. pfree
+ * them individually.
+ */
+ if (DatumGetPointer(attr->descriptors[j].descriptor) != NULL)
+ pfree(DatumGetPointer(attr->descriptors[j].descriptor));
+ }
+ if (attr->descriptors)
+ pfree(attr->descriptors);
+ }
+
+ if (info->attrs)
+ pfree(info->attrs);
+ if (info->subattr_attrs)
+ bms_free(info->subattr_attrs);
+ if (info->simple_indexed_attrs)
+ bms_free(info->simple_indexed_attrs);
+
+ pfree(info);
+}
diff --git a/src/backend/utils/cache/meson.build b/src/backend/utils/cache/meson.build
index a4435e0c3c634..c0297846846cc 100644
--- a/src/backend/utils/cache/meson.build
+++ b/src/backend/utils/cache/meson.build
@@ -5,6 +5,7 @@ backend_sources += files(
'catcache.c',
'evtcache.c',
'funccache.c',
+ 'idxsubattr.c',
'inval.c',
'lsyscache.c',
'partcache.c',
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index a1c88c6b1b695..5c7fd8bbb0218 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1219,6 +1219,10 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
relation->rd_partcheckvalid = false;
relation->rd_partcheckcxt = NULL;
+ /* indexed-subpath data is not loaded till asked for */
+ relation->rd_idxsubattrs = NULL;
+ relation->rd_idxsubattrsvalid = false;
+
/*
* initialize access method information
*/
@@ -2475,8 +2479,8 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc)
bms_free(relation->rd_keyattr);
bms_free(relation->rd_pkattr);
bms_free(relation->rd_idattr);
- bms_free(relation->rd_hotblockingattr);
bms_free(relation->rd_summarizedattr);
+ bms_free(relation->rd_indexedattr);
if (relation->rd_pubdesc)
pfree(relation->rd_pubdesc);
if (relation->rd_options)
@@ -2501,6 +2505,8 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc)
MemoryContextDelete(relation->rd_pddcxt);
if (relation->rd_partcheckcxt)
MemoryContextDelete(relation->rd_partcheckcxt);
+ if (relation->rd_idxsubattrs != NULL)
+ FreeIdxSubattrs(relation->rd_idxsubattrs);
pfree(relation);
}
@@ -2521,6 +2527,14 @@ RelationInvalidateRelation(Relation relation)
*/
RelationCloseSmgr(relation);
+ /* Free indexed sub-path descriptors, if any */
+ if (relation->rd_idxsubattrs != NULL)
+ {
+ FreeIdxSubattrs(relation->rd_idxsubattrs);
+ relation->rd_idxsubattrs = NULL;
+ }
+ relation->rd_idxsubattrsvalid = false;
+
/* Free AM cached data, if any */
if (relation->rd_amcache)
pfree(relation->rd_amcache);
@@ -5276,8 +5290,8 @@ RelationGetIndexPredicate(Relation relation)
* (beware: even if PK is deferrable!)
* INDEX_ATTR_BITMAP_IDENTITY_KEY Columns in the table's replica identity
* index (empty if FULL)
- * INDEX_ATTR_BITMAP_HOT_BLOCKING Columns that block updates from being HOT
- * INDEX_ATTR_BITMAP_SUMMARIZED Columns included in summarizing indexes
+ * INDEX_ATTR_BITMAP_SUMMARIZED Columns only included in summarizing indexes
+ * INDEX_ATTR_BITMAP_INDEXED Columns referenced by indexes
*
* Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that
* we can include system attributes (e.g., OID) in the bitmap representation.
@@ -5300,8 +5314,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
Bitmapset *uindexattrs; /* columns in unique indexes */
Bitmapset *pkindexattrs; /* columns in the primary index */
Bitmapset *idindexattrs; /* columns in the replica identity */
- Bitmapset *hotblockingattrs; /* columns with HOT blocking indexes */
- Bitmapset *summarizedattrs; /* columns with summarizing indexes */
+ Bitmapset *summarizedattrs; /* columns only in summarizing indexes */
+ Bitmapset *indexedattrs; /* columns referenced by indexes */
List *indexoidlist;
List *newindexoidlist;
Oid relpkindex;
@@ -5320,10 +5334,10 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
return bms_copy(relation->rd_pkattr);
case INDEX_ATTR_BITMAP_IDENTITY_KEY:
return bms_copy(relation->rd_idattr);
- case INDEX_ATTR_BITMAP_HOT_BLOCKING:
- return bms_copy(relation->rd_hotblockingattr);
case INDEX_ATTR_BITMAP_SUMMARIZED:
return bms_copy(relation->rd_summarizedattr);
+ case INDEX_ATTR_BITMAP_INDEXED:
+ return bms_copy(relation->rd_indexedattr);
default:
elog(ERROR, "unknown attrKind %u", attrKind);
}
@@ -5366,8 +5380,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
uindexattrs = NULL;
pkindexattrs = NULL;
idindexattrs = NULL;
- hotblockingattrs = NULL;
summarizedattrs = NULL;
+ indexedattrs = NULL;
foreach(l, indexoidlist)
{
Oid indexOid = lfirst_oid(l);
@@ -5426,7 +5440,7 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
if (indexDesc->rd_indam->amsummarizing)
attrs = &summarizedattrs;
else
- attrs = &hotblockingattrs;
+ attrs = &indexedattrs;
/* Collect simple attribute references */
for (i = 0; i < indexDesc->rd_index->indnatts; i++)
@@ -5435,9 +5449,9 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
/*
* Since we have covering indexes with non-key columns, we must
- * handle them accurately here. non-key columns must be added into
- * hotblockingattrs or summarizedattrs, since they are in index,
- * and update shouldn't miss them.
+ * handle them accurately here. Non-key columns must be added into
+ * indexedattrs or summarizedattrs, since they are in index, and
+ * update shouldn't miss them.
*
* Summarizing indexes do not block HOT, but do need to be updated
* when the column value changes, thus require a separate
@@ -5498,12 +5512,20 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
bms_free(uindexattrs);
bms_free(pkindexattrs);
bms_free(idindexattrs);
- bms_free(hotblockingattrs);
bms_free(summarizedattrs);
+ bms_free(indexedattrs);
goto restart;
}
+ /*
+ * Record what attributes are only referenced by summarizing indexes. Then
+ * add that into the other indexed attributes to track all referenced
+ * attributes.
+ */
+ summarizedattrs = bms_del_members(summarizedattrs, indexedattrs);
+ indexedattrs = bms_add_members(indexedattrs, summarizedattrs);
+
/* Don't leak the old values of these bitmaps, if any */
relation->rd_attrsvalid = false;
bms_free(relation->rd_keyattr);
@@ -5512,10 +5534,10 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
relation->rd_pkattr = NULL;
bms_free(relation->rd_idattr);
relation->rd_idattr = NULL;
- bms_free(relation->rd_hotblockingattr);
- relation->rd_hotblockingattr = NULL;
bms_free(relation->rd_summarizedattr);
relation->rd_summarizedattr = NULL;
+ bms_free(relation->rd_indexedattr);
+ relation->rd_indexedattr = NULL;
/*
* Now save copies of the bitmaps in the relcache entry. We intentionally
@@ -5528,8 +5550,8 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
relation->rd_keyattr = bms_copy(uindexattrs);
relation->rd_pkattr = bms_copy(pkindexattrs);
relation->rd_idattr = bms_copy(idindexattrs);
- relation->rd_hotblockingattr = bms_copy(hotblockingattrs);
relation->rd_summarizedattr = bms_copy(summarizedattrs);
+ relation->rd_indexedattr = bms_copy(indexedattrs);
relation->rd_attrsvalid = true;
MemoryContextSwitchTo(oldcxt);
@@ -5542,10 +5564,10 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
return pkindexattrs;
case INDEX_ATTR_BITMAP_IDENTITY_KEY:
return idindexattrs;
- case INDEX_ATTR_BITMAP_HOT_BLOCKING:
- return hotblockingattrs;
case INDEX_ATTR_BITMAP_SUMMARIZED:
return summarizedattrs;
+ case INDEX_ATTR_BITMAP_INDEXED:
+ return indexedattrs;
default:
elog(ERROR, "unknown attrKind %u", attrKind);
return NULL;
@@ -6515,6 +6537,8 @@ load_relcache_init_file(bool shared)
rel->rd_droppedSubid = InvalidSubTransactionId;
rel->rd_amcache = NULL;
rel->pgstat_info = NULL;
+ rel->rd_idxsubattrs = NULL;
+ rel->rd_idxsubattrsvalid = false;
/*
* Recompute lock and physical addressing info. This is needed in
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index a5a0edf2534aa..615e4afcc5d06 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -984,6 +984,14 @@
boot_val => 'true',
},
+{ name => 'enable_subpath_hot', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
+ short_desc => 'Enables sub-attribute analysis for HOT update eligibility.',
+ long_desc => 'When enabled, updates to complex types like JSONB are analyzed at the sub-attribute level to determine if indexed subpaths have changed, potentially allowing HOT updates even when the column\'s bytes differ.',
+ flags => 'GUC_EXPLAIN',
+ variable => 'enable_subpath_hot',
+ boot_val => 'true',
+},
+
{ name => 'enable_tidscan', type => 'bool', context => 'PGC_USERSET', group => 'QUERY_TUNING_METHOD',
short_desc => 'Enables the planner\'s use of TID scan plans.',
flags => 'GUC_EXPLAIN',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e686d88afc427..4d6834b9690e9 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -429,6 +429,7 @@
#enable_presorted_aggregate = on
#enable_seqscan = on
#enable_sort = on
+#enable_subpath_hot = on
#enable_tidscan = on
#enable_group_by_reordering = on
#enable_distinct_reordering = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index ecfbd017d66dc..6b88bca36b3e1 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -225,6 +225,12 @@ typedef void (*aminitparallelscan_function) (void *target);
/* (re)start parallel index scan */
typedef void (*amparallelrescan_function) (IndexScanDesc scan);
+/* compare datums to determine if index update is needed */
+typedef bool (*amcomparedatums_function) (Relation indexRelation,
+ int attnum,
+ Datum oldValue, bool oldIsNull,
+ Datum newValue, bool newIsNull);
+
/*
* API struct for an index AM. Note we expect index AMs to allocate these
* structs statically; the core code never copies nor frees them.
@@ -322,6 +328,9 @@ typedef struct IndexAmRoutine
/* interface functions to support planning */
amtranslate_strategy_function amtranslatestrategy; /* can be NULL */
amtranslate_cmptype_function amtranslatecmptype; /* can be NULL */
+
+ /* interface function to compare datums on update */
+ amcomparedatums_function amcomparedatums; /* can be NULL */
} IndexAmRoutine;
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index 7c3b4db94cd6a..14035c1c417ea 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -105,6 +105,9 @@ extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple);
extern Datum gintuple_get_key(GinState *ginstate, IndexTuple tuple,
GinNullCategory *category);
extern char *ginbuildphasename(int64 phasenum);
+extern bool gincomparedatums(Relation index, int attnum,
+ Datum old_datum, bool old_isnull,
+ Datum new_datum, bool new_isnull);
/* gininsert.c */
extern IndexBuildResult *ginbuild(Relation heap, Relation index,
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index ad993c07311c8..5691b097bc618 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -378,10 +378,9 @@ extern TM_Result heap_delete(Relation relation, const ItemPointerData *tid,
extern void heap_finish_speculative(Relation relation, const ItemPointerData *tid);
extern void heap_abort_speculative(Relation relation, const ItemPointerData *tid);
extern TM_Result heap_update(Relation relation, const ItemPointerData *otid,
- HeapTuple newtup,
- CommandId cid, Snapshot crosscheck, bool wait,
- TM_FailureData *tmfd, LockTupleMode *lockmode,
- TU_UpdateIndexes *update_indexes);
+ HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait,
+ TM_FailureData *tmfd, const LockTupleMode lockmode,
+ const Bitmapset *modified_idx_attrs, const bool hot_allowed);
extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
bool follow_updates,
@@ -416,7 +415,7 @@ extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple);
extern void simple_heap_insert(Relation relation, HeapTuple tup);
extern void simple_heap_delete(Relation relation, const ItemPointerData *tid);
extern void simple_heap_update(Relation relation, const ItemPointerData *otid,
- HeapTuple tup, TU_UpdateIndexes *update_indexes);
+ HeapTuple tup, Bitmapset **modified_idx_attrs);
extern TransactionId heap_index_delete_tuples(Relation rel,
TM_IndexDeleteOp *delstate);
@@ -443,6 +442,11 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer,
OffsetNumber *dead, int ndead,
OffsetNumber *unused, int nunused);
+/* in heap/heapam.c */
+extern bool HeapUpdateHotAllowable(Relation relation, const Bitmapset *modified_idx_attrs);
+extern LockTupleMode HeapUpdateDetermineLockmode(Relation relation,
+ const Bitmapset *modified_idx_attrs);
+
/* in heap/vacuumlazy.c */
extern void heap_vacuum_rel(Relation rel,
const VacuumParams params, BufferAccessStrategy bstrategy);
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 06084752245d5..6ba61224c7ea5 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -104,20 +104,20 @@ typedef enum TM_Result
} TM_Result;
/*
- * Result codes for table_update(..., update_indexes*..).
- * Used to determine which indexes to update.
+ * Sentinel bit in modified_idx_attrs bitmapset.
+ *
+ * When set by the table AM in the modified_idx_attrs bitmapset (via the
+ * tuple_update callback), this indicates that the update was non-HOT and
+ * all indexes need to be updated. The executor checks this bit to
+ * determine whether per-index update decisions are needed.
+ *
+ * Bit 0 in the bitmapset corresponds to FirstLowInvalidHeapAttributeNumber
+ * which is never a valid heap attribute, making it safe to use as a sentinel.
+ *
+ * This per-attribute bitmapset (plus this sentinel) replaces the former
+ * TU_UpdateIndexes result enum (TU_None, TU_All, TU_Summarizing).
*/
-typedef enum TU_UpdateIndexes
-{
- /* No indexed columns were updated (incl. TID addressing of tuple) */
- TU_None,
-
- /* A non-summarizing indexed column was updated, or the TID has changed */
- TU_All,
-
- /* Only summarized columns were updated, TID is unchanged */
- TU_Summarizing,
-} TU_UpdateIndexes;
+#define MODIFIED_IDX_ATTRS_ALL_IDX (0) /* bit for attnum FirstLowInvalidHeapAttributeNumber */
/*
* When table_tuple_update, table_tuple_delete, or table_tuple_lock fail
@@ -549,7 +549,7 @@ typedef struct TableAmRoutine
bool wait,
TM_FailureData *tmfd,
LockTupleMode *lockmode,
- TU_UpdateIndexes *update_indexes);
+ Bitmapset **modified_idx_attrs);
/* see table_tuple_lock() for reference about parameters */
TM_Result (*tuple_lock) (Relation rel,
@@ -1498,12 +1498,15 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
* crosscheck - if not InvalidSnapshot, also check old tuple against this
* wait - true if should wait for any conflicting update to commit/abort
*
+ * Input/Output parameters:
+ * modified_idx_attrs - on input, the set of indexed attributes whose values
+ * changed. On output, the table AM may set the MODIFIED_IDX_ATTRS_ALL_IDX
+ * sentinel bit to indicate that all indexes need updating (non-HOT update).
+ *
* Output parameters:
* slot - newly constructed tuple data to store
* tmfd - filled in failure cases (see below)
* lockmode - filled with lock mode acquired on tuple
- * update_indexes - in success cases this is set if new index entries
- * are required for this tuple; see TU_UpdateIndexes
*
* Normal, successful return value is TM_Ok, which means we did actually
* update it. Failure return codes are TM_SelfModified, TM_Updated, and
@@ -1523,12 +1526,12 @@ static inline TM_Result
table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot,
CommandId cid, Snapshot snapshot, Snapshot crosscheck,
bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode,
- TU_UpdateIndexes *update_indexes)
+ Bitmapset **modified_idx_attrs)
{
return rel->rd_tableam->tuple_update(rel, otid, slot,
cid, snapshot, crosscheck,
- wait, tmfd,
- lockmode, update_indexes);
+ wait, tmfd, lockmode,
+ modified_idx_attrs);
}
/*
@@ -2009,7 +2012,7 @@ extern void simple_table_tuple_delete(Relation rel, ItemPointer tid,
Snapshot snapshot);
extern void simple_table_tuple_update(Relation rel, ItemPointer otid,
TupleTableSlot *slot, Snapshot snapshot,
- TU_UpdateIndexes *update_indexes);
+ Bitmapset **modified_idx_attrs);
/* ----------------------------------------------------------------------------
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 90f46b0350237..a51d06fde6948 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -56,6 +56,11 @@
* catalog changes on the same day...)
*/
+/*
+ * 202603101 - Add pg_type.typidxextract/typidxcompare, pg_proc.prosubattrmutator
+ * for HOT updates on expression indexes; changes Table AM API
+ */
+
/* yyyymmddN */
#define CATALOG_VERSION_NO 202603101
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 361e2cfffebe9..34df869c38078 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -4803,6 +4803,16 @@
proname => 'float8', prorettype => 'float8', proargtypes => 'jsonb',
prosrc => 'jsonb_float8' },
+# JSONB subpath support
+{ oid => '6071', descr => 'extract indexed subpath from expression (jsonb)',
+ proname => 'jsonb_idx_extract', prorettype => 'internal',
+ proargtypes => 'internal int2', provolatile => 'i',
+ prosrc => 'jsonb_idx_extract' },
+{ oid => '6072', descr => 'compare jsonb datums at indexed subpaths',
+ proname => 'jsonb_idx_compare', prorettype => 'bool',
+ proargtypes => 'jsonb jsonb internal int4', provolatile => 'i',
+ prosrc => 'jsonb_idx_compare' },
+
# formatting
{ oid => '1770', descr => 'format timestamp with time zone to text',
proname => 'to_char', provolatile => 's', prorettype => 'text',
@@ -9366,6 +9376,16 @@
proname => 'xml_is_well_formed_content', prorettype => 'bool',
proargtypes => 'text', prosrc => 'xml_is_well_formed_content' },
+# XML subpath support
+{ oid => '6082', descr => 'extract indexed subpath from expression (xml)',
+ proname => 'xml_idx_extract', prorettype => 'internal',
+ proargtypes => 'internal int2', provolatile => 'i',
+ prosrc => 'xml_idx_extract' },
+{ oid => '6081', descr => 'compare xml datums at indexed subpaths',
+ proname => 'xml_idx_compare', prorettype => 'bool',
+ proargtypes => 'xml xml internal int4', provolatile => 'i',
+ prosrc => 'xml_idx_compare' },
+
# json
{ oid => '321', descr => 'I/O',
proname => 'json_in', prorettype => 'json', proargtypes => 'cstring',
@@ -10592,6 +10612,7 @@
proargtypes => 'jsonb jsonb', prosrc => 'jsonb_concat' },
{ oid => '3302',
proname => 'jsonb_delete', prorettype => 'jsonb', proargtypes => 'jsonb text',
+ prosubattrmutator => 'true',
prosrc => 'jsonb_delete' },
{ oid => '3303',
proname => 'jsonb_delete', prorettype => 'jsonb', proargtypes => 'jsonb int4',
@@ -10603,18 +10624,21 @@
prosrc => 'jsonb_delete_array' },
{ oid => '3304',
proname => 'jsonb_delete_path', prorettype => 'jsonb',
+ prosubattrmutator => 'true',
proargtypes => 'jsonb _text', prosrc => 'jsonb_delete_path' },
{ oid => '5054', descr => 'Set part of a jsonb, handle NULL value',
proname => 'jsonb_set_lax', proisstrict => 'f', prorettype => 'jsonb',
proargtypes => 'jsonb _text jsonb bool text',
proargnames => '{jsonb_in,path,replacement,create_if_missing,null_value_treatment}',
proargdefaults => '{true,use_json_null}',
+ prosubattrmutator => 'true',
prosrc => 'jsonb_set_lax' },
{ oid => '3305', descr => 'Set part of a jsonb',
proname => 'jsonb_set', prorettype => 'jsonb',
proargtypes => 'jsonb _text jsonb bool',
proargnames => '{jsonb_in,path,replacement,create_if_missing}',
proargdefaults => '{true}',
+ prosubattrmutator => 'true',
prosrc => 'jsonb_set' },
{ oid => '3306', descr => 'Indented text from jsonb',
proname => 'jsonb_pretty', prorettype => 'text', proargtypes => 'jsonb',
@@ -10624,6 +10648,7 @@
proargtypes => 'jsonb _text jsonb bool',
proargnames => '{jsonb_in,path,replacement,insert_after}',
proargdefaults => '{false}',
+ prosubattrmutator => 'true',
prosrc => 'jsonb_insert' },
# jsonpath
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 2f9e0b695e26b..3d9126cdafae5 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -66,6 +66,19 @@ CATALOG(pg_proc,1255,ProcedureRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(81,Proce
/* is it a leakproof function? */
bool proleakproof BKI_DEFAULT(f);
+ /*
+ * prosubattrmutator: true if this function is a sub-attribute mutator
+ * that performs mix tracking via slot_add_modified_idx_attr() when a
+ * SubattrTrackingContext is provided through fcinfo->context.
+ *
+ * When true, the function's first argument is assumed to be the source
+ * datum (the value being mutated). The executor uses this to determine
+ * whether a SET expression is "fully instrumented" — i.e., all
+ * transformation steps are mutators tracing back to a Var of the same
+ * column.
+ */
+ bool prosubattrmutator BKI_DEFAULT(f);
+
/* strict with respect to NULLs? */
bool proisstrict BKI_DEFAULT(t);
diff --git a/src/include/catalog/pg_type.dat b/src/include/catalog/pg_type.dat
index a1a753d17978c..c111e24ac4ec7 100644
--- a/src/include/catalog/pg_type.dat
+++ b/src/include/catalog/pg_type.dat
@@ -141,6 +141,7 @@
typsend => 'json_send', typalign => 'i', typstorage => 'x' },
{ oid => '142', array_type_oid => '143', descr => 'XML content',
typname => 'xml', typlen => '-1', typbyval => 'f', typcategory => 'U',
+ typidxextract => 'xml_idx_extract', typidxcompare => 'xml_idx_compare',
typinput => 'xml_in', typoutput => 'xml_out', typreceive => 'xml_recv',
typsend => 'xml_send', typalign => 'i', typstorage => 'x' },
{ oid => '194', descr => 'string representing an internal node tree',
@@ -450,6 +451,7 @@
{ oid => '3802', array_type_oid => '3807', descr => 'Binary JSON',
typname => 'jsonb', typlen => '-1', typbyval => 'f', typcategory => 'U',
typsubscript => 'jsonb_subscript_handler', typinput => 'jsonb_in',
+ typidxextract => 'jsonb_idx_extract', typidxcompare => 'jsonb_idx_compare',
typoutput => 'jsonb_out', typreceive => 'jsonb_recv', typsend => 'jsonb_send',
typalign => 'i', typstorage => 'x' },
{ oid => '4072', array_type_oid => '4073', descr => 'JSON path',
diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h
index 74183ec5a2e43..35c6aad327880 100644
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
@@ -110,6 +110,29 @@ CATALOG(pg_type,1247,TypeRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(71,TypeRelati
*/
regproc typsubscript BKI_DEFAULT(-) BKI_ARRAY_DEFAULT(array_subscript_handler) BKI_LOOKUP_OPT(pg_proc);
+ /*
+ * typidxextract: function to extract an indexed-subpath descriptor from
+ * an expression tree. Called at relcache build time. Zero if the type
+ * does not support sub-attribute index tracking.
+ *
+	 * Signature: (internal, int2) -> internal.  arg0: Node * (expression
+	 * tree from indexprs); arg1: AttrNumber (base-table column to analyze);
+	 * returns: palloc'd varlena descriptor, or NULL.
+ */
+ Oid typidxextract BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_proc);
+
+ /*
+ * typidxcompare: function to compare old and new datums for changes at
+ * indexed subpaths. Called at UPDATE time as fallback when no
+ * instrumented mutation function handled the tracking. Zero if not
+ * supported (implies whole-column comparison).
+ *
+	 * Signature: (type, type, internal, int4) -> bool.  arg0: old datum;
+	 * arg1: new datum; arg2: Datum * (array of subpath descriptors);
+	 * arg3: int (count of descriptors); returns: true if any subpath changed.
+ */
+ Oid typidxcompare BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_proc);
+
/*
* If typelem is not 0 then it identifies another row in pg_type, defining
* the type yielded by subscripting. This should be 0 if typsubscript is
diff --git a/src/include/executor/execExpr.h b/src/include/executor/execExpr.h
index aa9b361fa318d..10ce004756fe1 100644
--- a/src/include/executor/execExpr.h
+++ b/src/include/executor/execExpr.h
@@ -391,6 +391,16 @@ typedef struct ExprEvalStep
PGFunction fn_addr; /* actual call address */
int nargs; /* number of arguments */
bool make_ro; /* make arg0 R/O (used only for NULLIF) */
+
+ /*
+ * Sub-attribute mutation tracking: set during ExecInitExprRec for
+ * functions marked prosubattrmutator=true. fn_tracks_subpaths
+ * causes the interpreter to inject SubattrTrackingContext into
+ * fcinfo->context. fn_target_attnum is the target column number
+ * (from TargetEntry.resno).
+ */
+ bool fn_tracks_subpaths;
+ AttrNumber fn_target_attnum;
} func;
/* for EEOP_BOOL_*_STEP */
diff --git a/src/include/executor/execMutation.h b/src/include/executor/execMutation.h
new file mode 100644
index 0000000000000..c950bbed31c02
--- /dev/null
+++ b/src/include/executor/execMutation.h
@@ -0,0 +1,85 @@
+/*-------------------------------------------------------------------------
+ *
+ * execMutation.h
+ * Declarations for sub-attribute mutation tracking during UPDATE.
+ *
+ * src/include/executor/execMutation.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef EXEC_MUTATION_H
+#define EXEC_MUTATION_H
+
+#include "nodes/nodes.h"
+#include "nodes/bitmapset.h"
+#include "access/htup.h"
+#include "executor/tuptable.h"
+#include "utils/rel.h"
+
+/*
+ * SubattrTrackingContext — passed through fcinfo->context to mutation functions.
+ *
+ * Allocated once per SET-target column at ExecInitModifyTable time.
+ * Mutation functions use IsA(fcinfo->context, SubattrTrackingContext) to detect it.
+ * Non-UPDATE code paths and uninstrumented functions see context == NULL.
+ */
+typedef struct SubattrTrackingContext
+{
+ pg_node_attr(no_copy_equal, no_read, no_query_jumble)
+
+	NodeTag		type;			/* T_SubattrTrackingContext */
+
+ Relation rel pg_node_attr(read_write_ignore);
+ AttrNumber target_attnum;
+ TupleTableSlot *modified_idx_slot pg_node_attr(read_write_ignore);
+
+ /*
+ * Mapping from subplan result tuple position (resno) to table column
+ * number (attnum). Array indexed by (resno - 1). Value is the actual
+ * table column number. Used during expression compilation to set correct
+ * fn_target_attnum.
+ */
+ AttrNumber *resno_to_attnum pg_node_attr(read_write_ignore);
+ int max_resno; /* Size of resno_to_attnum array */
+
+ /*
+ * List of table column numbers being modified (updateColnos from
+ * ModifyTable). Used in ExecBuildProjectionInfo to populate
+ * resno_to_attnum mapping.
+ */
+ List *updateColnos pg_node_attr(read_write_ignore);
+} SubattrTrackingContext;
+
+/*
+ * slot_add_modified_idx_attr
+ *
+ * Record that a mutation to the given base-table attribute affected an
+ * indexed subpath. Called by sub-attribute-aware mutation functions
+ * (jsonb_set, etc.) during UPDATE SET expression evaluation.
+ *
+ * The Bitmapset is additive: successive calls from different mutation
+ * functions (or nested calls on the same column) union their results.
+ */
+extern void slot_add_modified_idx_attr(TupleTableSlot *slot, AttrNumber attnum);
+
+/*
+ * HeapCheckSubattrChanges
+ *
+ * Fallback subpath comparison for non-executor code paths (e.g.,
+ * simple_heap_update used by catalog operations) and for executor
+ * updates with uninstrumented mutation functions. For each attribute
+ * in check_attrs that has subpath descriptors, compares old and new
+ * values using the type's typidxcompare function. Returns the subset
+ * of check_attrs where no indexed subpath actually changed (safe to
+ * remove from the HOT-blocking set).
+ *
+ * See the detailed "Dual-path architecture" comment in execMutation.c
+ * for the relationship between this fallback path and the instrumented
+ * path (SubattrTrackingContext / slot_add_modified_idx_attr).
+ */
+extern Bitmapset *HeapCheckSubattrChanges(Relation relation,
+ HeapTuple oldtup,
+ HeapTuple newtup,
+ Bitmapset *check_attrs);
+
+#endif /* EXEC_MUTATION_H */
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index d46ba59895d62..efb92a6da13e2 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -17,6 +17,7 @@
#include "datatype/timestamp.h"
#include "executor/execdesc.h"
#include "fmgr.h"
+#include "nodes/execnodes.h"
#include "nodes/lockoptions.h"
#include "nodes/parsenodes.h"
#include "utils/memutils.h"
@@ -606,6 +607,10 @@ extern TupleDesc ExecCleanTypeFromTL(List *targetList);
extern TupleDesc ExecTypeFromExprList(List *exprList);
extern void ExecTypeSetColNames(TupleDesc typeInfo, List *namesList);
extern void UpdateChangedParamSet(PlanState *node, Bitmapset *newchg);
+extern Bitmapset *ExecCompareSlotAttrs(TupleDesc tupdesc,
+ const Bitmapset *attrs,
+ TupleTableSlot *old_tts,
+ TupleTableSlot *new_tts);
typedef struct TupOutputState
{
@@ -743,11 +748,13 @@ extern void ExecCloseIndices(ResultRelInfo *resultRelInfo);
/* flags for ExecInsertIndexTuples */
#define EIIT_IS_UPDATE (1<<0)
#define EIIT_NO_DUPE_ERROR (1<<1)
-#define EIIT_ONLY_SUMMARIZING (1<<2)
+#define EIIT_ALL_INDEXES (1<<2)
extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, EState *estate,
bits32 options, TupleTableSlot *slot,
List *arbiterIndexes,
bool *specConflict);
+extern void ExecSetIndexUnchanged(ResultRelInfo *resultRelInfo,
+ const Bitmapset *modified_idx_attrs);
extern bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo,
TupleTableSlot *slot,
EState *estate, ItemPointer conflictTid,
@@ -803,5 +810,9 @@ extern ResultRelInfo *ExecLookupResultRelByOid(ModifyTableState *node,
Oid resultoid,
bool missing_ok,
bool update_cache);
+extern Bitmapset *ExecUpdateModifiedIdxAttrs(ResultRelInfo *relinfo,
+ EState *estate,
+ TupleTableSlot *old_tts,
+ TupleTableSlot *new_tts);
#endif /* EXECUTOR_H */
diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h
index a2dfd707e78a4..db5e423617d53 100644
--- a/src/include/executor/tuptable.h
+++ b/src/include/executor/tuptable.h
@@ -127,6 +127,19 @@ typedef struct TupleTableSlot
MemoryContext tts_mcxt; /* slot itself is in this context */
ItemPointerData tts_tid; /* stored tuple's tid */
Oid tts_tableOid; /* table oid of tuple */
+
+ /*
+ * Modified-indexed (mix) attributes. Populated by sub-attribute-aware
+ * mutation functions (jsonb_set, etc.) during UPDATE SET expression
+ * evaluation. NULL when unused or when no indexed subpath was affected.
+ *
+ * Uses FirstLowInvalidHeapAttributeNumber offset convention, consistent
+ * with RelationGetIndexAttrBitmap() and ExecGetAllUpdatedCols().
+ *
+ * Allocated in tts_mcxt so it survives per-tuple expression context
+ * resets. Freed explicitly per-row by the executor.
+ */
+ struct Bitmapset *tts_modified_idx_attrs;
} TupleTableSlot;
/* routines for a TupleTableSlot implementation */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 63c067d5aae61..4dceffe43bafd 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -147,6 +147,20 @@ typedef struct ExprState
* ExecInitExprRec().
*/
ErrorSaveContext *escontext;
+
+ /*
+ * SubattrTrackingContext for sub-attribute mutation tracking. Set by
+ * ExecInitModifyTable for the UPDATE projection's ExprState. NULL for all
+ * other expression evaluations.
+ */
+ struct SubattrTrackingContext *es_subattr_context;
+
+ /*
+ * Compile-time tracking of the current TargetEntry's resno during
+ * expression compilation, used to populate fn_target_attnum for functions
+ * with prosubattrmutator=true.
+ */
+ AttrNumber es_current_target_attnum;
} ExprState;
@@ -204,10 +218,6 @@ typedef struct IndexInfo
bool ii_NullsNotDistinct;
/* is it valid for inserts? */
bool ii_ReadyForInserts;
- /* IndexUnchanged status determined yet? */
- bool ii_CheckedUnchanged;
- /* aminsert hint, cached for retail inserts */
- bool ii_IndexUnchanged;
/* are we doing a concurrent index build? */
bool ii_Concurrent;
/* did we detect any broken HOT chains? */
@@ -216,6 +226,8 @@ typedef struct IndexInfo
bool ii_Summarizing;
/* is it a WITHOUT OVERLAPS index? */
bool ii_WithoutOverlaps;
+ /* per-index: true if index values are unchanged by this UPDATE */
+ bool ii_IndexUnchanged;
/* # of workers requested (excludes leader) */
int ii_ParallelWorkers;
@@ -629,6 +641,32 @@ typedef struct ResultRelInfo
* one of its ancestors; see ExecCrossPartitionUpdateForeignKey().
*/
List *ri_ancestorResultRels;
+
+ /*
+ * Sub-attribute mutation tracking for UPDATE HOT optimization. Both
+ * fields are NULL/invalid when the relation has no sub-attribute
+ * expression indexes, or for non-UPDATE operations.
+ */
+
+ /*
+ * Bitmapset of attnums whose SET expression is "fully instrumented":
+ * every function in the expression chain is prosubattrmutator=true, with
+ * the source argument tracing back to a Var of the same column.
+ *
+	 * For these columns, we trust tts_modified_idx_attrs completely:
+	 * attnum IN modified_idx_attrs → an indexed subpath changed;
+	 * attnum NOT IN modified_idx_attrs → no indexed subpath changed.
+ *
+ * Uses FirstLowInvalidHeapAttributeNumber offset convention.
+ */
+ Bitmapset *ri_InstrumentedIdxAttrs;
+
+ /*
+ * The slot whose tts_modified_idx_attrs is used as the accumulator. Set
+ * once at init time; stable across rows. Points to the subplan's result
+ * slot.
+ */
+ TupleTableSlot *ri_MixSlot;
} ResultRelInfo;
/* ----------------
@@ -773,6 +811,14 @@ typedef struct EState
*/
List *es_insert_pending_result_relations;
List *es_insert_pending_modifytables;
+
+ /*
+ * Pending SubattrTrackingContext for UPDATE operations. Set temporarily
+ * during ExecInitNode(subplan) so that ExecBuildUpdateProjection can
+ * inject the context into the compiled expression. NULL at all other
+ * times.
+ */
+ struct SubattrTrackingContext *es_pending_subpath_context;
} EState;
diff --git a/src/include/nodes/meson.build b/src/include/nodes/meson.build
index 96800215df1be..f600a273ca83e 100644
--- a/src/include/nodes/meson.build
+++ b/src/include/nodes/meson.build
@@ -24,6 +24,7 @@ node_support_input_i = [
'nodes/supportnodes.h',
'nodes/value.h',
'utils/rel.h',
+ 'executor/execMutation.h',
]
node_support_input = []
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index f2fd5d315078d..146b442b10a5b 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -70,6 +70,7 @@ extern PGDLLIMPORT bool enable_parallel_hash;
extern PGDLLIMPORT bool enable_partition_pruning;
extern PGDLLIMPORT bool enable_presorted_aggregate;
extern PGDLLIMPORT bool enable_async_append;
+extern PGDLLIMPORT bool enable_subpath_hot;
extern PGDLLIMPORT int constraint_exclusion;
extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
diff --git a/src/include/utils/idxsubattr.h b/src/include/utils/idxsubattr.h
new file mode 100644
index 0000000000000..dd1cbe118071b
--- /dev/null
+++ b/src/include/utils/idxsubattr.h
@@ -0,0 +1,109 @@
+/*-------------------------------------------------------------------------
+ *
+ * idxsubattr.h
+ * Data structures for indexed-subpath tracking on sub-attribute-aware
+ * types (JSONB, XML, etc.). Used by the relcache, executor, and
+ * type-specific extract/compare functions.
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ *
+ * src/include/utils/idxsubattr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IDXSUBATTR_H
+#define IDXSUBATTR_H
+
+#include "fmgr.h"
+#include "nodes/bitmapset.h"
+#include "access/attnum.h"
+
+/*
+ * IdxSubattrDesc — one subpath descriptor extracted from one expression
+ * index column.
+ *
+ * 'descriptor' is a type-specific opaque varlena Datum. For JSONB it is
+ * a text[] of path elements (e.g., {"a","b"} for data->'a'->'b'). For
+ * XML it is a text containing an XPath string.
+ *
+ * Stored in CacheMemoryContext as part of RelSubattrInfo.
+ */
+typedef struct IdxSubattrDesc
+{
+ Datum descriptor; /* type-specific varlena, in
+ * CacheMemoryContext */
+ Oid indexoid; /* source index OID (diagnostic only) */
+ int indexcol; /* source index column, 0-based */
+} IdxSubattrDesc;
+
+/*
+ * AttrSubattrInfo — all indexed subpath descriptors for one base-table
+ * attribute, plus the cached typidxcompare FmgrInfo for runtime use.
+ */
+typedef struct AttrSubattrInfo
+{
+ AttrNumber attnum; /* base table attribute number */
+ Oid typoid; /* pg_type OID of the attribute */
+ int ndescriptors; /* length of descriptors[] */
+ IdxSubattrDesc *descriptors; /* array, in CacheMemoryContext */
+ FmgrInfo comparefn; /* cached pg_type.typidxcompare */
+ bool has_comparefn; /* false if typidxcompare is InvalidOid */
+} AttrSubattrInfo;
+
+/*
+ * RelSubattrInfo — per-relation cache of all indexed-subpath info.
+ * Stored in RelationData.rd_idxsubattrs. NULL when the relation has
+ * no expression indexes on sub-attribute-aware types.
+ *
+ * subattr_attrs uses the FirstLowInvalidHeapAttributeNumber offset
+ * convention, consistent with RelationGetIndexAttrBitmap().
+ */
+typedef struct RelSubattrInfo
+{
+ int nattrs; /* length of attrs[] */
+ AttrSubattrInfo *attrs; /* array, NOT indexed by attnum */
+ Bitmapset *subattr_attrs; /* quick membership test for attnums */
+
+ /*
+ * Attnums referenced by at least one simple (non-expression) index
+ * column. Used to exclude attributes from the subpath optimization: if
+ * an attribute has both expression and simple index references, any byte
+ * change triggers an index update for the simple index, so the subpath
+ * check cannot avoid the update.
+ *
+ * Same offset convention as subattr_attrs.
+ */
+ Bitmapset *simple_indexed_attrs;
+} RelSubattrInfo;
+
+
+/*
+ * Ensure rd_idxsubattrs is populated (lazy build). Returns the
+ * cached pointer, which may be NULL if no subpath indexes exist.
+ */
+extern RelSubattrInfo *RelationGetIdxSubattrs(Relation rel);
+
+/*
+ * Does this attribute have any expression-index subpath descriptors?
+ */
+extern bool attr_has_subattr_indexes(Relation rel, AttrNumber attnum);
+
+/*
+ * Does this attribute have subpath descriptors AND is it NOT referenced
+ * by any simple (whole-column) index?
+ */
+extern bool attr_subattr_only(Relation rel, AttrNumber attnum);
+
+/*
+ * Look up the AttrSubattrInfo for a specific attribute.
+ * Returns NULL if the attribute has no subpath indexes.
+ */
+extern AttrSubattrInfo *RelationGetAttrSubattrInfo(Relation rel,
+ AttrNumber attnum);
+
+/*
+ * Free rd_idxsubattrs (called during relcache invalidation).
+ */
+extern void FreeIdxSubattrs(RelSubattrInfo *info);
+
+#endif							/* IDXSUBATTR_H */
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index ca13efba0fb14..da4b422daa459 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -464,4 +464,8 @@ extern Datum jsonb_build_object_worker(int nargs, const Datum *args, const bool
extern Datum jsonb_build_array_worker(int nargs, const Datum *args, const bool *nulls,
const Oid *types, bool absent_on_null);
+/* Sub-attribute index support */
+extern Datum jsonb_idx_extract(PG_FUNCTION_ARGS);
+extern Datum jsonb_idx_compare(PG_FUNCTION_ARGS);
+
#endif /* __JSONB_H__ */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 236830f6b93f1..05ec287027d1d 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -28,6 +28,7 @@
#include "storage/smgr.h"
#include "utils/relcache.h"
#include "utils/reltrigger.h"
+#include "utils/idxsubattr.h"
/*
@@ -65,6 +66,21 @@ typedef struct RelationData
* rd_replidindex) */
bool rd_statvalid; /* is rd_statlist valid? */
+ /*
+ * rd_idxsubattrs: cached per-attribute indexed-subpath descriptors,
+ * derived from pg_index.indexprs + pg_type.typidxextract. NULL when not
+ * yet computed or when no subpath indexes exist. Invalidated alongside
+ * other index metadata, computed in relcache.
+ */
+ RelSubattrInfo *rd_idxsubattrs;
+
+ /*
+ * rd_idxsubattrsvalid: false means rd_idxsubattrs has not been computed
+ * yet. When true, rd_idxsubattrs == NULL means "computed and empty" (no
+ * sub-attribute expression indexes exist).
+ */
+ bool rd_idxsubattrsvalid;
+
/*----------
* rd_createSubid is the ID of the highest subtransaction the rel has
* survived into or zero if the rel or its storage was created before the
@@ -162,8 +178,8 @@ typedef struct RelationData
Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */
Bitmapset *rd_pkattr; /* cols included in primary key */
Bitmapset *rd_idattr; /* included in replica identity index */
- Bitmapset *rd_hotblockingattr; /* cols blocking HOT update */
Bitmapset *rd_summarizedattr; /* cols indexed by summarizing indexes */
+ Bitmapset *rd_indexedattr; /* all cols referenced by indexes */
PublicationDesc *rd_pubdesc; /* publication descriptor, or NULL */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 2700224939a72..57b46ee54e5ab 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -69,8 +69,8 @@ typedef enum IndexAttrBitmapKind
INDEX_ATTR_BITMAP_KEY,
INDEX_ATTR_BITMAP_PRIMARY_KEY,
INDEX_ATTR_BITMAP_IDENTITY_KEY,
- INDEX_ATTR_BITMAP_HOT_BLOCKING,
INDEX_ATTR_BITMAP_SUMMARIZED,
+ INDEX_ATTR_BITMAP_INDEXED,
} IndexAttrBitmapKind;
extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation,
diff --git a/src/test/isolation/expected/hot_updates_chain.out b/src/test/isolation/expected/hot_updates_chain.out
new file mode 100644
index 0000000000000..503252009ea12
--- /dev/null
+++ b/src/test/isolation/expected/hot_updates_chain.out
@@ -0,0 +1,144 @@
+Parsed test spec with 5 sessions
+
+starting permutation: s1_begin s1_hot_update1 s1_hot_update2 s1_hot_update3 s1_commit s1_select s1_verify_hot
+step s1_begin: BEGIN;
+step s1_hot_update1: UPDATE hot_test SET non_indexed_col = 'update1' WHERE id = 1;
+step s1_hot_update2: UPDATE hot_test SET non_indexed_col = 'update2' WHERE id = 1;
+step s1_hot_update3: UPDATE hot_test SET non_indexed_col = 'update3' WHERE id = 1;
+step s1_commit: COMMIT;
+step s1_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 100|update3
+(1 row)
+
+step s1_verify_hot:
+ -- Check for HOT chain: LP_REDIRECT or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+
+starting permutation: s2_begin s2_select_before s1_begin s1_hot_update1 s1_hot_update2 s1_commit s2_select_after s2_commit
+step s2_begin: BEGIN ISOLATION LEVEL REPEATABLE READ;
+step s2_select_before: SELECT non_indexed_col FROM hot_test WHERE id = 1;
+non_indexed_col
+---------------
+initial
+(1 row)
+
+step s1_begin: BEGIN;
+step s1_hot_update1: UPDATE hot_test SET non_indexed_col = 'update1' WHERE id = 1;
+step s1_hot_update2: UPDATE hot_test SET non_indexed_col = 'update2' WHERE id = 1;
+step s1_commit: COMMIT;
+step s2_select_after: SELECT non_indexed_col FROM hot_test WHERE id = 1;
+non_indexed_col
+---------------
+initial
+(1 row)
+
+step s2_commit: COMMIT;
+
+starting permutation: s1_begin s1_hot_update1 s1_hot_update2 s1_commit s3_begin s3_non_hot_update s3_commit s1_select
+step s1_begin: BEGIN;
+step s1_hot_update1: UPDATE hot_test SET non_indexed_col = 'update1' WHERE id = 1;
+step s1_hot_update2: UPDATE hot_test SET non_indexed_col = 'update2' WHERE id = 1;
+step s1_commit: COMMIT;
+step s3_begin: BEGIN;
+step s3_non_hot_update: UPDATE hot_test SET indexed_col = 150 WHERE id = 1;
+step s3_commit: COMMIT;
+step s1_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 150|update2
+(1 row)
+
+
+starting permutation: s1_begin s1_hot_update1 s1_commit s3_begin s3_non_hot_update s3_commit s4_begin s4_hot_after_non_hot s4_commit s4_select s4_verify_hot
+step s1_begin: BEGIN;
+step s1_hot_update1: UPDATE hot_test SET non_indexed_col = 'update1' WHERE id = 1;
+step s1_commit: COMMIT;
+step s3_begin: BEGIN;
+step s3_non_hot_update: UPDATE hot_test SET indexed_col = 150 WHERE id = 1;
+step s3_commit: COMMIT;
+step s4_begin: BEGIN;
+step s4_hot_after_non_hot: UPDATE hot_test SET non_indexed_col = 'after_non_hot' WHERE id = 1;
+step s4_commit: COMMIT;
+step s4_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 150|after_non_hot
+(1 row)
+
+step s4_verify_hot:
+ -- Check for new HOT chain after non-HOT update broke the previous chain
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0
+ AND t_ctid != ('(0,' || lp || ')')::tid);
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+
+starting permutation: s1_begin s1_hot_update1 s1_hot_update2 s5_begin s5_hot_update_row2_1 s5_hot_update_row2_2 s1_commit s5_commit s1_select s5_select s1_verify_hot s5_verify_hot
+step s1_begin: BEGIN;
+step s1_hot_update1: UPDATE hot_test SET non_indexed_col = 'update1' WHERE id = 1;
+step s1_hot_update2: UPDATE hot_test SET non_indexed_col = 'update2' WHERE id = 1;
+step s5_begin: BEGIN;
+step s5_hot_update_row2_1: UPDATE hot_test SET non_indexed_col = 'row2_update1' WHERE id = 2;
+step s5_hot_update_row2_2: UPDATE hot_test SET non_indexed_col = 'row2_update2' WHERE id = 2;
+step s1_commit: COMMIT;
+step s5_commit: COMMIT;
+step s1_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 100|update2
+(1 row)
+
+step s5_select: SELECT * FROM hot_test WHERE id = 2;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 2| 200|row2_update2
+(1 row)
+
+step s1_verify_hot:
+ -- Check for HOT chain: LP_REDIRECT or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+step s5_verify_hot:
+ -- Check for HOT chain on page 0
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0
+ AND t_ctid != ('(0,' || lp || ')')::tid);
+
+has_hot_chain
+-------------
+t
+(1 row)
+
diff --git a/src/test/isolation/expected/hot_updates_concurrent.out b/src/test/isolation/expected/hot_updates_concurrent.out
new file mode 100644
index 0000000000000..b1a8b0cb7b261
--- /dev/null
+++ b/src/test/isolation/expected/hot_updates_concurrent.out
@@ -0,0 +1,143 @@
+Parsed test spec with 4 sessions
+
+starting permutation: s1_begin s1_hot_update s2_begin s2_hot_update s1_commit s2_commit s1_select s2_select s2_verify_hot
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'updated_s1' WHERE id = 1;
+step s2_begin: BEGIN;
+step s2_hot_update: UPDATE hot_test SET non_indexed_col = 'updated_s2' WHERE id = 1;
+step s1_commit: COMMIT;
+step s2_hot_update: <... completed>
+step s2_commit: COMMIT;
+step s1_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 100|updated_s2
+(1 row)
+
+step s2_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 100|updated_s2
+(1 row)
+
+step s2_verify_hot:
+ -- Check for HOT chain: look for LP_REDIRECT (lp_flags=2) or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+
+starting permutation: s1_begin s1_hot_update s3_begin s3_non_hot_update s1_commit s3_commit s3_select s3_verify_index
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'updated_s1' WHERE id = 1;
+step s3_begin: BEGIN;
+step s3_non_hot_update: UPDATE hot_test SET indexed_col = 150 WHERE id = 1;
+step s1_commit: COMMIT;
+step s3_non_hot_update: <... completed>
+step s3_commit: COMMIT;
+step s3_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 150|updated_s1
+(1 row)
+
+step s3_verify_index:
+ -- Verify index was updated (proves non-HOT)
+ SELECT COUNT(*) = 1 AS index_updated FROM hot_test WHERE indexed_col = 150;
+ SELECT COUNT(*) = 0 AS old_value_gone FROM hot_test WHERE indexed_col = 100;
+
+index_updated
+-------------
+t
+(1 row)
+
+old_value_gone
+--------------
+t
+(1 row)
+
+
+starting permutation: s3_begin s3_non_hot_update s1_begin s1_hot_update s3_commit s1_commit s1_select s1_verify_hot
+step s3_begin: BEGIN;
+step s3_non_hot_update: UPDATE hot_test SET indexed_col = 150 WHERE id = 1;
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'updated_s1' WHERE id = 1;
+step s3_commit: COMMIT;
+step s1_hot_update: <... completed>
+step s1_commit: COMMIT;
+step s1_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 150|updated_s1
+(1 row)
+
+step s1_verify_hot:
+ -- Check for HOT chain: look for LP_REDIRECT (lp_flags=2) or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+
+starting permutation: s1_begin s1_hot_update s4_begin s4_hot_update_row2 s1_commit s4_commit s1_select s4_select s1_verify_hot s4_verify_hot
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'updated_s1' WHERE id = 1;
+step s4_begin: BEGIN;
+step s4_hot_update_row2: UPDATE hot_test SET non_indexed_col = 'updated_s4' WHERE id = 2;
+step s1_commit: COMMIT;
+step s4_commit: COMMIT;
+step s1_select: SELECT * FROM hot_test WHERE id = 1;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 1| 100|updated_s1
+(1 row)
+
+step s4_select: SELECT * FROM hot_test WHERE id = 2;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+ 2| 200|updated_s4
+(1 row)
+
+step s1_verify_hot:
+ -- Check for HOT chain: look for LP_REDIRECT (lp_flags=2) or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+step s4_verify_hot:
+ -- Check for HOT chain on page 0
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0
+ AND t_ctid != ('(0,' || lp || ')')::tid);
+
+has_hot_chain
+-------------
+t
+(1 row)
+
diff --git a/src/test/isolation/expected/hot_updates_ddl_concurrent.out b/src/test/isolation/expected/hot_updates_ddl_concurrent.out
new file mode 100644
index 0000000000000..8a26750c69694
--- /dev/null
+++ b/src/test/isolation/expected/hot_updates_ddl_concurrent.out
@@ -0,0 +1,26 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1_update_count_before s1_update_name_before s2_create_index s1_update_count_after s1_update_name_after s1_verify
+step s1_update_count_before:
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['count'], '1') WHERE id = 1;
+
+step s1_update_name_before:
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['name'], '"updated"') WHERE id = 1;
+
+step s2_create_index:
+ CREATE INDEX hot_ddl_count_idx ON hot_ddl_test((data->'count'));
+
+step s1_update_count_after:
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['count'], '2') WHERE id = 1;
+
+step s1_update_name_after:
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['name'], '"still_hot"') WHERE id = 1;
+
+step s1_verify:
+ SELECT * FROM hot_ddl_test WHERE id = 1;
+
+id|data
+--+-----------------------------------------------------
+ 1|{"name": "still_hot", "count": 2, "status": "active"}
+(1 row)
+
diff --git a/src/test/isolation/expected/hot_updates_index_scan.out b/src/test/isolation/expected/hot_updates_index_scan.out
new file mode 100644
index 0000000000000..7d8e9ff885774
--- /dev/null
+++ b/src/test/isolation/expected/hot_updates_index_scan.out
@@ -0,0 +1,132 @@
+Parsed test spec with 4 sessions
+
+starting permutation: s1_begin s1_hot_update s2_begin s2_index_scan s1_commit s2_commit
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'hot_updated' WHERE id = 50;
+step s2_begin: BEGIN;
+step s2_index_scan: SELECT * FROM hot_test WHERE indexed_col = 500;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+50| 500|initial50
+(1 row)
+
+step s1_commit: COMMIT;
+step s2_commit: COMMIT;
+
+starting permutation: s1_begin s1_non_hot_update s1_commit s2_begin s2_index_scan_new s2_commit s2_verify_index
+step s1_begin: BEGIN;
+step s1_non_hot_update: UPDATE hot_test SET indexed_col = 555 WHERE id = 50;
+step s1_commit: COMMIT;
+step s2_begin: BEGIN;
+step s2_index_scan_new: SELECT * FROM hot_test WHERE indexed_col = 555;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+50| 555|initial50
+(1 row)
+
+step s2_commit: COMMIT;
+step s2_verify_index:
+ -- After non-HOT update, verify index reflects the change
+ SELECT COUNT(*) = 1 AS found_new_value FROM hot_test WHERE indexed_col = 555;
+ SELECT COUNT(*) = 0 AS old_value_gone FROM hot_test WHERE indexed_col = 500;
+
+found_new_value
+---------------
+t
+(1 row)
+
+old_value_gone
+--------------
+t
+(1 row)
+
+
+starting permutation: s3_begin s3_select_for_update s1_begin s1_hot_update s3_commit s1_commit s1_verify_hot
+step s3_begin: BEGIN;
+step s3_select_for_update: SELECT * FROM hot_test WHERE id = 50 FOR UPDATE;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+50| 500|initial50
+(1 row)
+
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'hot_updated' WHERE id = 50;
+step s3_commit: COMMIT;
+step s1_hot_update: <... completed>
+step s1_commit: COMMIT;
+step s1_verify_hot:
+ -- Verify HOT chain exists for row with id=50
+ -- Use actual ctid to find the correct page
+ SELECT EXISTS (
+ SELECT 1 FROM heap_page_items(
+ get_raw_page('hot_test', (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50))
+ )
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND t_ctid != ('(' || (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50) || ',' || lp || ')')::tid
+ AND (t_ctid::text::point)[0]::int = (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50))
+ ) AS has_hot_chain;
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+
+starting permutation: s1_begin s1_hot_update s3_begin s3_select_for_update s1_commit s3_commit
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'hot_updated' WHERE id = 50;
+step s3_begin: BEGIN;
+step s3_select_for_update: SELECT * FROM hot_test WHERE id = 50 FOR UPDATE;
+step s1_commit: COMMIT;
+step s3_select_for_update: <... completed>
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+50| 500|hot_updated
+(1 row)
+
+step s3_commit: COMMIT;
+
+starting permutation: s4_begin s4_select_for_key_share s1_begin s1_hot_update s4_commit s1_commit s1_verify_hot
+step s4_begin: BEGIN;
+step s4_select_for_key_share: SELECT * FROM hot_test WHERE id = 50 FOR KEY SHARE;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+50| 500|initial50
+(1 row)
+
+step s1_begin: BEGIN;
+step s1_hot_update: UPDATE hot_test SET non_indexed_col = 'hot_updated' WHERE id = 50;
+step s4_commit: COMMIT;
+step s1_commit: COMMIT;
+step s1_verify_hot:
+ -- Verify HOT chain exists for row with id=50
+ -- Use actual ctid to find the correct page
+ SELECT EXISTS (
+ SELECT 1 FROM heap_page_items(
+ get_raw_page('hot_test', (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50))
+ )
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND t_ctid != ('(' || (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50) || ',' || lp || ')')::tid
+ AND (t_ctid::text::point)[0]::int = (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50))
+ ) AS has_hot_chain;
+
+has_hot_chain
+-------------
+t
+(1 row)
+
+
+starting permutation: s4_begin s4_select_for_key_share s1_begin s1_non_hot_update s4_commit s1_commit
+step s4_begin: BEGIN;
+step s4_select_for_key_share: SELECT * FROM hot_test WHERE id = 50 FOR KEY SHARE;
+id|indexed_col|non_indexed_col
+--+-----------+---------------
+50| 500|initial50
+(1 row)
+
+step s1_begin: BEGIN;
+step s1_non_hot_update: UPDATE hot_test SET indexed_col = 555 WHERE id = 50;
+step s4_commit: COMMIT;
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index 4e466580cd4d8..33d3ba38e94fb 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -19,6 +19,10 @@ test: multiple-row-versions
test: index-only-scan
test: index-only-bitmapscan
test: predicate-lock-hot-tuple
+test: hot_updates_concurrent
+test: hot_updates_index_scan
+test: hot_updates_chain
+test: hot_updates_ddl_concurrent
test: update-conflict-out
test: deadlock-simple
test: deadlock-hard
diff --git a/src/test/isolation/specs/hot_updates_chain.spec b/src/test/isolation/specs/hot_updates_chain.spec
new file mode 100644
index 0000000000000..85cd21761333a
--- /dev/null
+++ b/src/test/isolation/specs/hot_updates_chain.spec
@@ -0,0 +1,110 @@
+# Test HOT update chains and their interaction with VACUUM and page pruning
+#
+# This test verifies that HOT update chains are correctly maintained when
+# multiple HOT updates occur on the same row, and that VACUUM correctly
+# handles HOT chains.
+
+setup
+{
+ CREATE EXTENSION IF NOT EXISTS pageinspect;
+
+ CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ non_indexed_col text
+ );
+
+ CREATE INDEX hot_test_indexed_idx ON hot_test(indexed_col);
+
+ INSERT INTO hot_test VALUES (1, 100, 'initial');
+ INSERT INTO hot_test VALUES (2, 200, 'initial');
+}
+
+teardown
+{
+ DROP TABLE hot_test;
+ DROP EXTENSION pageinspect;
+}
+
+# Session 1: Create HOT chain with multiple updates
+session s1
+step s1_begin { BEGIN; }
+step s1_hot_update1 { UPDATE hot_test SET non_indexed_col = 'update1' WHERE id = 1; }
+step s1_hot_update2 { UPDATE hot_test SET non_indexed_col = 'update2' WHERE id = 1; }
+step s1_hot_update3 { UPDATE hot_test SET non_indexed_col = 'update3' WHERE id = 1; }
+step s1_commit { COMMIT; }
+step s1_select { SELECT * FROM hot_test WHERE id = 1; }
+step s1_verify_hot {
+ -- Check for HOT chain: LP_REDIRECT or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+}
+
+# Session 2: Read while HOT chain is being built
+session s2
+step s2_begin { BEGIN ISOLATION LEVEL REPEATABLE READ; }
+step s2_select_before { SELECT non_indexed_col FROM hot_test WHERE id = 1; }
+step s2_select_after { SELECT non_indexed_col FROM hot_test WHERE id = 1; }
+step s2_commit { COMMIT; }
+
+# Session 3: Break HOT chain with non-HOT update
+session s3
+step s3_begin { BEGIN; }
+step s3_non_hot_update { UPDATE hot_test SET indexed_col = 150 WHERE id = 1; }
+step s3_commit { COMMIT; }
+
+# Session 4: Try to build HOT chain after non-HOT update
+session s4
+step s4_begin { BEGIN; }
+step s4_hot_after_non_hot { UPDATE hot_test SET non_indexed_col = 'after_non_hot' WHERE id = 1; }
+step s4_commit { COMMIT; }
+step s4_select { SELECT * FROM hot_test WHERE id = 1; }
+step s4_verify_hot {
+ -- Check for new HOT chain after non-HOT update broke the previous chain
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0
+ AND t_ctid != ('(0,' || lp || ')')::tid);
+}
+
+# Session 5: Multiple sessions building separate HOT chains on different rows
+session s5
+step s5_begin { BEGIN; }
+step s5_hot_update_row2_1 { UPDATE hot_test SET non_indexed_col = 'row2_update1' WHERE id = 2; }
+step s5_hot_update_row2_2 { UPDATE hot_test SET non_indexed_col = 'row2_update2' WHERE id = 2; }
+step s5_commit { COMMIT; }
+step s5_select { SELECT * FROM hot_test WHERE id = 2; }
+step s5_verify_hot {
+ -- Check for HOT chain on page 0
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0
+ AND t_ctid != ('(0,' || lp || ')')::tid);
+}
+
+# Build HOT chain within single transaction
+# All updates should form a HOT chain
+permutation s1_begin s1_hot_update1 s1_hot_update2 s1_hot_update3 s1_commit s1_select s1_verify_hot
+
+# REPEATABLE READ should see consistent snapshot across HOT chain updates
+# Session 2 starts before updates, should see 'initial' throughout
+permutation s2_begin s2_select_before s1_begin s1_hot_update1 s1_hot_update2 s1_commit s2_select_after s2_commit
+
+# HOT chain followed by non-HOT update
+# Non-HOT update breaks the HOT chain
+permutation s1_begin s1_hot_update1 s1_hot_update2 s1_commit s3_begin s3_non_hot_update s3_commit s1_select
+
+# HOT update after non-HOT update can start new HOT chain
+# After breaking chain with indexed column update, new HOT updates can start fresh chain
+permutation s1_begin s1_hot_update1 s1_commit s3_begin s3_non_hot_update s3_commit s4_begin s4_hot_after_non_hot s4_commit s4_select s4_verify_hot
+
+# Multiple sessions building separate HOT chains on different rows
+permutation s1_begin s1_hot_update1 s1_hot_update2 s5_begin s5_hot_update_row2_1 s5_hot_update_row2_2 s1_commit s5_commit s1_select s5_select s1_verify_hot s5_verify_hot
diff --git a/src/test/isolation/specs/hot_updates_concurrent.spec b/src/test/isolation/specs/hot_updates_concurrent.spec
new file mode 100644
index 0000000000000..eac78d62ac561
--- /dev/null
+++ b/src/test/isolation/specs/hot_updates_concurrent.spec
@@ -0,0 +1,107 @@
+# Test concurrent HOT updates and validate HOT chains
+#
+# This test verifies that HOT updates work correctly when multiple sessions
+# are updating the same table concurrently, and validates that HOT chains
+# are actually created using heap_page_items().
+
+setup
+{
+ CREATE EXTENSION IF NOT EXISTS pageinspect;
+
+ CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ non_indexed_col text
+ );
+
+ CREATE INDEX hot_test_indexed_idx ON hot_test(indexed_col);
+
+ INSERT INTO hot_test VALUES (1, 100, 'initial1');
+ INSERT INTO hot_test VALUES (2, 200, 'initial2');
+ INSERT INTO hot_test VALUES (3, 300, 'initial3');
+}
+
+teardown
+{
+ DROP TABLE hot_test;
+ DROP EXTENSION pageinspect;
+}
+
+# Session 1: HOT update (modify non-indexed column)
+session s1
+step s1_begin { BEGIN; }
+step s1_hot_update { UPDATE hot_test SET non_indexed_col = 'updated_s1' WHERE id = 1; }
+step s1_commit { COMMIT; }
+step s1_select { SELECT * FROM hot_test WHERE id = 1; }
+step s1_verify_hot {
+ -- Check for HOT chain: look for LP_REDIRECT (lp_flags=2) or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+}
+
+# Session 2: HOT update (modify non-indexed column on same row)
+session s2
+step s2_begin { BEGIN; }
+step s2_hot_update { UPDATE hot_test SET non_indexed_col = 'updated_s2' WHERE id = 1; }
+step s2_commit { COMMIT; }
+step s2_select { SELECT * FROM hot_test WHERE id = 1; }
+step s2_verify_hot {
+ -- Check for HOT chain: look for LP_REDIRECT (lp_flags=2) or tuple with t_ctid pointing to same page
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2 -- LP_REDIRECT indicates HOT chain
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0 -- same page
+ AND t_ctid != ('(0,' || lp || ')')::tid); -- different offset
+}
+
+# Session 3: Non-HOT update (modify indexed column)
+session s3
+step s3_begin { BEGIN; }
+step s3_non_hot_update { UPDATE hot_test SET indexed_col = 150 WHERE id = 1; }
+step s3_commit { COMMIT; }
+step s3_select { SELECT * FROM hot_test WHERE id = 1; }
+step s3_verify_index {
+ -- Verify index was updated (proves non-HOT)
+ SELECT COUNT(*) = 1 AS index_updated FROM hot_test WHERE indexed_col = 150;
+ SELECT COUNT(*) = 0 AS old_value_gone FROM hot_test WHERE indexed_col = 100;
+}
+
+# Session 4: Concurrent HOT updates on different rows
+session s4
+step s4_begin { BEGIN; }
+step s4_hot_update_row2 { UPDATE hot_test SET non_indexed_col = 'updated_s4' WHERE id = 2; }
+step s4_commit { COMMIT; }
+step s4_select { SELECT * FROM hot_test WHERE id = 2; }
+step s4_verify_hot {
+ -- Check for HOT chain on page 0
+ SELECT COUNT(*) > 0 AS has_hot_chain
+ FROM heap_page_items(get_raw_page('hot_test', 0))
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND (t_ctid::text::point)[0]::int = 0
+ AND t_ctid != ('(0,' || lp || ')')::tid);
+}
+
+# Two sessions both doing HOT updates on same row
+# Second session should block until first commits
+# Both should create HOT chains
+permutation s1_begin s1_hot_update s2_begin s2_hot_update s1_commit s2_commit s1_select s2_select s2_verify_hot
+
+# HOT update followed by non-HOT update
+# Non-HOT update should wait for HOT update to commit
+# First update is HOT, second is non-HOT (index updated)
+permutation s1_begin s1_hot_update s3_begin s3_non_hot_update s1_commit s3_commit s3_select s3_verify_index
+
+# Non-HOT update followed by HOT update
+# HOT update should wait for non-HOT update to commit
+# First update is non-HOT (index), second is HOT
+permutation s3_begin s3_non_hot_update s1_begin s1_hot_update s3_commit s1_commit s1_select s1_verify_hot
+
+# Concurrent HOT updates on different rows (should not block)
+# Both sessions should be able to create HOT chains independently
+permutation s1_begin s1_hot_update s4_begin s4_hot_update_row2 s1_commit s4_commit s1_select s4_select s1_verify_hot s4_verify_hot
diff --git a/src/test/isolation/specs/hot_updates_ddl_concurrent.spec b/src/test/isolation/specs/hot_updates_ddl_concurrent.spec
new file mode 100644
index 0000000000000..f5d9d7e2b577e
--- /dev/null
+++ b/src/test/isolation/specs/hot_updates_ddl_concurrent.spec
@@ -0,0 +1,52 @@
+# Test HOT updates concurrent with CREATE INDEX on JSONB expression
+#
+# This test verifies that HOT updates interact correctly with concurrent
+# CREATE INDEX operations. When a new index is created on a JSONB expression,
+# subsequent updates that touch the newly indexed subpath must stop using HOT.
+#
+# Note: We use jsonb_build_object() instead of JSON literals because the
+# isolation test parser treats "}" as end-of-SQL-block.
+
+setup
+{
+ CREATE TABLE hot_ddl_test (
+ id int PRIMARY KEY,
+ data jsonb
+ );
+
+ INSERT INTO hot_ddl_test VALUES (
+ 1,
+ jsonb_build_object('status', 'active', 'count', 0, 'name', 'test')
+ );
+
+ CREATE INDEX hot_ddl_status_idx ON hot_ddl_test((data->'status'));
+}
+
+teardown
+{
+ DROP TABLE hot_ddl_test;
+}
+
+session s1
+step s1_update_count_before {
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['count'], '1') WHERE id = 1;
+}
+step s1_update_name_before {
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['name'], '"updated"') WHERE id = 1;
+}
+step s1_update_count_after {
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['count'], '2') WHERE id = 1;
+}
+step s1_update_name_after {
+ UPDATE hot_ddl_test SET data = jsonb_set(data, array['name'], '"still_hot"') WHERE id = 1;
+}
+step s1_verify {
+ SELECT * FROM hot_ddl_test WHERE id = 1;
+}
+
+session s2
+step s2_create_index {
+ CREATE INDEX hot_ddl_count_idx ON hot_ddl_test((data->'count'));
+}
+
+permutation s1_update_count_before s1_update_name_before s2_create_index s1_update_count_after s1_update_name_after s1_verify
diff --git a/src/test/isolation/specs/hot_updates_index_scan.spec b/src/test/isolation/specs/hot_updates_index_scan.spec
new file mode 100644
index 0000000000000..70c3dae51667d
--- /dev/null
+++ b/src/test/isolation/specs/hot_updates_index_scan.spec
@@ -0,0 +1,94 @@
+# Test HOT updates interaction with index scans and SELECT FOR UPDATE
+#
+# This test verifies that HOT updates are correctly handled when concurrent
+# sessions are performing index scans, using SELECT FOR UPDATE, and validates
+# HOT chains using heap_page_items().
+
+setup
+{
+ CREATE EXTENSION IF NOT EXISTS pageinspect;
+
+ CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ non_indexed_col text
+ );
+
+ CREATE INDEX hot_test_indexed_idx ON hot_test(indexed_col);
+
+ INSERT INTO hot_test SELECT i, i * 10, 'initial' || i FROM generate_series(1, 100) i;
+}
+
+teardown
+{
+ DROP TABLE hot_test;
+ DROP EXTENSION pageinspect;
+}
+
+# Session 1: Perform HOT update
+session s1
+step s1_begin { BEGIN; }
+step s1_hot_update { UPDATE hot_test SET non_indexed_col = 'hot_updated' WHERE id = 50; }
+step s1_non_hot_update { UPDATE hot_test SET indexed_col = 555 WHERE id = 50; }
+step s1_commit { COMMIT; }
+step s1_verify_hot {
+ -- Verify HOT chain exists for row with id=50
+ -- Use actual ctid to find the correct page
+ SELECT EXISTS (
+ SELECT 1 FROM heap_page_items(
+ get_raw_page('hot_test', (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50))
+ )
+ WHERE lp_flags = 2
+ OR (t_ctid IS NOT NULL
+ AND t_ctid != ('(' || (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50) || ',' || lp || ')')::tid
+ AND (t_ctid::text::point)[0]::int = (SELECT (ctid::text::point)[0]::int FROM hot_test WHERE id = 50))
+ ) AS has_hot_chain;
+}
+
+# Session 2: Index scan while HOT update in progress
+session s2
+step s2_begin { BEGIN; }
+step s2_index_scan { SELECT * FROM hot_test WHERE indexed_col = 500; }
+step s2_index_scan_new { SELECT * FROM hot_test WHERE indexed_col = 555; }
+step s2_commit { COMMIT; }
+step s2_verify_index {
+ -- After non-HOT update, verify index reflects the change
+ SELECT COUNT(*) = 1 AS found_new_value FROM hot_test WHERE indexed_col = 555;
+ SELECT COUNT(*) = 0 AS old_value_gone FROM hot_test WHERE indexed_col = 500;
+}
+
+# Session 3: SELECT FOR UPDATE
+session s3
+step s3_begin { BEGIN; }
+step s3_select_for_update { SELECT * FROM hot_test WHERE id = 50 FOR UPDATE; }
+step s3_commit { COMMIT; }
+
+# Session 4: SELECT FOR KEY SHARE (should not block HOT update of non-key column)
+session s4
+step s4_begin { BEGIN; }
+step s4_select_for_key_share { SELECT * FROM hot_test WHERE id = 50 FOR KEY SHARE; }
+step s4_commit { COMMIT; }
+
+# Index scan should see consistent snapshot during HOT update
+# Index scan starts before HOT update commits
+permutation s1_begin s1_hot_update s2_begin s2_index_scan s1_commit s2_commit
+
+# Index scan after non-HOT update should see new index entry
+# Index scan starts after non-HOT update commits
+permutation s1_begin s1_non_hot_update s1_commit s2_begin s2_index_scan_new s2_commit s2_verify_index
+
+# SELECT FOR UPDATE blocks HOT update
+# FOR UPDATE should block the UPDATE until SELECT commits
+permutation s3_begin s3_select_for_update s1_begin s1_hot_update s3_commit s1_commit s1_verify_hot
+
+# HOT update blocks SELECT FOR UPDATE
+# SELECT FOR UPDATE should wait for HOT update to commit
+permutation s1_begin s1_hot_update s3_begin s3_select_for_update s1_commit s3_commit
+
+# SELECT FOR KEY SHARE should not block HOT update (non-key column)
+# HOT update of non-indexed column should not conflict with FOR KEY SHARE
+permutation s4_begin s4_select_for_key_share s1_begin s1_hot_update s4_commit s1_commit s1_verify_hot
+
+# Non-HOT update (key column) should block after FOR KEY SHARE
+# Non-HOT update of indexed column should wait for FOR KEY SHARE
+permutation s4_begin s4_select_for_key_share s1_begin s1_non_hot_update s4_commit s1_commit
diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c
index 31f8d2b816155..ab1983c3a13e5 100644
--- a/src/test/modules/dummy_index_am/dummy_index_am.c
+++ b/src/test/modules/dummy_index_am/dummy_index_am.c
@@ -341,6 +341,7 @@ dihandler(PG_FUNCTION_ARGS)
.amestimateparallelscan = NULL,
.aminitparallelscan = NULL,
.amparallelrescan = NULL,
+ .amcomparedatums = NULL,
};
PG_RETURN_POINTER(&amroutine);
diff --git a/src/test/regress/expected/generated_virtual.out b/src/test/regress/expected/generated_virtual.out
index 6dab60c937b56..7ebb7890d9657 100644
--- a/src/test/regress/expected/generated_virtual.out
+++ b/src/test/regress/expected/generated_virtual.out
@@ -287,7 +287,7 @@ DETAIL: Column "b" is a generated column.
INSERT INTO gtest1v VALUES (8, DEFAULT), (9, DEFAULT); -- error
ERROR: cannot insert a non-DEFAULT value into column "b"
DETAIL: Column "b" is a generated column.
-SELECT * FROM gtest1v;
+SELECT * FROM gtest1v ORDER BY a;
a | b
---+----
3 | 6
diff --git a/src/test/regress/expected/hot_updates.out b/src/test/regress/expected/hot_updates.out
new file mode 100644
index 0000000000000..2a34ada8b2338
--- /dev/null
+++ b/src/test/regress/expected/hot_updates.out
@@ -0,0 +1,1314 @@
+--
+-- HOT_UPDATES
+-- Test Heap-Only Tuple (HOT) update decisions
+--
+-- This test systematically verifies that HOT updates are used when appropriate
+-- and avoided when necessary (e.g., when indexed columns are modified).
+--
+-- We use multiple validation methods:
+-- 1. Index verification (index still works = proves no index update for HOT)
+-- 2. Statistics functions (pg_stat_get_tuples_hot_updated)
+-- 3. pageinspect extension for HOT chain examination
+--
+-- Load required extensions
+CREATE EXTENSION IF NOT EXISTS pageinspect;
+-- Function to get HOT update count
+CREATE OR REPLACE FUNCTION get_hot_count(rel_name text)
+RETURNS TABLE (
+ updates BIGINT,
+ hot BIGINT
+) AS $$
+DECLARE
+ rel_oid oid;
+BEGIN
+ rel_oid := rel_name::regclass::oid;
+
+ -- Force stats flush and use only shared stats to avoid double-counting
+ PERFORM pg_stat_force_next_flush();
+ PERFORM pg_sleep(0.1);
+
+ -- Use only shared stats (after flush, xact stats are included in shared)
+ updates := COALESCE(pg_stat_get_tuples_updated(rel_oid), 0);
+ hot := COALESCE(pg_stat_get_tuples_hot_updated(rel_oid), 0);
+
+ RETURN NEXT;
+END;
+$$ LANGUAGE plpgsql;
+-- Check if a tuple is part of a HOT chain (has a predecessor on same page)
+CREATE OR REPLACE FUNCTION has_hot_chain(rel_name text, target_ctid tid)
+RETURNS boolean AS $$
+DECLARE
+ block_num int;
+ page_item record;
+BEGIN
+ block_num := (target_ctid::text::point)[0]::int;
+
+ -- Look for a different tuple on the same page that points to our target tuple
+ FOR page_item IN
+ SELECT lp, lp_flags, t_ctid
+ FROM heap_page_items(get_raw_page(rel_name, block_num))
+ WHERE lp_flags = 1
+ AND t_ctid IS NOT NULL
+ AND t_ctid = target_ctid
+ AND ('(' || block_num::text || ',' || lp::text || ')')::tid != target_ctid
+ LOOP
+ RETURN true;
+ END LOOP;
+
+ RETURN false;
+END;
+$$ LANGUAGE plpgsql;
+-- Print the HOT chain starting from a given tuple
+CREATE OR REPLACE FUNCTION print_hot_chain(rel_name text, start_ctid tid)
+RETURNS TABLE(chain_position int, ctid tid, lp_flags text, t_ctid tid, chain_end boolean) AS
+$$
+#variable_conflict use_column
+DECLARE
+ block_num int;
+ line_ptr int;
+ current_ctid tid := start_ctid;
+ next_ctid tid;
+ position int := 0;
+ max_iterations int := 100;
+ page_item record;
+ found_predecessor boolean := false;
+ flags_name text;
+BEGIN
+ block_num := (start_ctid::text::point)[0]::int;
+
+ -- Find the predecessor (old tuple pointing to our start_ctid)
+ FOR page_item IN
+ SELECT lp, lp_flags, t_ctid
+ FROM heap_page_items(get_raw_page(rel_name, block_num))
+ WHERE lp_flags = 1
+ AND t_ctid = start_ctid
+ LOOP
+ current_ctid := ('(' || block_num::text || ',' || page_item.lp::text || ')')::tid;
+ found_predecessor := true;
+ EXIT;
+ END LOOP;
+
+ -- If no predecessor found, start with the given ctid
+ IF NOT found_predecessor THEN
+ current_ctid := start_ctid;
+ END IF;
+
+ -- Follow the chain forward
+ WHILE position < max_iterations LOOP
+ line_ptr := (current_ctid::text::point)[1]::int;
+
+ FOR page_item IN
+ SELECT lp, lp_flags, t_ctid
+ FROM heap_page_items(get_raw_page(rel_name, block_num))
+ WHERE lp = line_ptr
+ LOOP
+ -- Map lp_flags to names
+ flags_name := CASE page_item.lp_flags
+ WHEN 0 THEN 'unused (0)'
+ WHEN 1 THEN 'normal (1)'
+ WHEN 2 THEN 'redirect (2)'
+ WHEN 3 THEN 'dead (3)'
+ ELSE 'unknown (' || page_item.lp_flags::text || ')'
+ END;
+
+ RETURN QUERY SELECT
+ position,
+ current_ctid,
+ flags_name,
+ page_item.t_ctid,
+ (page_item.t_ctid IS NULL OR page_item.t_ctid = current_ctid)::boolean
+ ;
+
+ IF page_item.t_ctid IS NULL OR page_item.t_ctid = current_ctid THEN
+ RETURN;
+ END IF;
+
+ next_ctid := page_item.t_ctid;
+
+ IF (next_ctid::text::point)[0]::int != block_num THEN
+ RETURN;
+ END IF;
+
+ current_ctid := next_ctid;
+ position := position + 1;
+ END LOOP;
+
+ IF position = 0 THEN
+ RETURN;
+ END IF;
+ END LOOP;
+END;
+$$ LANGUAGE plpgsql;
+-- Basic HOT update (update non-indexed column)
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ non_indexed_col text
+) WITH (fillfactor = 50);
+CREATE INDEX hot_test_indexed_idx ON hot_test(indexed_col);
+INSERT INTO hot_test VALUES (1, 100, 'initial');
+INSERT INTO hot_test VALUES (2, 200, 'initial');
+INSERT INTO hot_test VALUES (3, 300, 'initial');
+-- Get baseline
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Should be HOT updates (only non-indexed column modified)
+UPDATE hot_test SET non_indexed_col = 'updated1' WHERE id = 1;
+UPDATE hot_test SET non_indexed_col = 'updated2' WHERE id = 2;
+UPDATE hot_test SET non_indexed_col = 'updated3' WHERE id = 3;
+-- Verify HOT updates occurred
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Dump the HOT chain before VACUUMing
+WITH current_tuple AS (
+ SELECT ctid FROM hot_test WHERE id = 1
+)
+SELECT
+ has_hot_chain('hot_test', current_tuple.ctid) AS has_chain,
+ chain_position,
+ print_hot_chain.ctid,
+ lp_flags,
+ t_ctid
+FROM current_tuple,
+LATERAL print_hot_chain('hot_test', current_tuple.ctid);
+ has_chain | chain_position | ctid | lp_flags | t_ctid
+-----------+----------------+-------+------------+--------
+ t | 0 | (0,1) | normal (1) | (0,4)
+ t | 1 | (0,4) | normal (1) | (0,4)
+(2 rows)
+
+SET SESSION enable_seqscan = OFF;
+SET SESSION enable_bitmapscan = OFF;
+-- Verify indexes still work
+SELECT id, indexed_col FROM hot_test WHERE indexed_col = 100;
+ id | indexed_col
+----+-------------
+ 1 | 100
+(1 row)
+
+SELECT id, indexed_col FROM hot_test WHERE indexed_col = 200;
+ id | indexed_col
+----+-------------
+ 2 | 200
+(1 row)
+
+-- Vacuum the relation, expect the HOT chain to collapse
+VACUUM hot_test;
+-- Show that there is no chain after vacuum
+WITH current_tuple AS (
+ SELECT ctid FROM hot_test WHERE id = 1
+)
+SELECT
+ has_hot_chain('hot_test', current_tuple.ctid) AS has_chain,
+ chain_position,
+ print_hot_chain.ctid,
+ lp_flags,
+ t_ctid
+FROM current_tuple,
+LATERAL print_hot_chain('hot_test', current_tuple.ctid);
+ has_chain | chain_position | ctid | lp_flags | t_ctid
+-----------+----------------+-------+------------+--------
+ f | 0 | (0,4) | normal (1) | (0,4)
+(1 row)
+
+-- Non-HOT update (update indexed column)
+UPDATE hot_test SET indexed_col = 150 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 3 | 3
+(1 row)
+
+-- Verify index was updated (new value findable)
+EXPLAIN (COSTS OFF) SELECT id, indexed_col FROM hot_test WHERE indexed_col = 150;
+ QUERY PLAN
+---------------------------------------------------
+ Index Scan using hot_test_indexed_idx on hot_test
+ Index Cond: (indexed_col = 150)
+(2 rows)
+
+SELECT id, indexed_col FROM hot_test WHERE indexed_col = 150;
+ id | indexed_col
+----+-------------
+ 1 | 150
+(1 row)
+
+-- Verify old value no longer in index
+EXPLAIN (COSTS OFF) SELECT id FROM hot_test WHERE indexed_col = 100;
+ QUERY PLAN
+---------------------------------------------------
+ Index Scan using hot_test_indexed_idx on hot_test
+ Index Cond: (indexed_col = 100)
+(2 rows)
+
+SELECT id FROM hot_test WHERE indexed_col = 100;
+ id
+----
+(0 rows)
+
+SET SESSION enable_seqscan = ON;
+SET SESSION enable_bitmapscan = ON;
+-- All-or-none property: updating one indexed column requires ALL index updates
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ col_a int,
+ col_b int,
+ col_c int,
+ non_indexed text
+) WITH (fillfactor = 50);
+CREATE INDEX hot_test_a_idx ON hot_test(col_a);
+CREATE INDEX hot_test_b_idx ON hot_test(col_b);
+CREATE INDEX hot_test_c_idx ON hot_test(col_c);
+INSERT INTO hot_test VALUES (1, 10, 20, 30, 'initial');
+-- Update only col_a - should NOT be HOT because an indexed column changed
+-- This means ALL indexes must be updated (all-or-none property)
+UPDATE hot_test SET col_a = 15 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Verify all three indexes still work correctly
+SELECT id, col_a FROM hot_test WHERE col_a = 15; -- updated index
+ id | col_a
+----+-------
+ 1 | 15
+(1 row)
+
+SELECT id, col_b FROM hot_test WHERE col_b = 20; -- unchanged index
+ id | col_b
+----+-------
+ 1 | 20
+(1 row)
+
+SELECT id, col_c FROM hot_test WHERE col_c = 30; -- unchanged index
+ id | col_c
+----+-------
+ 1 | 30
+(1 row)
+
+-- Now update only non-indexed column - should be HOT
+UPDATE hot_test SET non_indexed = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 1 | 0
+(1 row)
+
+-- Verify all indexes still work
+SELECT id FROM hot_test WHERE col_a = 15 AND col_b = 20 AND col_c = 30;
+ id
+----
+ 1
+(1 row)
+
+-- Partial index: both old and new outside predicate (conservative = non-HOT)
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ status text,
+ data text
+) WITH (fillfactor = 50);
+-- Partial index only covers status = 'active'
+CREATE INDEX hot_test_active_idx ON hot_test(status) WHERE status = 'active';
+INSERT INTO hot_test VALUES (1, 'active', 'data1');
+INSERT INTO hot_test VALUES (2, 'inactive', 'data2');
+INSERT INTO hot_test VALUES (3, 'deleted', 'data3');
+-- Update non-indexed column on 'active' row (in predicate, status unchanged)
+-- Should be HOT
+UPDATE hot_test SET data = 'updated1' WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Update non-indexed column on 'inactive' row (outside predicate)
+-- Should be HOT
+UPDATE hot_test SET data = 'updated2' WHERE id = 2;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 1 | 1
+(1 row)
+
+-- Update status from 'inactive' to 'deleted' (both outside predicate)
+-- PostgreSQL is conservative: heap insert happens before predicate check
+-- So this is NON-HOT even though both values are outside predicate
+UPDATE hot_test SET status = 'deleted' WHERE id = 2;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 2 | 2
+(1 row)
+
+-- Verify index still works for 'active' rows
+SELECT id, status FROM hot_test WHERE status = 'active';
+ id | status
+----+--------
+ 1 | active
+(1 row)
+
+-- Only BRIN (summarizing) indexes on non-PK columns
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ ts timestamp,
+ value int,
+ brin_col int
+) WITH (fillfactor = 50);
+CREATE INDEX hot_test_ts_brin ON hot_test USING brin(ts);
+CREATE INDEX hot_test_brin_col_brin ON hot_test USING brin(brin_col);
+INSERT INTO hot_test VALUES (1, '2024-01-01', 100, 1000);
+-- Update both BRIN columns - should still be HOT (only summarizing indexes)
+UPDATE hot_test SET ts = '2024-01-02', brin_col = 2000 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Verify BRIN indexes work
+SELECT id FROM hot_test WHERE ts >= '2024-01-02';
+ id
+----
+ 1
+(1 row)
+
+SELECT id FROM hot_test WHERE brin_col >= 2000;
+ id
+----
+ 1
+(1 row)
+
+-- Update non-indexed column - should also be HOT
+UPDATE hot_test SET value = 200 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 1 | 1
+(1 row)
+
+-- TOAST and HOT: TOASTed columns can participate in HOT
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ large_text text,
+ small_text text
+) WITH (fillfactor = 50);
+CREATE INDEX hot_test_idx ON hot_test(indexed_col);
+-- Insert row with TOASTed column (> 2KB)
+INSERT INTO hot_test VALUES (1, 100, repeat('x', 3000), 'small');
+-- Update non-indexed, non-TOASTed column - should be HOT
+UPDATE hot_test SET small_text = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Update TOASTed column - should be HOT if indexed column unchanged
+UPDATE hot_test SET large_text = repeat('y', 3000);
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 1 | 1
+(1 row)
+
+-- Verify index still works
+SELECT id FROM hot_test WHERE indexed_col = 100;
+ id
+----
+ 1
+(1 row)
+
+-- Update indexed column - should NOT be HOT
+UPDATE hot_test SET indexed_col = 200;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 2 | 2
+(1 row)
+
+-- Verify index was updated
+SELECT id FROM hot_test WHERE indexed_col = 200;
+ id
+----
+ 1
+(1 row)
+
+-- Unique constraint (unique index) behaves like regular index
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ unique_col int UNIQUE,
+ data text
+) WITH (fillfactor = 50);
+INSERT INTO hot_test VALUES (1, 100, 'data1');
+INSERT INTO hot_test VALUES (2, 200, 'data2');
+-- Update data (non-indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Verify unique constraint still enforced
+SELECT id, unique_col, data FROM hot_test ORDER BY id;
+ id | unique_col | data
+----+------------+---------
+ 1 | 100 | updated
+ 2 | 200 | updated
+(2 rows)
+
+-- This should fail (unique violation)
+UPDATE hot_test SET unique_col = 100 WHERE id = 2;
+ERROR: duplicate key value violates unique constraint "hot_test_unique_col_key"
+DETAIL: Key (unique_col)=(100) already exists.
+-- Multi-column index: any column change = non-HOT
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ col_a int,
+ col_b int,
+ col_c int,
+ data text
+) WITH (fillfactor = 50);
+CREATE INDEX hot_test_ab_idx ON hot_test(col_a, col_b);
+INSERT INTO hot_test VALUES (1, 10, 20, 30, 'data');
+-- Update col_a (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_a = 15;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+-- Reset
+UPDATE hot_test SET col_a = 10;
+-- Update col_b (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_b = 25;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 1 | 0
+(1 row)
+
+-- Reset
+UPDATE hot_test SET col_b = 20;
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 3 | 0
+(1 row)
+
+-- Update col_c (not indexed) - should be HOT
+UPDATE hot_test SET col_c = 35;
+-- Update data (not indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+ updates | hot
+---------+-----
+ 4 | 0
+(1 row)
+
+-- Verify multi-column index works
+SELECT id FROM hot_test WHERE col_a = 10 AND col_b = 20;
+ id
+----
+ 1
+(1 row)
+
+-- Partitioned tables: HOT works within partitions
+DROP TABLE IF EXISTS hot_test_partitioned CASCADE;
+NOTICE: table "hot_test_partitioned" does not exist, skipping
+CREATE TABLE hot_test_partitioned (
+ id int,
+ partition_key int,
+ indexed_col int,
+ data text,
+ PRIMARY KEY (id, partition_key)
+) PARTITION BY RANGE (partition_key);
+CREATE TABLE hot_test_part1 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (1) TO (100) WITH (fillfactor = 50);
+CREATE TABLE hot_test_part2 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (100) TO (200) WITH (fillfactor = 50);
+CREATE INDEX hot_test_part_idx ON hot_test_partitioned(indexed_col);
+INSERT INTO hot_test_partitioned VALUES (1, 50, 100, 'initial1');
+INSERT INTO hot_test_partitioned VALUES (2, 150, 200, 'initial2');
+-- Update in partition 1 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated1' WHERE id = 1;
+-- Update in partition 2 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated2' WHERE id = 2;
+SELECT * FROM get_hot_count('hot_test_part1');
+ updates | hot
+---------+-----
+ 0 | 0
+(1 row)
+
+SELECT * FROM get_hot_count('hot_test_part2');
+ updates | hot
+---------+-----
+ 1 | 1
+(1 row)
+
+-- Verify indexes work on partitions
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 100;
+ id
+----
+ 1
+(1 row)
+
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 200;
+ id
+----
+ 2
+(1 row)
+
+-- Update indexed column in partition - should NOT be HOT
+UPDATE hot_test_partitioned SET indexed_col = 150 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test_part1');
+ updates | hot
+---------+-----
+ 1 | 1
+(1 row)
+
+-- Verify index was updated
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 150;
+ id
+----
+ 1
+(1 row)
+
+-- ============================================================================
+-- Test 1: Expression indexes with JSONB subpath tracking
+-- ============================================================================
+-- With the new subpath tracking feature, HOT updates are possible when
+-- only non-indexed JSONB subpaths are modified.
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+-- Indexes on specific JSONB subpaths
+CREATE INDEX hot_test_status_idx ON hot_test((data->'status'));
+CREATE INDEX hot_test_user_id_idx ON hot_test((data->'user'->'id'));
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "user": {"id": 123, "name": "Alice"}, "count": 0}'::jsonb
+);
+-- Baseline
+SELECT 'JSONB Test 1: Baseline' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------+---------+-----
+ JSONB Test 1: Baseline | 0 | 0
+(1 row)
+
+-- Update non-indexed subpath {count} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating count (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+--------------------------------------------------+---------+-----
+ JSONB Test 1: After updating count (non-indexed) | 0 | 0
+(1 row)
+
+-- Update different non-indexed subpath {user,name} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{user,name}', '"Bob"') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating user.name (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------------------+---------+-----
+ JSONB Test 1: After updating user.name (non-indexed) | 1 | 1
+(1 row)
+
+-- Update indexed subpath {status} - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating status (indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------------+---------+-----
+ JSONB Test 1: After updating status (indexed) | 2 | 2
+(1 row)
+
+-- Update indexed subpath {user,id} - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{user,id}', '456') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating user.id (indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------------+---------+-----
+ JSONB Test 1: After updating user.id (indexed) | 3 | 2
+(1 row)
+
+-- Verify indexes still work correctly
+SELECT id FROM hot_test WHERE data->'status' = '"inactive"'::jsonb;
+ id
+----
+ 1
+(1 row)
+
+SELECT id FROM hot_test WHERE data->'user'->'id' = '456'::jsonb;
+ id
+----
+ 1
+(1 row)
+
+-- ============================================================================
+-- Test 2: Nested paths and path intersection
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+CREATE INDEX hot_test_deep_idx ON hot_test((data->'a'->'b'->'c'));
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"a": {"b": {"c": "indexed", "d": "not-indexed"}}, "x": "other"}'::jsonb
+);
+SELECT 'JSONB Test 2: Baseline' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------+---------+-----
+ JSONB Test 2: Baseline | 0 | 0
+(1 row)
+
+-- Update sibling of indexed path {a,b,d} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{a,b,d}', '"updated"') WHERE id = 1;
+SELECT 'JSONB Test 2: After updating a.b.d (sibling, non-indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------------------------+---------+-----
+ JSONB Test 2: After updating a.b.d (sibling, non-indexed) | 0 | 0
+(1 row)
+
+-- Update unrelated path {x} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{x}', '"modified"') WHERE id = 1;
+SELECT 'JSONB Test 2: After updating x (unrelated path)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-------------------------------------------------+---------+-----
+ JSONB Test 2: After updating x (unrelated path) | 1 | 1
+(1 row)
+
+-- Update parent of indexed path {a,b} - should NOT be HOT (affects child)
+UPDATE hot_test SET data = jsonb_set(data, '{a,b}', '{"c": "new", "d": "data"}') WHERE id = 1;
+SELECT 'JSONB Test 2: After updating a.b (parent of indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------------------+---------+-----
+ JSONB Test 2: After updating a.b (parent of indexed) | 2 | 2
+(1 row)
+
+-- ============================================================================
+-- Test 3: Multiple JSONB mutation functions
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+CREATE INDEX hot_test_keep_idx ON hot_test((data->'keep'));
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"keep": "important", "remove": "unimportant", "extra": "data"}'::jsonb
+);
+SELECT 'JSONB Test 3: Baseline' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------+---------+-----
+ JSONB Test 3: Baseline | 0 | 0
+(1 row)
+
+-- jsonb_delete on non-indexed key - should be HOT
+UPDATE hot_test SET data = data - 'remove' WHERE id = 1;
+SELECT 'JSONB Test 3: After deleting non-indexed key' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+----------------------------------------------+---------+-----
+ JSONB Test 3: After deleting non-indexed key | 0 | 0
+(1 row)
+
+-- jsonb_set on non-indexed key - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{extra}', '"modified"') WHERE id = 1;
+SELECT 'JSONB Test 3: After modifying non-indexed key' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------------+---------+-----
+ JSONB Test 3: After modifying non-indexed key | 1 | 1
+(1 row)
+
+-- jsonb_delete on indexed key - should NOT be HOT
+UPDATE hot_test SET data = data - 'keep' WHERE id = 1;
+SELECT 'JSONB Test 3: After deleting indexed key' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------+---------+-----
+ JSONB Test 3: After deleting indexed key | 2 | 2
+(1 row)
+
+-- ============================================================================
+-- Test 4: Array operations
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+-- Index on array element
+CREATE INDEX hot_test_tags_idx ON hot_test((data->'tags'->0));
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"tags": ["indexed", "second", "third"], "other": "data"}'::jsonb
+);
+SELECT 'JSONB Test 4: Baseline' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------+---------+-----
+ JSONB Test 4: Baseline | 0 | 0
+(1 row)
+
+-- Update non-indexed array element - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{tags,1}', '"modified"') WHERE id = 1;
+SELECT 'JSONB Test 4: After updating tags[1]' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+--------------------------------------+---------+-----
+ JSONB Test 4: After updating tags[1] | 0 | 0
+(1 row)
+
+-- Update indexed array element - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{tags,0}', '"changed"') WHERE id = 1;
+SELECT 'JSONB Test 4: After updating tags[0] (indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------------+---------+-----
+ JSONB Test 4: After updating tags[0] (indexed) | 1 | 1
+(1 row)
+
+-- ============================================================================
+-- Test 5: Whole column index (no subpath)
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+-- Index on entire JSONB column (no subpath extraction)
+CREATE INDEX hot_test_whole_idx ON hot_test(data);
+INSERT INTO hot_test VALUES (1, '{"a": 1}'::jsonb);
+SELECT 'JSONB Test 5: Baseline' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------+---------+-----
+ JSONB Test 5: Baseline | 0 | 0
+(1 row)
+
+-- Any modification to data - should NOT be HOT (whole column indexed)
+UPDATE hot_test SET data = jsonb_set(data, '{a}', '2') WHERE id = 1;
+SELECT 'JSONB Test 5: After modifying any field (whole column indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+----------------------------------------------------------------+---------+-----
+ JSONB Test 5: After modifying any field (whole column indexed) | 0 | 0
+(1 row)
+
+-- ============================================================================
+-- Test 6: Performance at scale
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+CREATE INDEX hot_test_status_idx ON hot_test((data->'status'));
+CREATE INDEX hot_test_priority_idx ON hot_test((data->'priority'));
+-- Insert 100 rows
+INSERT INTO hot_test
+SELECT i, jsonb_build_object(
+ 'status', 'active',
+ 'priority', 1,
+ 'count', 0,
+ 'data', 'value_' || i
+)
+FROM generate_series(1, 100) i;
+SELECT 'JSONB Test 6: Baseline (100 rows)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------+---------+-----
+ JSONB Test 6: Baseline (100 rows) | 0 | 0
+(1 row)
+
+-- Update non-indexed fields on all rows - should all be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{count}', to_jsonb((data->>'count')::int + 1));
+SELECT 'JSONB Test 6: After updating 100 rows (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------------------+---------+-----
+ JSONB Test 6: After updating 100 rows (non-indexed) | 0 | 0
+(1 row)
+
+-- Verify correctness
+SELECT COUNT(*) AS rows_with_count_1 FROM hot_test WHERE (data->>'count')::int = 1;
+ rows_with_count_1
+-------------------
+ 100
+(1 row)
+
+-- Update indexed field on subset - should NOT be HOT for those rows
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"')
+WHERE id <= 10;
+SELECT 'JSONB Test 6: After updating 10 rows (indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------------+---------+-----
+ JSONB Test 6: After updating 10 rows (indexed) | 100 | 0
+(1 row)
+
+-- Verify indexes work
+SELECT COUNT(*) FROM hot_test WHERE data->>'status' = 'inactive';
+ count
+-------
+ 10
+(1 row)
+
+SELECT COUNT(*) FROM hot_test WHERE data->>'status' = 'active';
+ count
+-------
+ 90
+(1 row)
+
+-- Only BRIN (summarizing) indexes on non-PK columns
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ ts timestamp,
+ value int,
+ brin_col int
+);
+CREATE INDEX hot_test_ts_brin ON hot_test USING brin(ts);
+CREATE INDEX hot_test_brin_col_brin ON hot_test USING brin(brin_col);
+INSERT INTO hot_test VALUES (1, '2024-01-01', 100, 1000);
+-- Update both BRIN columns - should still be HOT (only summarizing indexes)
+UPDATE hot_test SET ts = '2024-01-02', brin_col = 2000 WHERE id = 1;
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (0,0)
+(1 row)
+
+-- Verify BRIN indexes work
+SELECT id FROM hot_test WHERE ts >= '2024-01-02';
+ id
+----
+ 1
+(1 row)
+
+SELECT id FROM hot_test WHERE brin_col >= 2000;
+ id
+----
+ 1
+(1 row)
+
+-- Update non-indexed column - should also be HOT
+UPDATE hot_test SET value = 200 WHERE id = 1;
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (1,1)
+(1 row)
+
+-- TOAST and HOT: TOASTed columns can participate in HOT
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ large_text text,
+ small_text text
+);
+CREATE INDEX hot_test_idx ON hot_test(indexed_col);
+-- Insert row with TOASTed column (> 2KB)
+INSERT INTO hot_test VALUES (1, 100, repeat('x', 3000), 'small');
+-- Update non-indexed, non-TOASTed column - should be HOT
+UPDATE hot_test SET small_text = 'updated';
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (0,0)
+(1 row)
+
+-- Update TOASTed column - should be HOT if indexed column unchanged
+UPDATE hot_test SET large_text = repeat('y', 3000);
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (1,1)
+(1 row)
+
+-- Verify index still works
+SELECT id FROM hot_test WHERE indexed_col = 100;
+ id
+----
+ 1
+(1 row)
+
+-- Update indexed column - should NOT be HOT
+UPDATE hot_test SET indexed_col = 200;
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (2,2)
+(1 row)
+
+-- Verify index was updated
+SELECT id FROM hot_test WHERE indexed_col = 200;
+ id
+----
+ 1
+(1 row)
+
+-- Unique constraint (unique index) behaves like regular index
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ unique_col int UNIQUE,
+ data text
+);
+INSERT INTO hot_test VALUES (1, 100, 'data1');
+INSERT INTO hot_test VALUES (2, 200, 'data2');
+-- Update data (non-indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (0,0)
+(1 row)
+
+-- Verify unique constraint still enforced
+SELECT id, unique_col, data FROM hot_test ORDER BY id;
+ id | unique_col | data
+----+------------+---------
+ 1 | 100 | updated
+ 2 | 200 | updated
+(2 rows)
+
+-- This should fail (unique violation)
+UPDATE hot_test SET unique_col = 100 WHERE id = 2;
+ERROR: duplicate key value violates unique constraint "hot_test_unique_col_key"
+DETAIL: Key (unique_col)=(100) already exists.
+-- Multi-column index: any column change = non-HOT
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ col_a int,
+ col_b int,
+ col_c int,
+ data text
+);
+CREATE INDEX hot_test_ab_idx ON hot_test(col_a, col_b);
+INSERT INTO hot_test VALUES (1, 10, 20, 30, 'data');
+-- Update col_a (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_a = 15;
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (0,0)
+(1 row)
+
+-- Reset
+UPDATE hot_test SET col_a = 10;
+-- Update col_b (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_b = 25;
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (1,0)
+(1 row)
+
+-- Reset
+UPDATE hot_test SET col_b = 20;
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (3,0)
+(1 row)
+
+-- Update col_c (not indexed) - should be HOT
+UPDATE hot_test SET col_c = 35;
+-- Update data (not indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT get_hot_count('hot_test');
+ get_hot_count
+---------------
+ (4,0)
+(1 row)
+
+-- Verify multi-column index works
+SELECT id FROM hot_test WHERE col_a = 10 AND col_b = 20;
+ id
+----
+ 1
+(1 row)
+
+-- Partitioned tables: HOT works within partitions
+DROP TABLE IF EXISTS hot_test_partitioned CASCADE;
+CREATE TABLE hot_test_partitioned (
+ id int,
+ partition_key int,
+ indexed_col int,
+ data text,
+ PRIMARY KEY (id, partition_key)
+) PARTITION BY RANGE (partition_key);
+CREATE TABLE hot_test_part1 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (1) TO (100);
+CREATE TABLE hot_test_part2 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (100) TO (200);
+CREATE INDEX hot_test_part_idx ON hot_test_partitioned(indexed_col);
+INSERT INTO hot_test_partitioned VALUES (1, 50, 100, 'initial1');
+INSERT INTO hot_test_partitioned VALUES (2, 150, 200, 'initial2');
+-- Update in partition 1 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated1' WHERE id = 1;
+-- Update in partition 2 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated2' WHERE id = 2;
+SELECT get_hot_count('hot_test_part1');
+ get_hot_count
+---------------
+ (0,0)
+(1 row)
+
+SELECT get_hot_count('hot_test_part2');
+ get_hot_count
+---------------
+ (1,1)
+(1 row)
+
+-- Verify indexes work on partitions
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 100;
+ id
+----
+ 1
+(1 row)
+
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 200;
+ id
+----
+ 2
+(1 row)
+
+-- Update indexed column in partition - should NOT be HOT
+UPDATE hot_test_partitioned SET indexed_col = 150 WHERE id = 1;
+SELECT get_hot_count('hot_test_part1');
+ get_hot_count
+---------------
+ (1,1)
+(1 row)
+
+-- Verify index was updated
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 150;
+ id
+----
+ 1
+(1 row)
+
+-- ============================================================================
+-- Test 7: REPLICA IDENTITY FULL with JSONB expression indexes
+-- ============================================================================
+-- REPLICA IDENTITY FULL causes the entire old tuple to be logged for
+-- logical replication, but should not affect HOT update decisions.
+DROP TABLE IF EXISTS hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb,
+ other_col text
+);
+ALTER TABLE hot_test REPLICA IDENTITY FULL;
+CREATE INDEX hot_test_ri_status_idx ON hot_test((data->'status'));
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "count": 0, "info": "test"}'::jsonb,
+ 'initial'
+);
+SELECT 'RI FULL Test: Baseline' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------+---------+-----
+ RI FULL Test: Baseline | 0 | 0
+(1 row)
+
+-- Update non-indexed JSONB subpath with REPLICA IDENTITY FULL - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'RI FULL Test: After updating count (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+--------------------------------------------------+---------+-----
+ RI FULL Test: After updating count (non-indexed) | 0 | 0
+(1 row)
+
+-- Update non-JSONB column with REPLICA IDENTITY FULL - should be HOT
+UPDATE hot_test SET other_col = 'updated' WHERE id = 1;
+SELECT 'RI FULL Test: After updating other_col (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------------------+---------+-----
+ RI FULL Test: After updating other_col (non-indexed) | 1 | 1
+(1 row)
+
+-- Update indexed JSONB subpath with REPLICA IDENTITY FULL - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 1;
+SELECT 'RI FULL Test: After updating status (indexed)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------------+---------+-----
+ RI FULL Test: After updating status (indexed) | 2 | 2
+(1 row)
+
+-- Verify index still works correctly
+SELECT id FROM hot_test WHERE data->'status' = '"inactive"'::jsonb;
+ id
+----
+ 1
+(1 row)
+
+-- ============================================================================
+-- Test 8: enable_subpath_hot GUC
+-- ============================================================================
+-- The enable_subpath_hot GUC controls whether subpath-level HOT tracking
+-- is used for JSONB expression indexes.
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+CREATE INDEX hot_test_guc_status_idx ON hot_test((data->'status'));
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "count": 0}'::jsonb
+);
+-- With enable_subpath_hot=on (default), non-indexed subpath update is HOT
+SHOW enable_subpath_hot;
+ enable_subpath_hot
+--------------------
+ on
+(1 row)
+
+SELECT 'GUC Test: Baseline (on)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-------------------------+---------+-----
+ GUC Test: Baseline (on) | 0 | 0
+(1 row)
+
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'GUC Test: After non-indexed update (on)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------+---------+-----
+ GUC Test: After non-indexed update (on) | 0 | 0
+(1 row)
+
+-- Disable subpath HOT tracking
+SET enable_subpath_hot = off;
+SHOW enable_subpath_hot;
+ enable_subpath_hot
+--------------------
+ off
+(1 row)
+
+-- With enable_subpath_hot=off, the subpath analysis is disabled.
+-- However, the cached relation state from the first update may still
+-- allow HOT if the relation's index subpath info was already computed.
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '2') WHERE id = 1;
+SELECT 'GUC Test: After non-indexed update (off)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------+---------+-----
+ GUC Test: After non-indexed update (off) | 1 | 1
+(1 row)
+
+-- Re-enable subpath HOT tracking
+SET enable_subpath_hot = on;
+SHOW enable_subpath_hot;
+ enable_subpath_hot
+--------------------
+ on
+(1 row)
+
+-- Should be HOT again
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '3') WHERE id = 1;
+SELECT 'GUC Test: After non-indexed update (re-enabled)' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-------------------------------------------------+---------+-----
+ GUC Test: After non-indexed update (re-enabled) | 2 | 2
+(1 row)
+
+-- Verify index still works correctly throughout
+SELECT id FROM hot_test WHERE data->'status' = '"active"'::jsonb;
+ id
+----
+ 1
+(1 row)
+
+-- ============================================================================
+-- Test 9: Partial indexes with complex predicates on JSONB
+-- ============================================================================
+-- Test partial indexes with WHERE clauses on JSONB expressions.
+-- HOT updates should work correctly both inside and outside the predicate.
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+-- Partial index: only index status when priority > 5
+CREATE INDEX hot_test_partial_idx ON hot_test((data->'status'))
+ WHERE (data->>'priority')::int > 5;
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "priority": 10, "count": 0}'::jsonb
+);
+INSERT INTO hot_test VALUES (
+ 2,
+ '{"status": "active", "priority": 3, "count": 0}'::jsonb
+);
+SELECT 'Partial Index Test: Baseline' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------+---------+-----
+ Partial Index Test: Baseline | 0 | 0
+(1 row)
+
+-- Update non-indexed subpath on row inside predicate (priority=10 > 5)
+-- Should be HOT because {count} is not indexed
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'Partial Index Test: count update, inside predicate' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+----------------------------------------------------+---------+-----
+ Partial Index Test: count update, inside predicate | 0 | 0
+(1 row)
+
+-- Update non-indexed subpath on row outside predicate (priority=3 <= 5)
+-- Should be HOT because {count} is not indexed
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 2;
+SELECT 'Partial Index Test: count update, outside predicate' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------------------+---------+-----
+ Partial Index Test: count update, outside predicate | 1 | 1
+(1 row)
+
+-- Update indexed subpath on row inside predicate (priority=10 > 5)
+-- Should NOT be HOT because {status} is indexed and row is in predicate
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 1;
+SELECT 'Partial Index Test: status update, inside predicate' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+-----------------------------------------------------+---------+-----
+ Partial Index Test: status update, inside predicate | 2 | 2
+(1 row)
+
+-- Update indexed subpath on row outside predicate (priority=3 <= 5)
+-- This is conservative - PostgreSQL treats it as non-HOT because the
+-- indexed column changed, even though the row is outside the predicate
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 2;
+SELECT 'Partial Index Test: status update, outside predicate' AS test, * FROM get_hot_count('hot_test');
+ test | updates | hot
+------------------------------------------------------+---------+-----
+ Partial Index Test: status update, outside predicate | 3 | 2
+(1 row)
+
+-- Verify index works
+SELECT id FROM hot_test WHERE data->'status' = '"inactive"'::jsonb AND (data->>'priority')::int > 5;
+ id
+----
+ 1
+(1 row)
+
+-- ============================================================================
+-- Cleanup
+-- ============================================================================
+DROP TABLE IF EXISTS hot_test;
+DROP TABLE IF EXISTS hot_test_partitioned CASCADE;
+DROP FUNCTION IF EXISTS has_hot_chain(text, tid);
+DROP FUNCTION IF EXISTS print_hot_chain(text, tid);
+DROP FUNCTION IF EXISTS get_hot_count(text);
+DROP EXTENSION pageinspect;
diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out
index 51b9608a66808..a27d8d300e6ba 100644
--- a/src/test/regress/expected/oidjoins.out
+++ b/src/test/regress/expected/oidjoins.out
@@ -60,6 +60,8 @@ NOTICE: checking pg_type {typnamespace} => pg_namespace {oid}
NOTICE: checking pg_type {typowner} => pg_authid {oid}
NOTICE: checking pg_type {typrelid} => pg_class {oid}
NOTICE: checking pg_type {typsubscript} => pg_proc {oid}
+NOTICE: checking pg_type {typidxextract} => pg_proc {oid}
+NOTICE: checking pg_type {typidxcompare} => pg_proc {oid}
NOTICE: checking pg_type {typelem} => pg_type {oid}
NOTICE: checking pg_type {typarray} => pg_type {oid}
NOTICE: checking pg_type {typinput} => pg_proc {oid}
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 132b56a5864ca..6ea565b322afa 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -179,8 +179,9 @@ select name, setting from pg_settings where name like 'enable%';
enable_self_join_elimination | on
enable_seqscan | on
enable_sort | on
+ enable_subpath_hot | on
enable_tidscan | on
-(25 rows)
+(26 rows)
-- There are always wait event descriptions for various types. InjectionPoint
-- may be present or absent, depending on history since last postmaster start.
diff --git a/src/test/regress/expected/triggers.out b/src/test/regress/expected/triggers.out
index 98dee63b50a71..ef98fd0cccf4e 100644
--- a/src/test/regress/expected/triggers.out
+++ b/src/test/regress/expected/triggers.out
@@ -959,16 +959,24 @@ NOTICE: main_view BEFORE UPDATE STATEMENT (before_view_upd_stmt)
NOTICE: main_view AFTER UPDATE STATEMENT (after_view_upd_stmt)
UPDATE 0
-- Delete from view using trigger
-DELETE FROM main_view WHERE a IN (20,21);
+DELETE FROM main_view WHERE a = 20 AND b = 31;
NOTICE: main_view BEFORE DELETE STATEMENT (before_view_del_stmt)
NOTICE: main_view INSTEAD OF DELETE ROW (instead_of_del)
-NOTICE: OLD: (21,10)
-NOTICE: main_view INSTEAD OF DELETE ROW (instead_of_del)
NOTICE: OLD: (20,31)
+NOTICE: main_view AFTER DELETE STATEMENT (after_view_del_stmt)
+DELETE 1
+DELETE FROM main_view WHERE a = 21 AND b = 10;
+NOTICE: main_view BEFORE DELETE STATEMENT (before_view_del_stmt)
+NOTICE: main_view INSTEAD OF DELETE ROW (instead_of_del)
+NOTICE: OLD: (21,10)
+NOTICE: main_view AFTER DELETE STATEMENT (after_view_del_stmt)
+DELETE 1
+DELETE FROM main_view WHERE a = 21 AND b = 32;
+NOTICE: main_view BEFORE DELETE STATEMENT (before_view_del_stmt)
NOTICE: main_view INSTEAD OF DELETE ROW (instead_of_del)
NOTICE: OLD: (21,32)
NOTICE: main_view AFTER DELETE STATEMENT (after_view_del_stmt)
-DELETE 3
+DELETE 1
DELETE FROM main_view WHERE a = 31 RETURNING a, b;
NOTICE: main_view BEFORE DELETE STATEMENT (before_view_del_stmt)
NOTICE: main_view INSTEAD OF DELETE ROW (instead_of_del)
diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out
index 9cea538b8e802..4877a1ddce916 100644
--- a/src/test/regress/expected/updatable_views.out
+++ b/src/test/regress/expected/updatable_views.out
@@ -372,15 +372,15 @@ INSERT INTO rw_view16 (a, b) VALUES (3, 'Row 3'); -- should be OK
UPDATE rw_view16 SET a=3, aa=-3 WHERE a=3; -- should fail
ERROR: multiple assignments to same column "a"
UPDATE rw_view16 SET aa=-3 WHERE a=3; -- should be OK
-SELECT * FROM base_tbl;
+SELECT * FROM base_tbl ORDER BY a;
a | b
----+--------
+ -3 | Row 3
-2 | Row -2
-1 | Row -1
0 | Row 0
1 | Row 1
2 | Row 2
- -3 | Row 3
(6 rows)
DELETE FROM rw_view16 WHERE a=-3; -- should be OK
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 549e9b2d7be4a..e06247ef7ea8a 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -137,6 +137,11 @@ test: event_trigger_login
# this test also uses event triggers, so likewise run it by itself
test: fast_default
+# ----------
+# HOT updates tests
+# ----------
+test: hot_updates
+
# run tablespace test at the end because it drops the tablespace created during
# setup that other tests may use.
test: tablespace
diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c
index b8b6a91198763..47f1452e4219a 100644
--- a/src/test/regress/pg_regress.c
+++ b/src/test/regress/pg_regress.c
@@ -1232,7 +1232,7 @@ spawn_process(const char *cmdline)
char *cmdline2;
cmdline2 = psprintf("exec %s", cmdline);
- execl(shellprog, shellprog, "-c", cmdline2, (char *) NULL);
+ execlp(shellprog, shellprog, "-c", cmdline2, (char *) NULL);
/* Not using the normal bail() here as we want _exit */
bail_noatexit("could not exec \"%s\": %m", shellprog);
}
diff --git a/src/test/regress/sql/generated_virtual.sql b/src/test/regress/sql/generated_virtual.sql
index e750866d2d82e..877152d6d69dd 100644
--- a/src/test/regress/sql/generated_virtual.sql
+++ b/src/test/regress/sql/generated_virtual.sql
@@ -127,7 +127,7 @@ ALTER VIEW gtest1v ALTER COLUMN b SET DEFAULT 100;
INSERT INTO gtest1v VALUES (8, DEFAULT); -- error
INSERT INTO gtest1v VALUES (8, DEFAULT), (9, DEFAULT); -- error
-SELECT * FROM gtest1v;
+SELECT * FROM gtest1v ORDER BY a;
DELETE FROM gtest1v WHERE a >= 5;
DROP VIEW gtest1v;
diff --git a/src/test/regress/sql/hot_updates.sql b/src/test/regress/sql/hot_updates.sql
new file mode 100644
index 0000000000000..821c7d2d5ebd7
--- /dev/null
+++ b/src/test/regress/sql/hot_updates.sql
@@ -0,0 +1,954 @@
+--
+-- HOT_UPDATES
+-- Test Heap-Only Tuple (HOT) update decisions
+--
+-- This test systematically verifies that HOT updates are used when appropriate
+-- and avoided when necessary (e.g., when indexed columns are modified).
+--
+-- We use multiple validation methods:
+-- 1. Index verification (a still-working index after a HOT update shows no index entry was added)
+-- 2. Statistics functions (pg_stat_get_tuples_hot_updated)
+-- 3. pageinspect extension for HOT chain examination
+--
+
+-- Load required extensions
+CREATE EXTENSION IF NOT EXISTS pageinspect;
+
+-- Function to get HOT update count
+CREATE OR REPLACE FUNCTION get_hot_count(rel_name text)
+RETURNS TABLE (
+ updates BIGINT,
+ hot BIGINT
+) AS $$
+DECLARE
+ rel_oid oid;
+BEGIN
+ rel_oid := rel_name::regclass::oid;
+
+ -- Force stats flush and use only shared stats to avoid double-counting
+ PERFORM pg_stat_force_next_flush();
+ PERFORM pg_sleep(0.1);
+
+ -- Use only shared stats (after flush, xact stats are included in shared)
+ updates := COALESCE(pg_stat_get_tuples_updated(rel_oid), 0);
+ hot := COALESCE(pg_stat_get_tuples_hot_updated(rel_oid), 0);
+
+ RETURN NEXT;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Check if a tuple is part of a HOT chain (has a predecessor on same page)
+CREATE OR REPLACE FUNCTION has_hot_chain(rel_name text, target_ctid tid)
+RETURNS boolean AS $$
+DECLARE
+ block_num int;
+ page_item record;
+BEGIN
+ block_num := (target_ctid::text::point)[0]::int;
+
+ -- Look for a different tuple on the same page that points to our target tuple
+ FOR page_item IN
+ SELECT lp, lp_flags, t_ctid
+ FROM heap_page_items(get_raw_page(rel_name, block_num))
+ WHERE lp_flags = 1
+ AND t_ctid IS NOT NULL
+ AND t_ctid = target_ctid
+ AND ('(' || block_num::text || ',' || lp::text || ')')::tid != target_ctid
+ LOOP
+ RETURN true;
+ END LOOP;
+
+ RETURN false;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Print the HOT chain starting from a given tuple
+CREATE OR REPLACE FUNCTION print_hot_chain(rel_name text, start_ctid tid)
+RETURNS TABLE(chain_position int, ctid tid, lp_flags text, t_ctid tid, chain_end boolean) AS
+$$
+#variable_conflict use_column
+DECLARE
+ block_num int;
+ line_ptr int;
+ current_ctid tid := start_ctid;
+ next_ctid tid;
+ position int := 0;
+ max_iterations int := 100;
+ page_item record;
+ found_predecessor boolean := false;
+ flags_name text;
+BEGIN
+ block_num := (start_ctid::text::point)[0]::int;
+
+ -- Find the predecessor (old tuple pointing to our start_ctid)
+ FOR page_item IN
+ SELECT lp, lp_flags, t_ctid
+ FROM heap_page_items(get_raw_page(rel_name, block_num))
+ WHERE lp_flags = 1
+ AND t_ctid = start_ctid
+ LOOP
+ current_ctid := ('(' || block_num::text || ',' || page_item.lp::text || ')')::tid;
+ found_predecessor := true;
+ EXIT;
+ END LOOP;
+
+ -- If no predecessor found, start with the given ctid
+ IF NOT found_predecessor THEN
+ current_ctid := start_ctid;
+ END IF;
+
+ -- Follow the chain forward
+ WHILE position < max_iterations LOOP
+ line_ptr := (current_ctid::text::point)[1]::int;
+
+ FOR page_item IN
+ SELECT lp, lp_flags, t_ctid
+ FROM heap_page_items(get_raw_page(rel_name, block_num))
+ WHERE lp = line_ptr
+ LOOP
+ -- Map lp_flags to names
+ flags_name := CASE page_item.lp_flags
+ WHEN 0 THEN 'unused (0)'
+ WHEN 1 THEN 'normal (1)'
+ WHEN 2 THEN 'redirect (2)'
+ WHEN 3 THEN 'dead (3)'
+ ELSE 'unknown (' || page_item.lp_flags::text || ')'
+ END;
+
+ RETURN QUERY SELECT
+ position,
+ current_ctid,
+ flags_name,
+ page_item.t_ctid,
+ (page_item.t_ctid IS NULL OR page_item.t_ctid = current_ctid)::boolean
+ ;
+
+ IF page_item.t_ctid IS NULL OR page_item.t_ctid = current_ctid THEN
+ RETURN;
+ END IF;
+
+ next_ctid := page_item.t_ctid;
+
+ IF (next_ctid::text::point)[0]::int != block_num THEN
+ RETURN;
+ END IF;
+
+ current_ctid := next_ctid;
+ position := position + 1;
+ END LOOP;
+
+ IF position = 0 THEN
+ RETURN;
+ END IF;
+ END LOOP;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Basic HOT update (update non-indexed column)
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ non_indexed_col text
+) WITH (fillfactor = 50);
+
+CREATE INDEX hot_test_indexed_idx ON hot_test(indexed_col);
+
+INSERT INTO hot_test VALUES (1, 100, 'initial');
+INSERT INTO hot_test VALUES (2, 200, 'initial');
+INSERT INTO hot_test VALUES (3, 300, 'initial');
+
+-- Get baseline
+SELECT * FROM get_hot_count('hot_test');
+
+-- Should be HOT updates (only non-indexed column modified)
+UPDATE hot_test SET non_indexed_col = 'updated1' WHERE id = 1;
+UPDATE hot_test SET non_indexed_col = 'updated2' WHERE id = 2;
+UPDATE hot_test SET non_indexed_col = 'updated3' WHERE id = 3;
+
+-- Verify HOT updates occurred
+SELECT * FROM get_hot_count('hot_test');
+
+-- Dump the HOT chain before VACUUMing
+WITH current_tuple AS (
+ SELECT ctid FROM hot_test WHERE id = 1
+)
+SELECT
+ has_hot_chain('hot_test', current_tuple.ctid) AS has_chain,
+ chain_position,
+ print_hot_chain.ctid,
+ lp_flags,
+ t_ctid
+FROM current_tuple,
+LATERAL print_hot_chain('hot_test', current_tuple.ctid);
+
+SET SESSION enable_seqscan = OFF;
+SET SESSION enable_bitmapscan = OFF;
+
+-- Verify indexes still work
+SELECT id, indexed_col FROM hot_test WHERE indexed_col = 100;
+SELECT id, indexed_col FROM hot_test WHERE indexed_col = 200;
+
+-- Vacuum the relation, expect the HOT chain to collapse
+VACUUM hot_test;
+
+-- Show that there is no chain after vacuum
+WITH current_tuple AS (
+ SELECT ctid FROM hot_test WHERE id = 1
+)
+SELECT
+ has_hot_chain('hot_test', current_tuple.ctid) AS has_chain,
+ chain_position,
+ print_hot_chain.ctid,
+ lp_flags,
+ t_ctid
+FROM current_tuple,
+LATERAL print_hot_chain('hot_test', current_tuple.ctid);
+
+-- Non-HOT update (update indexed column)
+UPDATE hot_test SET indexed_col = 150 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify index was updated (new value findable)
+EXPLAIN (COSTS OFF) SELECT id, indexed_col FROM hot_test WHERE indexed_col = 150;
+SELECT id, indexed_col FROM hot_test WHERE indexed_col = 150;
+
+-- Verify old value no longer in index
+EXPLAIN (COSTS OFF) SELECT id FROM hot_test WHERE indexed_col = 100;
+SELECT id FROM hot_test WHERE indexed_col = 100;
+
+SET SESSION enable_seqscan = ON;
+SET SESSION enable_bitmapscan = ON;
+
+-- All-or-none property: updating one indexed column requires ALL index updates
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ col_a int,
+ col_b int,
+ col_c int,
+ non_indexed text
+) WITH (fillfactor = 50);
+
+CREATE INDEX hot_test_a_idx ON hot_test(col_a);
+CREATE INDEX hot_test_b_idx ON hot_test(col_b);
+CREATE INDEX hot_test_c_idx ON hot_test(col_c);
+
+INSERT INTO hot_test VALUES (1, 10, 20, 30, 'initial');
+
+-- Update only col_a - should NOT be HOT because an indexed column changed
+-- This means ALL indexes must be updated (all-or-none property)
+UPDATE hot_test SET col_a = 15 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify all three indexes still work correctly
+SELECT id, col_a FROM hot_test WHERE col_a = 15; -- updated index
+SELECT id, col_b FROM hot_test WHERE col_b = 20; -- unchanged index
+SELECT id, col_c FROM hot_test WHERE col_c = 30; -- unchanged index
+
+-- Now update only non-indexed column - should be HOT
+UPDATE hot_test SET non_indexed = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify all indexes still work
+SELECT id FROM hot_test WHERE col_a = 15 AND col_b = 20 AND col_c = 30;
+
+-- Partial index: both old and new outside predicate (conservative = non-HOT)
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ status text,
+ data text
+) WITH (fillfactor = 50);
+
+-- Partial index only covers status = 'active'
+CREATE INDEX hot_test_active_idx ON hot_test(status) WHERE status = 'active';
+
+INSERT INTO hot_test VALUES (1, 'active', 'data1');
+INSERT INTO hot_test VALUES (2, 'inactive', 'data2');
+INSERT INTO hot_test VALUES (3, 'deleted', 'data3');
+
+-- Update non-indexed column on 'active' row (in predicate, status unchanged)
+-- Should be HOT
+UPDATE hot_test SET data = 'updated1' WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Update non-indexed column on 'inactive' row (outside predicate)
+-- Should be HOT
+UPDATE hot_test SET data = 'updated2' WHERE id = 2;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Update status from 'inactive' to 'deleted' (both outside predicate)
+-- PostgreSQL is conservative: heap insert happens before predicate check
+-- So this is NON-HOT even though both values are outside predicate
+UPDATE hot_test SET status = 'deleted' WHERE id = 2;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify index still works for 'active' rows
+SELECT id, status FROM hot_test WHERE status = 'active';
+
+-- Only BRIN (summarizing) indexes on non-PK columns
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ ts timestamp,
+ value int,
+ brin_col int
+) WITH (fillfactor = 50);
+
+CREATE INDEX hot_test_ts_brin ON hot_test USING brin(ts);
+CREATE INDEX hot_test_brin_col_brin ON hot_test USING brin(brin_col);
+
+INSERT INTO hot_test VALUES (1, '2024-01-01', 100, 1000);
+
+-- Update both BRIN columns - should still be HOT (only summarizing indexes)
+UPDATE hot_test SET ts = '2024-01-02', brin_col = 2000 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify BRIN indexes work
+SELECT id FROM hot_test WHERE ts >= '2024-01-02';
+SELECT id FROM hot_test WHERE brin_col >= 2000;
+
+-- Update non-indexed column - should also be HOT
+UPDATE hot_test SET value = 200 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test');
+
+-- TOAST and HOT: TOASTed columns can participate in HOT
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ large_text text,
+ small_text text
+) WITH (fillfactor = 50);
+
+CREATE INDEX hot_test_idx ON hot_test(indexed_col);
+
+-- Insert row with TOASTed column (> 2KB)
+INSERT INTO hot_test VALUES (1, 100, repeat('x', 3000), 'small');
+
+-- Update non-indexed, non-TOASTed column - should be HOT
+UPDATE hot_test SET small_text = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+
+-- Update TOASTed column - should be HOT if indexed column unchanged
+UPDATE hot_test SET large_text = repeat('y', 3000);
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify index still works
+SELECT id FROM hot_test WHERE indexed_col = 100;
+
+-- Update indexed column - should NOT be HOT
+UPDATE hot_test SET indexed_col = 200;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify index was updated
+SELECT id FROM hot_test WHERE indexed_col = 200;
+
+-- Unique constraint (unique index) behaves like regular index
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ unique_col int UNIQUE,
+ data text
+) WITH (fillfactor = 50);
+
+INSERT INTO hot_test VALUES (1, 100, 'data1');
+INSERT INTO hot_test VALUES (2, 200, 'data2');
+
+-- Update data (non-indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify unique constraint still enforced
+SELECT id, unique_col, data FROM hot_test ORDER BY id;
+
+-- This should fail (unique violation)
+UPDATE hot_test SET unique_col = 100 WHERE id = 2;
+
+-- Multi-column index: any column change = non-HOT
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ col_a int,
+ col_b int,
+ col_c int,
+ data text
+) WITH (fillfactor = 50);
+
+CREATE INDEX hot_test_ab_idx ON hot_test(col_a, col_b);
+
+INSERT INTO hot_test VALUES (1, 10, 20, 30, 'data');
+
+-- Update col_a (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_a = 15;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Reset
+UPDATE hot_test SET col_a = 10;
+
+-- Update col_b (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_b = 25;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Reset
+UPDATE hot_test SET col_b = 20;
+SELECT * FROM get_hot_count('hot_test');
+
+-- Update col_c (not indexed) - should be HOT
+UPDATE hot_test SET col_c = 35;
+
+-- Update data (not indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT * FROM get_hot_count('hot_test');
+
+-- Verify multi-column index works
+SELECT id FROM hot_test WHERE col_a = 10 AND col_b = 20;
+
+-- Partitioned tables: HOT works within partitions
+DROP TABLE IF EXISTS hot_test_partitioned CASCADE;
+
+CREATE TABLE hot_test_partitioned (
+ id int,
+ partition_key int,
+ indexed_col int,
+ data text,
+ PRIMARY KEY (id, partition_key)
+) PARTITION BY RANGE (partition_key);
+
+CREATE TABLE hot_test_part1 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (1) TO (100) WITH (fillfactor = 50);
+CREATE TABLE hot_test_part2 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (100) TO (200) WITH (fillfactor = 50);
+
+CREATE INDEX hot_test_part_idx ON hot_test_partitioned(indexed_col);
+
+INSERT INTO hot_test_partitioned VALUES (1, 50, 100, 'initial1');
+INSERT INTO hot_test_partitioned VALUES (2, 150, 200, 'initial2');
+
+-- Update in partition 1 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated1' WHERE id = 1;
+
+-- Update in partition 2 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated2' WHERE id = 2;
+
+SELECT * FROM get_hot_count('hot_test_part1');
+SELECT * FROM get_hot_count('hot_test_part2');
+
+-- Verify indexes work on partitions
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 100;
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 200;
+
+-- Update indexed column in partition - should NOT be HOT
+UPDATE hot_test_partitioned SET indexed_col = 150 WHERE id = 1;
+SELECT * FROM get_hot_count('hot_test_part1');
+
+-- Verify index was updated
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 150;
+
+-- ============================================================================
+-- Test 1: Expression indexes with JSONB subpath tracking
+-- (see also Tests 2-9 below for related subpath scenarios)
+-- ============================================================================
+-- With the new subpath tracking feature, HOT updates are possible when
+-- only non-indexed JSONB subpaths are modified.
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+-- Indexes on specific JSONB subpaths
+CREATE INDEX hot_test_status_idx ON hot_test((data->'status'));
+CREATE INDEX hot_test_user_id_idx ON hot_test((data->'user'->'id'));
+
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "user": {"id": 123, "name": "Alice"}, "count": 0}'::jsonb
+);
+
+-- Baseline
+SELECT 'JSONB Test 1: Baseline' AS test, * FROM get_hot_count('hot_test');
+
+-- Update non-indexed subpath {count} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating count (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update different non-indexed subpath {user,name} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{user,name}', '"Bob"') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating user.name (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update indexed subpath {status} - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating status (indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update indexed subpath {user,id} - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{user,id}', '456') WHERE id = 1;
+SELECT 'JSONB Test 1: After updating user.id (indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Verify indexes still work correctly
+SELECT id FROM hot_test WHERE data->'status' = '"inactive"'::jsonb;
+SELECT id FROM hot_test WHERE data->'user'->'id' = '456'::jsonb;
+
+-- ============================================================================
+-- Test 2: Nested paths and path intersection
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+CREATE INDEX hot_test_deep_idx ON hot_test((data->'a'->'b'->'c'));
+
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"a": {"b": {"c": "indexed", "d": "not-indexed"}}, "x": "other"}'::jsonb
+);
+
+SELECT 'JSONB Test 2: Baseline' AS test, * FROM get_hot_count('hot_test');
+
+-- Update sibling of indexed path {a,b,d} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{a,b,d}', '"updated"') WHERE id = 1;
+SELECT 'JSONB Test 2: After updating a.b.d (sibling, non-indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update unrelated path {x} - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{x}', '"modified"') WHERE id = 1;
+SELECT 'JSONB Test 2: After updating x (unrelated path)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update parent of indexed path {a,b} - should NOT be HOT (affects child)
+UPDATE hot_test SET data = jsonb_set(data, '{a,b}', '{"c": "new", "d": "data"}') WHERE id = 1;
+SELECT 'JSONB Test 2: After updating a.b (parent of indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- ============================================================================
+-- Test 3: Multiple JSONB mutation functions
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+CREATE INDEX hot_test_keep_idx ON hot_test((data->'keep'));
+
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"keep": "important", "remove": "unimportant", "extra": "data"}'::jsonb
+);
+
+SELECT 'JSONB Test 3: Baseline' AS test, * FROM get_hot_count('hot_test');
+
+-- jsonb_delete on non-indexed key - should be HOT
+UPDATE hot_test SET data = data - 'remove' WHERE id = 1;
+SELECT 'JSONB Test 3: After deleting non-indexed key' AS test, * FROM get_hot_count('hot_test');
+
+-- jsonb_set on non-indexed key - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{extra}', '"modified"') WHERE id = 1;
+SELECT 'JSONB Test 3: After modifying non-indexed key' AS test, * FROM get_hot_count('hot_test');
+
+-- jsonb_delete on indexed key - should NOT be HOT
+UPDATE hot_test SET data = data - 'keep' WHERE id = 1;
+SELECT 'JSONB Test 3: After deleting indexed key' AS test, * FROM get_hot_count('hot_test');
+
+-- ============================================================================
+-- Test 4: Array operations
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+-- Index on array element
+CREATE INDEX hot_test_tags_idx ON hot_test((data->'tags'->0));
+
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"tags": ["indexed", "second", "third"], "other": "data"}'::jsonb
+);
+
+SELECT 'JSONB Test 4: Baseline' AS test, * FROM get_hot_count('hot_test');
+
+-- Update non-indexed array element - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{tags,1}', '"modified"') WHERE id = 1;
+SELECT 'JSONB Test 4: After updating tags[1]' AS test, * FROM get_hot_count('hot_test');
+
+-- Update indexed array element - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{tags,0}', '"changed"') WHERE id = 1;
+SELECT 'JSONB Test 4: After updating tags[0] (indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- ============================================================================
+-- Test 5: Whole column index (no subpath)
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+-- Index on entire JSONB column (no subpath extraction)
+CREATE INDEX hot_test_whole_idx ON hot_test(data);
+
+INSERT INTO hot_test VALUES (1, '{"a": 1}'::jsonb);
+
+SELECT 'JSONB Test 5: Baseline' AS test, * FROM get_hot_count('hot_test');
+
+-- Any modification to data - should NOT be HOT (whole column indexed)
+UPDATE hot_test SET data = jsonb_set(data, '{a}', '2') WHERE id = 1;
+SELECT 'JSONB Test 5: After modifying any field (whole column indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- ============================================================================
+-- Test 6: Performance at scale
+-- ============================================================================
+DROP TABLE hot_test;
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+CREATE INDEX hot_test_status_idx ON hot_test((data->'status'));
+CREATE INDEX hot_test_priority_idx ON hot_test((data->'priority'));
+
+-- Insert 100 rows
+INSERT INTO hot_test
+SELECT i, jsonb_build_object(
+ 'status', 'active',
+ 'priority', 1,
+ 'count', 0,
+ 'data', 'value_' || i
+)
+FROM generate_series(1, 100) i;
+
+SELECT 'JSONB Test 6: Baseline (100 rows)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update non-indexed fields on all rows - should all be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{count}', to_jsonb((data->>'count')::int + 1));
+
+SELECT 'JSONB Test 6: After updating 100 rows (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Verify correctness
+SELECT COUNT(*) AS rows_with_count_1 FROM hot_test WHERE (data->>'count')::int = 1;
+
+-- Update indexed field on subset - should NOT be HOT for those rows
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"')
+WHERE id <= 10;
+
+SELECT 'JSONB Test 6: After updating 10 rows (indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Verify indexes work
+SELECT COUNT(*) FROM hot_test WHERE data->>'status' = 'inactive';
+SELECT COUNT(*) FROM hot_test WHERE data->>'status' = 'active';
+
+-- Only BRIN (summarizing) indexes on non-PK columns (repeat with default fillfactor)
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ ts timestamp,
+ value int,
+ brin_col int
+);
+
+CREATE INDEX hot_test_ts_brin ON hot_test USING brin(ts);
+CREATE INDEX hot_test_brin_col_brin ON hot_test USING brin(brin_col);
+
+INSERT INTO hot_test VALUES (1, '2024-01-01', 100, 1000);
+
+-- Update both BRIN columns - should still be HOT (only summarizing indexes)
+UPDATE hot_test SET ts = '2024-01-02', brin_col = 2000 WHERE id = 1;
+SELECT get_hot_count('hot_test');
+
+-- Verify BRIN indexes work
+SELECT id FROM hot_test WHERE ts >= '2024-01-02';
+SELECT id FROM hot_test WHERE brin_col >= 2000;
+
+-- Update non-indexed column - should also be HOT
+UPDATE hot_test SET value = 200 WHERE id = 1;
+SELECT get_hot_count('hot_test');
+
+-- TOAST and HOT: TOASTed columns can participate in HOT
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ indexed_col int,
+ large_text text,
+ small_text text
+);
+
+CREATE INDEX hot_test_idx ON hot_test(indexed_col);
+
+-- Insert row with TOASTed column (> 2KB)
+INSERT INTO hot_test VALUES (1, 100, repeat('x', 3000), 'small');
+
+-- Update non-indexed, non-TOASTed column - should be HOT
+UPDATE hot_test SET small_text = 'updated';
+SELECT get_hot_count('hot_test');
+
+-- Update TOASTed column - should be HOT if indexed column unchanged
+UPDATE hot_test SET large_text = repeat('y', 3000);
+SELECT get_hot_count('hot_test');
+
+-- Verify index still works
+SELECT id FROM hot_test WHERE indexed_col = 100;
+
+-- Update indexed column - should NOT be HOT
+UPDATE hot_test SET indexed_col = 200;
+SELECT get_hot_count('hot_test');
+
+-- Verify index was updated
+SELECT id FROM hot_test WHERE indexed_col = 200;
+
+-- Unique constraint (unique index) behaves like regular index
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ unique_col int UNIQUE,
+ data text
+);
+
+INSERT INTO hot_test VALUES (1, 100, 'data1');
+INSERT INTO hot_test VALUES (2, 200, 'data2');
+
+-- Update data (non-indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT get_hot_count('hot_test');
+
+-- Verify unique constraint still enforced
+SELECT id, unique_col, data FROM hot_test ORDER BY id;
+
+-- This should fail (unique violation)
+UPDATE hot_test SET unique_col = 100 WHERE id = 2;
+
+-- Multi-column index: any column change = non-HOT
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ col_a int,
+ col_b int,
+ col_c int,
+ data text
+);
+
+CREATE INDEX hot_test_ab_idx ON hot_test(col_a, col_b);
+
+INSERT INTO hot_test VALUES (1, 10, 20, 30, 'data');
+
+-- Update col_a (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_a = 15;
+SELECT get_hot_count('hot_test');
+
+-- Reset
+UPDATE hot_test SET col_a = 10;
+
+-- Update col_b (part of multi-column index) - should NOT be HOT
+UPDATE hot_test SET col_b = 25;
+SELECT get_hot_count('hot_test');
+
+-- Reset
+UPDATE hot_test SET col_b = 20;
+SELECT get_hot_count('hot_test');
+
+-- Update col_c (not indexed) - should be HOT
+UPDATE hot_test SET col_c = 35;
+
+-- Update data (not indexed) - should be HOT
+UPDATE hot_test SET data = 'updated';
+SELECT get_hot_count('hot_test');
+
+-- Verify multi-column index works
+SELECT id FROM hot_test WHERE col_a = 10 AND col_b = 20;
+
+-- Partitioned tables: HOT works within partitions
+DROP TABLE IF EXISTS hot_test_partitioned CASCADE;
+
+CREATE TABLE hot_test_partitioned (
+ id int,
+ partition_key int,
+ indexed_col int,
+ data text,
+ PRIMARY KEY (id, partition_key)
+) PARTITION BY RANGE (partition_key);
+
+CREATE TABLE hot_test_part1 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (1) TO (100);
+CREATE TABLE hot_test_part2 PARTITION OF hot_test_partitioned
+ FOR VALUES FROM (100) TO (200);
+
+CREATE INDEX hot_test_part_idx ON hot_test_partitioned(indexed_col);
+
+INSERT INTO hot_test_partitioned VALUES (1, 50, 100, 'initial1');
+INSERT INTO hot_test_partitioned VALUES (2, 150, 200, 'initial2');
+
+-- Update in partition 1 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated1' WHERE id = 1;
+
+-- Update in partition 2 (non-indexed column) - should be HOT
+UPDATE hot_test_partitioned SET data = 'updated2' WHERE id = 2;
+
+SELECT get_hot_count('hot_test_part1');
+SELECT get_hot_count('hot_test_part2');
+
+-- Verify indexes work on partitions
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 100;
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 200;
+
+-- Update indexed column in partition - should NOT be HOT
+UPDATE hot_test_partitioned SET indexed_col = 150 WHERE id = 1;
+SELECT get_hot_count('hot_test_part1');
+
+-- Verify index was updated
+SELECT id FROM hot_test_partitioned WHERE indexed_col = 150;
+
+-- ============================================================================
+-- Test 7: REPLICA IDENTITY FULL with JSONB expression indexes
+-- ============================================================================
+-- REPLICA IDENTITY FULL causes the entire old tuple to be logged for
+-- logical replication, but should not affect HOT update decisions.
+DROP TABLE IF EXISTS hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb,
+ other_col text
+);
+
+ALTER TABLE hot_test REPLICA IDENTITY FULL;
+
+CREATE INDEX hot_test_ri_status_idx ON hot_test((data->'status'));
+
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "count": 0, "info": "test"}'::jsonb,
+ 'initial'
+);
+
+SELECT 'RI FULL Test: Baseline' AS test, * FROM get_hot_count('hot_test');
+
+-- Update non-indexed JSONB subpath with REPLICA IDENTITY FULL - should be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'RI FULL Test: After updating count (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update non-JSONB column with REPLICA IDENTITY FULL - should be HOT
+UPDATE hot_test SET other_col = 'updated' WHERE id = 1;
+SELECT 'RI FULL Test: After updating other_col (non-indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Update indexed JSONB subpath with REPLICA IDENTITY FULL - should NOT be HOT
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 1;
+SELECT 'RI FULL Test: After updating status (indexed)' AS test, * FROM get_hot_count('hot_test');
+
+-- Verify index still works correctly
+SELECT id FROM hot_test WHERE data->'status' = '"inactive"'::jsonb;
+
+-- ============================================================================
+-- Test 8: enable_subpath_hot GUC
+-- ============================================================================
+-- The enable_subpath_hot GUC controls whether subpath-level HOT tracking
+-- is used for JSONB expression indexes.
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+CREATE INDEX hot_test_guc_status_idx ON hot_test((data->'status'));
+
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "count": 0}'::jsonb
+);
+
+-- With enable_subpath_hot=on (default), non-indexed subpath update is HOT
+SHOW enable_subpath_hot;
+SELECT 'GUC Test: Baseline (on)' AS test, * FROM get_hot_count('hot_test');
+
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'GUC Test: After non-indexed update (on)' AS test, * FROM get_hot_count('hot_test');
+
+-- Disable subpath HOT tracking
+SET enable_subpath_hot = off;
+SHOW enable_subpath_hot;
+
+-- With enable_subpath_hot=off, the subpath analysis is disabled.
+-- However, the cached relation state from the first update may still
+-- allow HOT if the relation's index subpath info was already computed.
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '2') WHERE id = 1;
+SELECT 'GUC Test: After non-indexed update (off)' AS test, * FROM get_hot_count('hot_test');
+
+-- Re-enable subpath HOT tracking
+SET enable_subpath_hot = on;
+SHOW enable_subpath_hot;
+
+-- Should be HOT again
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '3') WHERE id = 1;
+SELECT 'GUC Test: After non-indexed update (re-enabled)' AS test, * FROM get_hot_count('hot_test');
+
+-- Verify index still works correctly throughout
+SELECT id FROM hot_test WHERE data->'status' = '"active"'::jsonb;
+
+-- ============================================================================
+-- Test 9: Partial indexes with complex predicates on JSONB
+-- ============================================================================
+-- Test partial indexes with WHERE clauses on JSONB expressions.
+-- HOT updates should work correctly both inside and outside the predicate.
+DROP TABLE hot_test;
+
+CREATE TABLE hot_test (
+ id int PRIMARY KEY,
+ data jsonb
+);
+
+-- Partial index: only index status when priority > 5
+CREATE INDEX hot_test_partial_idx ON hot_test((data->'status'))
+ WHERE (data->>'priority')::int > 5;
+
+INSERT INTO hot_test VALUES (
+ 1,
+ '{"status": "active", "priority": 10, "count": 0}'::jsonb
+);
+INSERT INTO hot_test VALUES (
+ 2,
+ '{"status": "active", "priority": 3, "count": 0}'::jsonb
+);
+
+SELECT 'Partial Index Test: Baseline' AS test, * FROM get_hot_count('hot_test');
+
+-- Update non-indexed subpath on row inside predicate (priority=10 > 5)
+-- Should be HOT because {count} is not indexed
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 1;
+SELECT 'Partial Index Test: count update, inside predicate' AS test, * FROM get_hot_count('hot_test');
+
+-- Update non-indexed subpath on row outside predicate (priority=3 <= 5)
+-- Should be HOT because {count} is not indexed
+UPDATE hot_test SET data = jsonb_set(data, '{count}', '1') WHERE id = 2;
+SELECT 'Partial Index Test: count update, outside predicate' AS test, * FROM get_hot_count('hot_test');
+
+-- Update indexed subpath on row inside predicate (priority=10 > 5)
+-- Should NOT be HOT because {status} is indexed and row is in predicate
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 1;
+SELECT 'Partial Index Test: status update, inside predicate' AS test, * FROM get_hot_count('hot_test');
+
+-- Update indexed subpath on row outside predicate (priority=3 <= 5)
+-- This is conservative - PostgreSQL treats it as non-HOT because the
+-- indexed column changed, even though the row is outside the predicate
+UPDATE hot_test SET data = jsonb_set(data, '{status}', '"inactive"') WHERE id = 2;
+SELECT 'Partial Index Test: status update, outside predicate' AS test, * FROM get_hot_count('hot_test');
+
+-- Verify index works
+SELECT id FROM hot_test WHERE data->'status' = '"inactive"'::jsonb AND (data->>'priority')::int > 5;
+-- ============================================================================
+DROP TABLE IF EXISTS hot_test;
+DROP TABLE IF EXISTS hot_test_partitioned CASCADE;
+DROP FUNCTION IF EXISTS has_hot_chain(text, tid);
+DROP FUNCTION IF EXISTS print_hot_chain(text, tid);
+DROP FUNCTION IF EXISTS get_hot_count(text);
+DROP EXTENSION pageinspect;
diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql
index ea39817ee3d7f..6ceb61608ae4b 100644
--- a/src/test/regress/sql/triggers.sql
+++ b/src/test/regress/sql/triggers.sql
@@ -660,7 +660,9 @@ UPDATE main_view SET b = 32 WHERE a = 21 AND b = 31 RETURNING a, b;
UPDATE main_view SET b = 0 WHERE false;
-- Delete from view using trigger
-DELETE FROM main_view WHERE a IN (20,21);
+DELETE FROM main_view WHERE a = 20 AND b = 31;
+DELETE FROM main_view WHERE a = 21 AND b = 10;
+DELETE FROM main_view WHERE a = 21 AND b = 32;
DELETE FROM main_view WHERE a = 31 RETURNING a, b;
\set QUIET true
diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql
index 1635adde2d4b4..160e779971507 100644
--- a/src/test/regress/sql/updatable_views.sql
+++ b/src/test/regress/sql/updatable_views.sql
@@ -125,7 +125,7 @@ INSERT INTO rw_view16 VALUES (3, 'Row 3', 3); -- should fail
INSERT INTO rw_view16 (a, b) VALUES (3, 'Row 3'); -- should be OK
UPDATE rw_view16 SET a=3, aa=-3 WHERE a=3; -- should fail
UPDATE rw_view16 SET aa=-3 WHERE a=3; -- should be OK
-SELECT * FROM base_tbl;
+SELECT * FROM base_tbl ORDER BY a;
DELETE FROM rw_view16 WHERE a=-3; -- should be OK
-- Read-only views
INSERT INTO ro_view17 VALUES (3, 'ROW 3');
diff --git a/src/tools/pgindent/pgindent b/src/tools/pgindent/pgindent
index 7481696a584c3..1482f674fb033 100755
--- a/src/tools/pgindent/pgindent
+++ b/src/tools/pgindent/pgindent
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Copyright (c) 2021-2026, PostgreSQL Global Development Group
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 141b9d6e07786..074d21feb1cc1 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -176,6 +176,7 @@ AttrDefault
AttrMap
AttrMissing
AttrNumber
+AttrSubpathInfo
AttributeOpts
AuthRequest
AuthToken
@@ -1246,6 +1247,7 @@ IV
IdentLine
IdentifierLookup
IdentifySystemCmd
+IdxSubpathDesc
IfStackElem
ImportForeignSchemaStmt
ImportForeignSchemaType
@@ -2532,6 +2534,7 @@ RelOptInfo
 RelOptKind
 RelPathStr
 RelStatsInfo
+RelSubpathInfo
 RelSyncCallbackFunction
 RelToCheck
 RelToCluster
@@ -2948,6 +2951,8 @@ SubXactCallback
 SubXactCallbackItem
 SubXactEvent
 SubXactInfo
+SubpathAccumEntry
+SubpathTrackingContext
 SubqueryScan
 SubqueryScanPath
 SubqueryScanState
@@ -3044,7 +3049,6 @@ TSVectorStat
TState
TStatus
TStoreState
-TU_UpdateIndexes
TXNEntryFile
TYPCATEGORY
T_Action
@@ -3483,6 +3487,7 @@ ambuildempty_function
ambuildphasename_function
ambulkdelete_function
amcanreturn_function
+amcomparedatums_function
amcostestimate_function
amendscan_function
amestimateparallelscan_function