From 8bfbf39a70977934d1937c2af88a1ed081573a37 Mon Sep 17 00:00:00 2001
From: Muhammad Usama <m.usama@gmail.com>
Date: Fri, 20 Feb 2026 17:45:11 +0500
Subject: [PATCH] Disable idle_in_transaction_session_timeout on metadata
 worker connections during shard moves

In block_writes mode, LockShardListMetadataOnWorkers() opens coordinated
transactions on all metadata workers to hold advisory shard metadata locks.
These connections remain open for the entire duration of the shard move,
but workers not involved in the data copy have no commands to execute and
they sit idle-in-transaction until the coordinated transaction commits.

For large shards, the data copy can take hours, easily exceeding common
idle_in_transaction_session_timeout values. When the timeout fires on an
uninvolved worker, PostgreSQL terminates the connection and the move fails.

Fix by sending SET LOCAL idle_in_transaction_session_timeout = 0 on each
metadata worker connection before acquiring locks. SET LOCAL scopes the
change to the current transaction only, so normal sessions are unaffected.
---
 src/backend/distributed/utils/resource_lock.c | 12 ++-
 .../shard_move_constraints_blocking.out       | 84 +++++++++++++++++++
 .../sql/shard_move_constraints_blocking.sql   | 38 +++++++++
 3 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/src/backend/distributed/utils/resource_lock.c b/src/backend/distributed/utils/resource_lock.c
index 1dbc84c42b2..9edfc4943a9 100644
--- a/src/backend/distributed/utils/resource_lock.c
+++ b/src/backend/distributed/utils/resource_lock.c
@@ -405,7 +405,17 @@ LockShardListMetadataOnWorkers(LOCKMODE lockmode, List *shardIntervalList)
 
 	appendStringInfo(lockCommand, "])");
 
-	SendCommandToWorkersWithMetadata(lockCommand->data);
+	/*
+	 * Disable idle_in_transaction_session_timeout on metadata workers before
+	 * acquiring locks. In block_writes mode, these connections stay open for
+	 * the entire shard copy which can take hours for large shards. Without
+	 * this, the timeout would kill the connection and fail the move.
+	 * SET LOCAL scopes the change to this transaction only.
+	 */
+	List *commandList = list_make2(
+		"SET LOCAL idle_in_transaction_session_timeout = 0",
+		lockCommand->data);
+	SendCommandListToWorkersWithMetadata(commandList);
 }
 
 
diff --git a/src/test/regress/expected/shard_move_constraints_blocking.out b/src/test/regress/expected/shard_move_constraints_blocking.out
index 66dec069e7a..61dbf41ec04 100644
--- a/src/test/regress/expected/shard_move_constraints_blocking.out
+++ b/src/test/regress/expected/shard_move_constraints_blocking.out
@@ -399,3 +399,87 @@ drop cascades to table "blocking shard Move Fkeys Indexes".reference_table
 drop cascades to table "blocking shard Move Fkeys Indexes".reference_table_8970028
 drop cascades to table "blocking shard Move Fkeys Indexes".index_backed_rep_identity
 DROP ROLE mx_rebalancer_blocking_role_ent;
+-- Test: block_writes shard move succeeds even when workers have a low
+-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
+-- coordinated transactions on ALL metadata workers before the data copy.
+-- Workers not involved in the copy sit idle-in-transaction for the entire
+-- duration. Without the SET LOCAL override, the timeout would kill those
+-- connections and fail the move.
+SET citus.next_shard_id TO 8980000;
+SET citus.shard_count TO 4;
+SET citus.shard_replication_factor TO 1;
+CREATE SCHEMA blocking_move_idle_timeout;
+SET search_path TO blocking_move_idle_timeout;
+-- set a very low idle_in_transaction_session_timeout on all nodes
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+-- allow the reload to take effect
+SELECT pg_sleep(0.5);
+ pg_sleep
+---------------------------------------------------------------------
+
+(1 row)
+
+CREATE TABLE test_move(id int PRIMARY KEY, val text);
+SELECT create_distributed_table('test_move', 'id');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
+-- move a shard using block_writes; should succeed despite the 1s timeout
+SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
+ citus_move_shard_placement
+---------------------------------------------------------------------
+
+(1 row)
+
+SELECT public.wait_for_resource_cleanup();
+ wait_for_resource_cleanup
+---------------------------------------------------------------------
+
+(1 row)
+
+-- verify data integrity after move
+SELECT count(*) FROM test_move;
+ count
+---------------------------------------------------------------------
+   100
+(1 row)
+
+-- cleanup: restore idle_in_transaction_session_timeout
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+ ?column?
+---------------------------------------------------------------------
+        1
+        1
+        1
+(3 rows)
+
+DROP SCHEMA blocking_move_idle_timeout CASCADE;
+NOTICE:  drop cascades to table test_move
diff --git a/src/test/regress/sql/shard_move_constraints_blocking.sql b/src/test/regress/sql/shard_move_constraints_blocking.sql
index 66b58f42b9c..acbaca76ab9 100644
--- a/src/test/regress/sql/shard_move_constraints_blocking.sql
+++ b/src/test/regress/sql/shard_move_constraints_blocking.sql
@@ -222,3 +222,41 @@ ALTER TABLE sensors_2020_01_01 DROP CONSTRAINT fkey_from_child_to_child;
 \c - postgres - :master_port
 DROP SCHEMA "blocking shard Move Fkeys Indexes" CASCADE;
 DROP ROLE mx_rebalancer_blocking_role_ent;
+
+-- Test: block_writes shard move succeeds even when workers have a low
+-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
+-- coordinated transactions on ALL metadata workers before the data copy.
+-- Workers not involved in the copy sit idle-in-transaction for the entire
+-- duration. Without the SET LOCAL override, the timeout would kill those
+-- connections and fail the move.
+SET citus.next_shard_id TO 8980000;
+SET citus.shard_count TO 4;
+SET citus.shard_replication_factor TO 1;
+
+CREATE SCHEMA blocking_move_idle_timeout;
+SET search_path TO blocking_move_idle_timeout;
+
+-- set a very low idle_in_transaction_session_timeout on all nodes
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+-- allow the reload to take effect
+SELECT pg_sleep(0.5);
+
+CREATE TABLE test_move(id int PRIMARY KEY, val text);
+SELECT create_distributed_table('test_move', 'id');
+INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
+
+-- move a shard using block_writes; should succeed despite the 1s timeout
+SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
+SELECT public.wait_for_resource_cleanup();
+
+-- verify data integrity after move
+SELECT count(*) FROM test_move;
+
+-- cleanup: restore idle_in_transaction_session_timeout
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+
+DROP SCHEMA blocking_move_idle_timeout CASCADE;