diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java index ba777aa21719..447691c0a5bd 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerConfiguration.java @@ -116,6 +116,15 @@ public final class DiskBalancerConfiguration { description = "If true, the DiskBalancer will automatically stop once disks are balanced.") private boolean stopAfterDiskEven = true; + @Config(key = "hdds.datanode.disk.balancer.replica.deletion.delay", + defaultValue = "5m", + type = ConfigType.TIME, + tags = { DATANODE, ConfigTag.DISKBALANCER }, + description = "The delay after a container is successfully moved from source volume to " + + "destination volume before the source container replica is deleted. " + + "Unit could be defined with postfix (ns,ms,s,m,h,d).") + private long replicaDeletionDelay = Duration.ofMinutes(5).toMillis(); + public DiskBalancerConfiguration(Double threshold, Long bandwidthInMB, Integer parallelThread, @@ -181,6 +190,15 @@ public void setStopAfterDiskEven(boolean stopAfterDiskEven) { this.stopAfterDiskEven = stopAfterDiskEven; } + /** + * Gets the replica deletion delay in milliseconds. + * + * @return delay in milliseconds before source replica is deleted after move + */ + public long getReplicaDeletionDelay() { + return replicaDeletionDelay; + } + /** * Gets the threshold value for DiskBalancer. 
* diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java index aaa14321011e..9503c2e3c1f8 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/diskbalancer/DiskBalancerService.java @@ -91,7 +91,7 @@ public class DiskBalancerService extends BackgroundService { LoggerFactory.getLogger(DiskBalancerService.class); public static final String DISK_BALANCER_DIR = "diskBalancer"; - private static long replicaDeletionDelayMills = 60 * 60 * 1000L; // 60 minutes + private long replicaDeletionDelay; private OzoneContainer ozoneContainer; private final ConfigurationSource conf; @@ -162,6 +162,8 @@ public DiskBalancerService(OzoneContainer ozoneContainer, throw new IOException(e); } + replicaDeletionDelay = conf.getObject(DiskBalancerConfiguration.class) + .getReplicaDeletionDelay(); metrics = DiskBalancerServiceMetrics.create(); loadDiskBalancerInfo(); @@ -617,7 +619,7 @@ public BackgroundTaskResult call() { } if (moveSucceeded) { // Add current old container to pendingDeletionContainers. 
- pendingDeletionContainers.put(System.currentTimeMillis() + replicaDeletionDelayMills, container); + pendingDeletionContainers.put(System.currentTimeMillis() + replicaDeletionDelay, container); ContainerLogger.logMoveSuccess(containerId, sourceVolume, destVolume, containerSize, Time.monotonicNow() - startTime); } @@ -657,7 +659,7 @@ private void deleteContainer(Container container) { container.delete(); container.getContainerData().getVolume().decrementUsedSpace(containerData.getBytesUsed()); LOG.info("Deleted expired container {} after delay {} ms.", - containerData.getContainerID(), replicaDeletionDelayMills); + containerData.getContainerID(), replicaDeletionDelay); } catch (IOException ex) { LOG.warn("Failed to delete old container {} after it's marked as DELETED. " + "It will be handled by background scanners.", container.getContainerData().getContainerID(), ex); @@ -824,7 +826,7 @@ public static void setInjector(FaultInjector instance) { } @VisibleForTesting - public static void setReplicaDeletionDelayMills(long durationMills) { - replicaDeletionDelayMills = durationMills; + public void setReplicaDeletionDelay(long durationMills) { + this.replicaDeletionDelay = durationMills; } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java index af9efdf75049..cd53404a670d 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/diskbalancer/TestDiskBalancerTask.java @@ -246,7 +246,7 @@ public void setup() throws Exception { conf.setFromObject(diskBalancerConfiguration); diskBalancerService = new DiskBalancerServiceTestImpl(ozoneContainer, 100, conf, 1); - DiskBalancerService.setReplicaDeletionDelayMills(0); + 
diskBalancerService.setReplicaDeletionDelay(0); KeyValueContainer.setInjector(kvFaultInjector); } @@ -592,7 +592,7 @@ public void testOldReplicaDelayedDeletion(ContainerTestVersionInfo versionInfo) throws IOException, InterruptedException { setLayoutAndSchemaForTest(versionInfo); long delay = 2000L; // 2 second delay - DiskBalancerService.setReplicaDeletionDelayMills(delay); + diskBalancerService.setReplicaDeletionDelay(delay); Container container = createContainer(CONTAINER_ID, sourceVolume, State.CLOSED); KeyValueContainerData keyValueContainerData = (KeyValueContainerData) container.getContainerData(); diff --git a/hadoop-hdds/docs/content/design/diskbalancer.md b/hadoop-hdds/docs/content/design/diskbalancer.md index f546b5253d7a..aab8af2ff345 100644 --- a/hadoop-hdds/docs/content/design/diskbalancer.md +++ b/hadoop-hdds/docs/content/design/diskbalancer.md @@ -103,6 +103,18 @@ D1 ----> C1-CLOSED --- (5) ---> C1-DELETED | D2 ----> Temp C1-CLOSED --- (2) ---> Temp C1-RECOVERING --- (3) ---> C1-RECOVERING --- (4) ---> C1-CLOSED ``` + +### Lazy Deletion of Source Container Replica + +The source container on D1 is **not** deleted immediately after the move completes. Instead, it is scheduled for deletion after a configurable delay using config `hdds.datanode.disk.balancer.replica.deletion.delay`, **default: 5 minutes**. + +**Rationale:** When a container has only one replica and that replica has an in-flight read operation, the read thread may still hold a reference to the old container at the source path. +If the DiskBalancer deletes the old container immediately after the move, the in-flight read would fail because the container data is now at the new path. The lazy deletion provides a +grace period for in-flight reads to complete before the old container is removed, avoiding immediate read failures. 
+ +**Note:** Because of this lazy deletion, the disk utilization of the source volume will not decrease immediately after a container move or after the DiskBalancer is stopped. The freed space +and balanced state will be visible only after the configured delay, when the source container replicas are actually deleted. + ## DiskBalancing Policies By default, the DiskBalancer uses specific policies to decide which disks to balance and which containers to move. These diff --git a/hadoop-hdds/docs/content/feature/DiskBalancer.md b/hadoop-hdds/docs/content/feature/DiskBalancer.md index d8e7b501ae32..317d3f023723 100644 --- a/hadoop-hdds/docs/content/feature/DiskBalancer.md +++ b/hadoop-hdds/docs/content/feature/DiskBalancer.md @@ -238,16 +238,17 @@ ozone admin datanode diskbalancer report --in-service-datanodes --json The DiskBalancer's behavior can be controlled using the following configuration properties in `ozone-site.xml`. -| Property | Default Value | Description | -|-------------------------------------------------------------|----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `hdds.datanode.disk.balancer.enabled` | `false` | If false, the DiskBalancer service on the Datanode is disabled. Configure it to true for diskBalancer to be enabled. | -| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | A percentage (0-100). A datanode is considered balanced if for each volume, its utilization differs from the average datanode utilization by no more than this threshold. | -| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | The maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid impacting client I/O. 
| -| `hdds.datanode.disk.balancer.parallel.thread` | `5` | The number of worker threads to use for moving containers in parallel. | -| `hdds.datanode.disk.balancer.service.interval` | `60s` | The time interval at which the Datanode DiskBalancer service checks for imbalance and updates its configuration. | -| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | If true, the DiskBalancer will automatically stop its balancing activity once disks are considered balanced (i.e., all volume densities are within the threshold). | -| `hdds.datanode.disk.balancer.volume.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy` | The policy class for selecting source and destination volumes for balancing. | -| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | The policy class for selecting which containers to move from a source volume to destination volume. | -| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Timeout for the Datanode DiskBalancer service operations. | -| `hdds.datanode.disk.balancer.should.run.default` | `false` | If the balancer fails to read its persisted configuration, this value determines if the service should run by default. | +| Property | Default Value | Description | +|-------------------------------------------------------------|----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `hdds.datanode.disk.balancer.enabled` | `false` | If false, the DiskBalancer service on the Datanode is disabled. Configure it to true for diskBalancer to be enabled. 
| +| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | A percentage (0-100). A datanode is considered balanced if for each volume, its utilization differs from the average datanode utilization by no more than this threshold. | +| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | The maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid impacting client I/O. | +| `hdds.datanode.disk.balancer.parallel.thread` | `5` | The number of worker threads to use for moving containers in parallel. | +| `hdds.datanode.disk.balancer.service.interval` | `60s` | The time interval at which the Datanode DiskBalancer service checks for imbalance and updates its configuration. | +| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | If true, the DiskBalancer will automatically stop its balancing activity once disks are considered balanced (i.e., all volume densities are within the threshold). | +| `hdds.datanode.disk.balancer.replica.deletion.delay` | `5m` | The delay after a container is successfully moved from source volume to destination volume before the source container replica is deleted. This lazy deletion gives in-flight reads that still hold the old container replica a grace period to complete before that replica is removed. Unit: ns, ms, s, m, h, d. | +| `hdds.datanode.disk.balancer.volume.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy` | The policy class for selecting source and destination volumes for balancing. | +| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | The policy class for selecting which containers to move from a source volume to destination volume. | +| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Timeout for the Datanode DiskBalancer service operations. 
| +| `hdds.datanode.disk.balancer.should.run.default` | `false` | If the balancer fails to read its persisted configuration, this value determines if the service should run by default. | diff --git a/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md b/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md index 65ba7ca3fa1e..e892ee47e7c6 100644 --- a/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md +++ b/hadoop-hdds/docs/content/feature/DiskBalancer.zh.md @@ -230,16 +230,17 @@ ozone admin datanode diskbalancer report --in-service-datanodes --json The DiskBalancer's behavior can be controlled using the following configuration properties in `ozone-site.xml`. -| Property | Default Value | Description | -|-------------------------------------------------------------|----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `hdds.datanode.disk.balancer.enabled` | `false` | 如果为 false,则 Datanode 上的 DiskBalancer 服务将被禁用。将其配置为 true 可启用 DiskBalancer。 | | | | -| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | 百分比(0-100)。如果对于每个卷,其利用率与平均数据节点利用率之差不超过此阈值,则认为数据节点处于平衡状态。 | -| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | 平衡器可用于移动数据的最大带宽(以 MB/s 为单位),以避免影响客户端 I/O。 | -| `hdds.datanode.disk.balancer.parallel.thread` | `5` | 用于并行移动容器的工作线程数。 | -| `hdds.datanode.disk.balancer.service.interval` | `60s` | Datanode DiskBalancer 服务检查不平衡并更新其配置的时间间隔。 | -| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | 如果为真,则一旦磁盘被视为平衡(即所有卷密度都在阈值内),DiskBalancer 将自动停止其平衡活动。 | -| `hdds.datanode.disk.balancer.volume.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy` | 用于选择平衡的源卷和目标卷的策略类。 | -| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | 
用于选择将哪些容器从源卷移动到目标卷的策略类。 | -| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Datanode DiskBalancer 服务操作超时。 | -| `hdds.datanode.disk.balancer.should.run.default` | `false` | 如果平衡器无法读取其持久配置,则该值决定服务是否应默认运行。 | +| Property | Default Value | Description | +|-------------------------------------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `hdds.datanode.disk.balancer.enabled` | `false` | 如果为 false,则 Datanode 上的 DiskBalancer 服务将被禁用。将其配置为 true 可启用 DiskBalancer。 | | | | +| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | 百分比(0-100)。如果对于每个卷,其利用率与平均数据节点利用率之差不超过此阈值,则认为数据节点处于平衡状态。 | +| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | 平衡器可用于移动数据的最大带宽(以 MB/s 为单位),以避免影响客户端 I/O。 | +| `hdds.datanode.disk.balancer.parallel.thread` | `5` | 用于并行移动容器的工作线程数。 | +| `hdds.datanode.disk.balancer.service.interval` | `60s` | Datanode DiskBalancer 服务检查不平衡并更新其配置的时间间隔。 | +| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | 如果为真,则一旦磁盘被视为平衡(即所有卷密度都在阈值内),DiskBalancer 将自动停止其平衡活动。 | +| `hdds.datanode.disk.balancer.replica.deletion.delay` | `5m` | 容器成功从源卷移动到目标卷后,源容器副本被删除前的延迟时间。这种延迟删除机制旨在避免旧副本的即时删除导致持有旧容器副本的线程数据读取失败。单位:ns、ms、s、m、h、d。| +| `hdds.datanode.disk.balancer.volume.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy` | 用于选择平衡的源卷和目标卷的策略类。 | +| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | 用于选择将哪些容器从源卷移动到目标卷的策略类。 | +| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Datanode DiskBalancer 服务操作超时。 | +| `hdds.datanode.disk.balancer.should.run.default` | `false` | 如果平衡器无法读取其持久配置,则该值决定服务是否应默认运行。 |