Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,15 @@ public final class DiskBalancerConfiguration {
description = "If true, the DiskBalancer will automatically stop once disks are balanced.")
private boolean stopAfterDiskEven = true;

// Grace period between a successful container move and the deletion of the
// source replica. The field holds the value in milliseconds; the in-code
// default below mirrors the annotation's "5m" default.
@Config(key = "hdds.datanode.disk.balancer.replica.deletion.delay",
defaultValue = "5m",
type = ConfigType.TIME,
tags = { DATANODE, ConfigTag.DISKBALANCER },
description = "The delay after a container is successfully moved from source volume to " +
"destination volume before the source container replica is deleted. " +
"Unit could be defined with postfix (ns,ms,s,m,h,d).")
private long replicaDeletionDelay = Duration.ofMinutes(5).toMillis();

public DiskBalancerConfiguration(Double threshold,
Long bandwidthInMB,
Integer parallelThread,
Expand Down Expand Up @@ -181,6 +190,15 @@ public void setStopAfterDiskEven(boolean stopAfterDiskEven) {
this.stopAfterDiskEven = stopAfterDiskEven;
}

/**
 * Returns how long the source container replica is kept after a move
 * before it is deleted.
 *
 * @return the replica deletion delay, expressed in milliseconds
 */
public long getReplicaDeletionDelay() {
  return this.replicaDeletionDelay;
}

/**
* Gets the threshold value for DiskBalancer.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public class DiskBalancerService extends BackgroundService {
LoggerFactory.getLogger(DiskBalancerService.class);

public static final String DISK_BALANCER_DIR = "diskBalancer";
private static long replicaDeletionDelayMills = 60 * 60 * 1000L; // 60 minutes
private long replicaDeletionDelay;

private OzoneContainer ozoneContainer;
private final ConfigurationSource conf;
Expand Down Expand Up @@ -162,6 +162,8 @@ public DiskBalancerService(OzoneContainer ozoneContainer,
throw new IOException(e);
}

replicaDeletionDelay = conf.getObject(DiskBalancerConfiguration.class)
.getReplicaDeletionDelay();
metrics = DiskBalancerServiceMetrics.create();

loadDiskBalancerInfo();
Expand Down Expand Up @@ -617,7 +619,7 @@ public BackgroundTaskResult call() {
}
if (moveSucceeded) {
// Add current old container to pendingDeletionContainers.
pendingDeletionContainers.put(System.currentTimeMillis() + replicaDeletionDelayMills, container);
pendingDeletionContainers.put(System.currentTimeMillis() + replicaDeletionDelay, container);
ContainerLogger.logMoveSuccess(containerId, sourceVolume,
destVolume, containerSize, Time.monotonicNow() - startTime);
}
Expand Down Expand Up @@ -657,7 +659,7 @@ private void deleteContainer(Container container) {
container.delete();
container.getContainerData().getVolume().decrementUsedSpace(containerData.getBytesUsed());
LOG.info("Deleted expired container {} after delay {} ms.",
containerData.getContainerID(), replicaDeletionDelayMills);
containerData.getContainerID(), replicaDeletionDelay);
} catch (IOException ex) {
LOG.warn("Failed to delete old container {} after it's marked as DELETED. " +
"It will be handled by background scanners.", container.getContainerData().getContainerID(), ex);
Expand Down Expand Up @@ -824,7 +826,7 @@ public static void setInjector(FaultInjector instance) {
}

@VisibleForTesting
public static void setReplicaDeletionDelayMills(long durationMills) {
replicaDeletionDelayMills = durationMills;
public void setReplicaDeletionDelay(long durationMills) {
this.replicaDeletionDelay = durationMills;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ public void setup() throws Exception {
conf.setFromObject(diskBalancerConfiguration);
diskBalancerService = new DiskBalancerServiceTestImpl(ozoneContainer,
100, conf, 1);
DiskBalancerService.setReplicaDeletionDelayMills(0);
diskBalancerService.setReplicaDeletionDelay(0);
KeyValueContainer.setInjector(kvFaultInjector);
}

Expand Down Expand Up @@ -592,7 +592,7 @@ public void testOldReplicaDelayedDeletion(ContainerTestVersionInfo versionInfo)
throws IOException, InterruptedException {
setLayoutAndSchemaForTest(versionInfo);
long delay = 2000L; // 2 second delay
DiskBalancerService.setReplicaDeletionDelayMills(delay);
diskBalancerService.setReplicaDeletionDelay(delay);

Container container = createContainer(CONTAINER_ID, sourceVolume, State.CLOSED);
KeyValueContainerData keyValueContainerData = (KeyValueContainerData) container.getContainerData();
Expand Down
12 changes: 12 additions & 0 deletions hadoop-hdds/docs/content/design/diskbalancer.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,18 @@ D1 ----> C1-CLOSED --- (5) ---> C1-DELETED
|
D2 ----> Temp C1-CLOSED --- (2) ---> Temp C1-RECOVERING --- (3) ---> C1-RECOVERING --- (4) ---> C1-CLOSED
```

### Lazy Deletion of Source Container Replica

The source container on D1 is **not** deleted immediately after the move completes. Instead, it is scheduled for deletion after a configurable delay, set by `hdds.datanode.disk.balancer.replica.deletion.delay` (**default: 5 minutes**).

**Rationale:** When a container has only one replica and that replica has an in-flight read operation, the read thread may still hold a reference to the old container at the source path.
If the DiskBalancer deleted the old container immediately after the move, the in-flight read would fail because the data it is reading at the source path would no longer exist (the container now lives only at the new path). Lazy deletion provides a
grace period for in-flight reads to complete before the old container is removed, avoiding these immediate read failures.

**Note:** Because of this lazy deletion, the disk utilization of the source volume will not decrease immediately after a container move or after the DiskBalancer is stopped. The freed space
and balanced state will be visible only after the configured delay, when the source container replicas are actually deleted.

## DiskBalancing Policies

By default, the DiskBalancer uses specific policies to decide which disks to balance and which containers to move. These
Expand Down
25 changes: 13 additions & 12 deletions hadoop-hdds/docs/content/feature/DiskBalancer.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,16 +238,17 @@ ozone admin datanode diskbalancer report --in-service-datanodes --json

The DiskBalancer's behavior can be controlled using the following configuration properties in `ozone-site.xml`.

| Property | Default Value | Description |
|-------------------------------------------------------------|----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `hdds.datanode.disk.balancer.enabled` | `false` | If false, the DiskBalancer service on the Datanode is disabled. Configure it to true for diskBalancer to be enabled. |
| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | A percentage (0-100). A datanode is considered balanced if for each volume, its utilization differs from the average datanode utilization by no more than this threshold. |
| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | The maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid impacting client I/O. |
| `hdds.datanode.disk.balancer.parallel.thread` | `5` | The number of worker threads to use for moving containers in parallel. |
| `hdds.datanode.disk.balancer.service.interval` | `60s` | The time interval at which the Datanode DiskBalancer service checks for imbalance and updates its configuration. |
| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | If true, the DiskBalancer will automatically stop its balancing activity once disks are considered balanced (i.e., all volume densities are within the threshold). |
| `hdds.datanode.disk.balancer.volume.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy` | The policy class for selecting source and destination volumes for balancing. |
| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | The policy class for selecting which containers to move from a source volume to destination volume. |
| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Timeout for the Datanode DiskBalancer service operations. |
| `hdds.datanode.disk.balancer.should.run.default` | `false` | If the balancer fails to read its persisted configuration, this value determines if the service should run by default. |
| Property | Default Value | Description |
|-------------------------------------------------------------|----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `hdds.datanode.disk.balancer.enabled` | `false` | If false, the DiskBalancer service on the Datanode is disabled. Configure it to true for diskBalancer to be enabled. |
| `hdds.datanode.disk.balancer.volume.density.threshold.percent` | `10.0` | A percentage (0-100). A datanode is considered balanced if for each volume, its utilization differs from the average datanode utilization by no more than this threshold. |
| `hdds.datanode.disk.balancer.max.disk.throughputInMBPerSec` | `10` | The maximum bandwidth (in MB/s) that the balancer can use for moving data, to avoid impacting client I/O. |
| `hdds.datanode.disk.balancer.parallel.thread` | `5` | The number of worker threads to use for moving containers in parallel. |
| `hdds.datanode.disk.balancer.service.interval` | `60s` | The time interval at which the Datanode DiskBalancer service checks for imbalance and updates its configuration. |
| `hdds.datanode.disk.balancer.stop.after.disk.even` | `true` | If true, the DiskBalancer will automatically stop its balancing activity once disks are considered balanced (i.e., all volume densities are within the threshold). |
| `hdds.datanode.disk.balancer.replica.deletion.delay` | `5m` | The delay after a container is successfully moved from source volume to destination volume before the source container replica is deleted. This lazy deletion gives in-flight reads that still hold the old container replica a grace period to complete before the replica is removed. Supported unit suffixes: ns, ms, s, m, h, d. |
| `hdds.datanode.disk.balancer.volume.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultVolumeChoosingPolicy` | The policy class for selecting source and destination volumes for balancing. |
| `hdds.datanode.disk.balancer.container.choosing.policy` | `org.apache.hadoop.ozone.container.diskbalancer.policy.DefaultContainerChoosingPolicy` | The policy class for selecting which containers to move from a source volume to destination volume. |
| `hdds.datanode.disk.balancer.service.timeout` | `300s` | Timeout for the Datanode DiskBalancer service operations. |
| `hdds.datanode.disk.balancer.should.run.default` | `false` | If the balancer fails to read its persisted configuration, this value determines if the service should run by default. |

Loading