From 9e40d37a6c128edb5e60d74786be0e99f3ed74a6 Mon Sep 17 00:00:00 2001 From: Amol Patil <9298683+adp2201@users.noreply.github.com> Date: Wed, 11 Mar 2026 14:28:10 -0700 Subject: [PATCH 1/3] Add CRaC lifecycle integration test scaffold --- .../CracLifecycleIntegrationTest.java | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 integration-tests/src/test/java/io/opentelemetry/CracLifecycleIntegrationTest.java diff --git a/integration-tests/src/test/java/io/opentelemetry/CracLifecycleIntegrationTest.java b/integration-tests/src/test/java/io/opentelemetry/CracLifecycleIntegrationTest.java new file mode 100644 index 00000000000..7e963c94cfe --- /dev/null +++ b/integration-tests/src/test/java/io/opentelemetry/CracLifecycleIntegrationTest.java @@ -0,0 +1,127 @@ +/* + * Copyright The OpenTelemetry Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +package io.opentelemetry; + +import static org.assertj.core.api.Assertions.assertThat; + +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.sdk.OpenTelemetrySdk; +import io.opentelemetry.sdk.common.CompletableResultCode; +import io.opentelemetry.sdk.trace.SdkTracerProvider; +import io.opentelemetry.sdk.trace.data.SpanData; +import io.opentelemetry.sdk.trace.export.SimpleSpanProcessor; +import io.opentelemetry.sdk.trace.export.SpanExporter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +/** + * Integration-style lifecycle tests for CRaC-like checkpoint/restore workflows. + * + *
In CRaC flows, applications typically need to close resources at checkpoint and resume normal
+ * behavior after restore.
+ */
+class CracLifecycleIntegrationTest {
+
+ @Test
+ void exportsDoNotResumeAfterShutdown_currentBehavior() {
+ LifecycleSpanExporter exporter = new LifecycleSpanExporter();
+ OpenTelemetrySdk sdk =
+ OpenTelemetrySdk.builder()
+ .setTracerProvider(
+ SdkTracerProvider.builder()
+ .addSpanProcessor(SimpleSpanProcessor.create(exporter))
+ .build())
+ .build();
+
+ try {
+ Tracer tracer = sdk.getTracer("crac-lifecycle-test");
+
+ emitSpan(tracer, "before-checkpoint");
+ sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+ assertThat(exporter.exportedSpanCount()).isEqualTo(1);
+
+ sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
+
+ // Simulate post-restore traffic on the same initialized SDK.
+ emitSpan(tracer, "after-restore");
+ sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+
+ assertThat(exporter.exportedSpanCount()).isEqualTo(1);
+ } finally {
+ sdk.close();
+ }
+ }
+
+ @Test
+ @Disabled("Expected to fail until #6756 is addressed with checkpoint/restore-safe lifecycle")
+ void exportsShouldResumeAfterRestore_expectedBehavior() {
+ LifecycleSpanExporter exporter = new LifecycleSpanExporter();
+ OpenTelemetrySdk sdk =
+ OpenTelemetrySdk.builder()
+ .setTracerProvider(
+ SdkTracerProvider.builder()
+ .addSpanProcessor(SimpleSpanProcessor.create(exporter))
+ .build())
+ .build();
+
+ try {
+ Tracer tracer = sdk.getTracer("crac-lifecycle-test");
+
+ emitSpan(tracer, "before-checkpoint");
+ sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+ assertThat(exporter.exportedSpanCount()).isEqualTo(1);
+
+ sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
+
+ // Desired behavior for CRaC-style restore: post-restore spans should export again.
+ emitSpan(tracer, "after-restore");
+ sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+
+ assertThat(exporter.exportedSpanCount()).isEqualTo(2);
+ } finally {
+ sdk.close();
+ }
+ }
+
+ private static void emitSpan(Tracer tracer, String name) {
+ Span span = tracer.spanBuilder(name).startSpan();
+ span.end();
+ }
+
+ private static final class LifecycleSpanExporter implements SpanExporter {
+ private final List In CRaC flows, applications typically need to close resources at checkpoint and resume normal
- * behavior after restore.
+ * These tests use {@link MockCracContext} to simulate the CRaC checkpoint/restore lifecycle
+ * without a CRaC-enabled JDK. Resources register with the mock context; the test then drives {@code
+ * beforeCheckpoint} and {@code afterRestore} callbacks directly.
+ *
+ * See: #6756
*/
class CracLifecycleIntegrationTest {
+ /**
+ * Demonstrates the failure mode when the SDK is naively shut down at checkpoint with no
+ * corresponding restore logic. This is what happens today without proper CRaC support: the SDK is
+ * a one-shot object, so spans emitted after a restore are silently dropped.
+ */
@Test
- void exportsDoNotResumeAfterShutdown_currentBehavior() {
- LifecycleSpanExporter exporter = new LifecycleSpanExporter();
- OpenTelemetrySdk sdk =
- OpenTelemetrySdk.builder()
- .setTracerProvider(
- SdkTracerProvider.builder()
- .addSpanProcessor(SimpleSpanProcessor.create(exporter))
- .build())
- .build();
-
- try {
- Tracer tracer = sdk.getTracer("crac-lifecycle-test");
-
- emitSpan(tracer, "before-checkpoint");
- sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
- assertThat(exporter.exportedSpanCount()).isEqualTo(1);
-
- sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
-
- // Simulate post-restore traffic on the same initialized SDK.
- emitSpan(tracer, "after-restore");
- sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
-
- assertThat(exporter.exportedSpanCount()).isEqualTo(1);
- } finally {
- sdk.close();
- }
+  void spansDroppedAfterRestore_naiveCracIntegration() throws Exception {
+    MockCracContext cracContext = new MockCracContext();
+    InMemorySpanExporter exporter = new InMemorySpanExporter();
+
+    // try-with-resources so the SDK is closed even when an assertion below fails.
+    try (OpenTelemetrySdk sdk = buildSdk(exporter)) {
+      Tracer tracer = sdk.getTracer("crac-lifecycle-test");
+
+      // Naive CRaC resource: shuts the SDK down at checkpoint, does nothing on restore.
+      cracContext.register(
+          new Resource() {
+            @Override
+            public void beforeCheckpoint(Context<? extends Resource> context) {
+              sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
+            }
+
+            @Override
+            public void afterRestore(Context<? extends Resource> context) {
+              // No restore logic — this is the gap that #6756 addresses.
+            }
+          });
+
+      emitSpan(tracer, "before-checkpoint");
+      sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+      assertThat(exporter.exportedCount()).isEqualTo(1);
+
+      cracContext.simulateCheckpoint();
+      cracContext.simulateRestore();
+
+      // Post-restore span is silently dropped: the shut-down SDK has no way to reinitialize.
+      emitSpan(tracer, "after-restore");
+      sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+      assertThat(exporter.exportedCount()).isEqualTo(1);
+    }
   }
+ /**
+ * Describes the desired behavior once the SDK properly implements {@link Resource}: spans emitted
+ * after a CRaC restore should be exported normally.
+ *
+ * <p>This test is disabled until #6756 is addressed.
+ * When that work lands, the SDK (or an adapter it exposes) should register with the CRaC context
+ * so that {@code beforeCheckpoint} flushes and quiesces, and {@code afterRestore} reinitializes
+ * exporters and processors. Replace the TODO below with the real SDK API.
+ */
@Test
- @Disabled("Expected to fail until #6756 is addressed with checkpoint/restore-safe lifecycle")
- void exportsShouldResumeAfterRestore_expectedBehavior() {
- LifecycleSpanExporter exporter = new LifecycleSpanExporter();
- OpenTelemetrySdk sdk =
- OpenTelemetrySdk.builder()
- .setTracerProvider(
- SdkTracerProvider.builder()
- .addSpanProcessor(SimpleSpanProcessor.create(exporter))
- .build())
- .build();
-
- try {
- Tracer tracer = sdk.getTracer("crac-lifecycle-test");
-
- emitSpan(tracer, "before-checkpoint");
- sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
- assertThat(exporter.exportedSpanCount()).isEqualTo(1);
-
- sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
-
- // Desired behavior for CRaC-style restore: post-restore spans should export again.
- emitSpan(tracer, "after-restore");
- sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
-
- assertThat(exporter.exportedSpanCount()).isEqualTo(2);
- } finally {
- sdk.close();
- }
+  // Must stay disabled: the placeholder afterRestore below does nothing, so the final assertion
+  // cannot pass until the SDK gains a restore path.
+  // NOTE(review): fully qualified because this chunk does not show whether the Disabled import
+  // is still present; switch to the simple name if it is.
+  @org.junit.jupiter.api.Disabled(
+      "Expected to fail until #6756 adds checkpoint/restore-safe SDK lifecycle")
+  void spansExportedAfterRestore_properCracIntegration() throws Exception {
+    MockCracContext cracContext = new MockCracContext();
+    InMemorySpanExporter exporter = new InMemorySpanExporter();
+
+    // try-with-resources so the SDK is closed even when an assertion below fails.
+    try (OpenTelemetrySdk sdk = buildSdk(exporter)) {
+      Tracer tracer = sdk.getTracer("crac-lifecycle-test");
+
+      // TODO(#6756): replace this placeholder with the real SDK CRaC API, e.g.:
+      // cracContext.register(sdk.asCracResource());
+      cracContext.register(
+          new Resource() {
+            @Override
+            public void beforeCheckpoint(Context<? extends Resource> context) throws Exception {
+              sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
+            }
+
+            @Override
+            public void afterRestore(Context<? extends Resource> context) throws Exception {
+              // Reinitialize: reopen connections, restart background threads.
+              // No SDK API exists for this yet — this is the body of #6756.
+            }
+          });
+
+      emitSpan(tracer, "before-checkpoint");
+      sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+      assertThat(exporter.exportedCount()).isEqualTo(1);
+
+      cracContext.simulateCheckpoint();
+      cracContext.simulateRestore();
+
+      // Desired outcome: the span emitted after restore is exported as well.
+      emitSpan(tracer, "after-restore");
+      sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
+      assertThat(exporter.exportedCount()).isEqualTo(2);
+    }
+  }
+
+  /** Builds an SDK whose tracer provider hands spans to {@code exporter} via a simple processor. */
+  private static OpenTelemetrySdk buildSdk(SpanExporter exporter) {
+    SdkTracerProvider tracerProvider =
+        SdkTracerProvider.builder().addSpanProcessor(SimpleSpanProcessor.create(exporter)).build();
+
+    return OpenTelemetrySdk.builder().setTracerProvider(tracerProvider).build();
   }
private static void emitSpan(Tracer tracer, String name) {
@@ -96,8 +133,8 @@ private static void emitSpan(Tracer tracer, String name) {
span.end();
}
- private static final class LifecycleSpanExporter implements SpanExporter {
- private final List Notification order follows the CRaC specification: checkpoint callbacks fire in reverse
+ * registration order; restore callbacks fire in forward registration order.
+ */
+final class MockCracContext extends Context