Skip to content

Commit 54d1211

Browse files
craig[bot] and xxmplus committed
Merge #158301
158301: Pebble: bump Pebble version and update to support the new Metrics.WALMetrics r=xxmplus a=xxmplus This PR covers two main changes. First, bump the pebble changes: * [`de16e152`](cockroachdb/pebble@de16e152) db: export SpanPolicy and ValueStoragePolicyAdjustment as public type aliases * [`e6b971f9`](cockroachdb/pebble@e6b971f9) sstable, compact: move IsLikelyMVCCGarbage to RawWriter interface * [`495be96d`](cockroachdb/pebble@495be96d) valsep: create SSTBlobWriter to write external sst with blob files * [`c9a3b2a4`](cockroachdb/pebble@c9a3b2a4) db,base: move SpanPolicy and related structs to base pkg * [`2d1c267c`](cockroachdb/pebble@2d1c267c) db: move file number assignment to just before metadata creation * [`e2e0fe33`](cockroachdb/pebble@e2e0fe33) github: add nightly crossversion smoke test * [`532d015b`](cockroachdb/pebble@532d015b) github: separate cockroach-go workflow, add issue filing * [`94506e99`](cockroachdb/pebble@94506e99) scripts: add crossversion smoke test * [`b4f6e9e0`](cockroachdb/pebble@b4f6e9e0) fix windows nightly * [`455e5597`](cockroachdb/pebble@455e5597) *: use crstrings.Lines[Seq], strings.SplitSeq * [`91d1ff9c`](cockroachdb/pebble@91d1ff9c) *: use crstrings.LinesSeq * [`ce957f43`](cockroachdb/pebble@ce957f43) go.mod: upgrade github.com/cockroachdb/crlib * [`d7ce913e`](cockroachdb/pebble@d7ce913e) objstorage: support cold tier * [`4e659708`](cockroachdb/pebble@4e659708) objstorageprovider: improve "open" test directive * [`e0b71b58`](cockroachdb/pebble@e0b71b58) objstorage: local settings cleanup * [`818109f1`](cockroachdb/pebble@818109f1) objstorageprovider: minor local subsystem cleanup * [`936d14cd`](cockroachdb/pebble@936d14cd) github: remove labeled type from pr bot review triggering condition * [`a6c4a9df`](cockroachdb/pebble@a6c4a9df) cmd/pebble: ensure minimum mvcc is set for newPebbleDB * [`4f7159ac`](cockroachdb/pebble@4f7159ac) Update zstd_nocgo.go * [`6b56ed26`](cockroachdb/pebble@6b56ed26) Update zstd_cgo.go * 
[`a9404508`](cockroachdb/pebble@a9404508) Update zstd_nocgo.go * [`ed175670`](cockroachdb/pebble@ed175670) feat: add gozstd build tag to allow to configure zstd lib * [`4264547a`](cockroachdb/pebble@4264547a) db: improve metrics and logs to measure all filesystem ops * [`6f00a886`](cockroachdb/pebble@6f00a886) db: log full path to WAL file in disk slow info ------ Second, update the LogWriter metric references to use the new Metrics.WALMetrics and expose WALSecondaryFileOpLatency as a metric. Co-authored-by: Xuming Xu <andy.xu@cockroachlabs.com>
2 parents 013833d + 6a200e8 commit 54d1211

File tree

13 files changed

+116
-23
lines changed

13 files changed

+116
-23
lines changed

DEPS.bzl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1834,10 +1834,10 @@ def go_deps():
18341834
patches = [
18351835
"@com_github_cockroachdb_cockroach//build/patches:com_github_cockroachdb_pebble.patch",
18361836
],
1837-
sha256 = "e70a40a057027337af8f5bc3758dc0b1bd30c81a96f7d42455856c16f08b6309",
1838-
strip_prefix = "github.com/cockroachdb/pebble@v0.0.0-20251119182253-219f3fc13027",
1837+
sha256 = "f7297c003d1fc223ee139c0d27827072c4b4b2f131d391fb521eac598bd63b86",
1838+
strip_prefix = "github.com/cockroachdb/pebble@v0.0.0-20251125175721-de16e1520951",
18391839
urls = [
1840-
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20251119182253-219f3fc13027.zip",
1840+
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20251125175721-de16e1520951.zip",
18411841
],
18421842
)
18431843
go_repository(

build/bazelutil/distdir_files.bzl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ DISTDIR_FILES = {
352352
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/gostdlib/com_github_cockroachdb_gostdlib-v1.19.0.zip": "c4d516bcfe8c07b6fc09b8a9a07a95065b36c2855627cb3514e40c98f872b69e",
353353
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/logtags/com_github_cockroachdb_logtags-v0.0.0-20241215232642-bb51bb14a506.zip": "920068af09e3846d9ebb4e4a7787ff1dd10f3989c5f940ad861b0f6a9f824f6e",
354354
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/metamorphic/com_github_cockroachdb_metamorphic-v0.0.0-20231108215700-4ba948b56895.zip": "28c8cf42192951b69378cf537be5a9a43f2aeb35542908cc4fe5f689505853ea",
355-
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20251119182253-219f3fc13027.zip": "e70a40a057027337af8f5bc3758dc0b1bd30c81a96f7d42455856c16f08b6309",
355+
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/pebble/com_github_cockroachdb_pebble-v0.0.0-20251125175721-de16e1520951.zip": "f7297c003d1fc223ee139c0d27827072c4b4b2f131d391fb521eac598bd63b86",
356356
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/redact/com_github_cockroachdb_redact-v1.1.6.zip": "018eccb5fb9ca52d43ec9eaf213539d01c1f2b94e0e822406ebfb2e9321ef6cf",
357357
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/returncheck/com_github_cockroachdb_returncheck-v0.0.0-20200612231554-92cdbca611dd.zip": "ce92ba4352deec995b1f2eecf16eba7f5d51f5aa245a1c362dfe24c83d31f82b",
358358
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/cockroachdb/stress/com_github_cockroachdb_stress-v0.0.0-20220803192808-1806698b1b7b.zip": "3fda531795c600daf25532a4f98be2a1335cd1e5e182c72789bca79f5f69fcc1",

build/patches/com_github_cockroachdb_pebble.patch

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
diff --git a/internal/buildtags/BUILD.bazel b/internal/buildtags/BUILD.bazel
2-
index a556ba522..db4fbfc14 100644
2+
index a556ba5..db4fbfc 100644
33
--- a/internal/buildtags/BUILD.bazel
44
+++ b/internal/buildtags/BUILD.bazel
55
@@ -5,17 +5,29 @@ go_library(
@@ -35,7 +35,7 @@ index a556ba522..db4fbfc14 100644
3535
name = "go_default_library",
3636
actual = ":buildtags",
3737
diff --git a/internal/invariants/BUILD.bazel b/internal/invariants/BUILD.bazel
38-
index 3dd80d650..36dae2b77 100644
38+
index 3dd80d6..36dae2b 100644
3939
--- a/internal/invariants/BUILD.bazel
4040
+++ b/internal/invariants/BUILD.bazel
4141
@@ -4,14 +4,25 @@ go_library(
@@ -68,10 +68,10 @@ index 3dd80d650..36dae2b77 100644
6868
name = "go_default_library",
6969
actual = ":invariants",
7070
diff --git a/internal/treesteps/BUILD.bazel b/internal/treesteps/BUILD.bazel
71-
index e4fbbab69..e1cccbe79 100644
71+
index b6e76b3..9498a30 100644
7272
--- a/internal/treesteps/BUILD.bazel
7373
+++ b/internal/treesteps/BUILD.bazel
74-
@@ -5,13 +5,25 @@ go_library(
74+
@@ -5,8 +5,10 @@ go_library(
7575
srcs = [
7676
"data.go",
7777
"doc.go",
@@ -83,7 +83,9 @@ index e4fbbab69..e1cccbe79 100644
8383
+ }),
8484
importpath = "github.com/cockroachdb/pebble/internal/treesteps",
8585
visibility = ["//:__subpackages__"],
86-
deps = ["//internal/treeprinter"],
86+
deps = [
87+
@@ -15,6 +17,16 @@ go_library(
88+
],
8789
)
8890

8991
+REMOVE_GO_BUILD_CONSTRAINTS = "cat $< | grep -v '//go:build' | grep -v '// +build' > $@"
@@ -100,7 +102,7 @@ index e4fbbab69..e1cccbe79 100644
100102
name = "go_default_library",
101103
actual = ":treesteps",
102104
diff --git a/objstorage/objstorageprovider/objiotracing/BUILD.bazel b/objstorage/objstorageprovider/objiotracing/BUILD.bazel
103-
index 171354745..198824703 100644
105+
index 90c6265..5092542 100644
104106
--- a/objstorage/objstorageprovider/objiotracing/BUILD.bazel
105107
+++ b/objstorage/objstorageprovider/objiotracing/BUILD.bazel
106108
@@ -5,6 +5,7 @@ go_library(

docs/generated/metrics/metrics.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10404,6 +10404,15 @@ layers:
1040410404
derivative: NONE
1040510405
how_to_use: If this value is greater than 100ms, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. When WAL failover is configured, the more relevant metric is storage.wal.failover_write_and_sync.latency, as this metric reflects the fsync latency of the primary and/or the secondary WAL device.
1040610406
visibility: ESSENTIAL
10407+
- name: storage.wal.secondary.file_op.latency
10408+
exported_name: storage_wal_secondary_file_op_latency
10409+
description: The latency of file operations on the secondary Write-Ahead Log device.
10410+
y_axis_label: File Op Latency
10411+
type: HISTOGRAM
10412+
unit: NANOSECONDS
10413+
aggregation: AVG
10414+
derivative: NONE
10415+
how_to_use: Only populated when WAL failover is configured. This metric tracks file operation latency specifically on the secondary WAL device.
1040710416
- name: storage.write-stalls
1040810417
exported_name: storage_write_stalls
1040910418
description: Number of instances of intentional write stalls to backpressure incoming writes

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ require (
137137
github.com/cockroachdb/errors v1.12.1-0.20251010171200-64801262cd6f
138138
github.com/cockroachdb/gostdlib v1.19.0
139139
github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506
140-
github.com/cockroachdb/pebble v0.0.0-20251119182253-219f3fc13027
140+
github.com/cockroachdb/pebble v0.0.0-20251125175721-de16e1520951
141141
github.com/cockroachdb/redact v1.1.6
142142
github.com/cockroachdb/returncheck v0.0.0-20200612231554-92cdbca611dd
143143
github.com/cockroachdb/tokenbucket v0.0.0-20250429170803-42689b6311bb

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -572,8 +572,8 @@ github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506 h1:ASDL+UJcILM
572572
github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506/go.mod h1:Mw7HqKr2kdtu6aYGn3tPmAftiP3QPX63LdK/zcariIo=
573573
github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895 h1:XANOgPYtvELQ/h4IrmPAohXqe2pWA8Bwhejr3VQoZsA=
574574
github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895/go.mod h1:aPd7gM9ov9M8v32Yy5NJrDyOcD8z642dqs+F0CeNXfA=
575-
github.com/cockroachdb/pebble v0.0.0-20251119182253-219f3fc13027 h1:6Rb15Lct/OjpiK4IwuGunbcDXpB3CI8aXu1K8xopu6A=
576-
github.com/cockroachdb/pebble v0.0.0-20251119182253-219f3fc13027/go.mod h1:XCkfpLCGXq5/lUtsHapEvHih54DWRK/AqhWee96YHsU=
575+
github.com/cockroachdb/pebble v0.0.0-20251125175721-de16e1520951 h1:42C86t1Su5UyfzfdYU2lr+ehW41s4f734JHMMGLcW+o=
576+
github.com/cockroachdb/pebble v0.0.0-20251125175721-de16e1520951/go.mod h1:4p9u+hqtw8sRxiY5cNFWETymMf4brYGG90pS8MzlXmg=
577577
github.com/cockroachdb/redact v1.1.6 h1:zXJBwDZ84xJNlHl1rMyCojqyIxv+7YUpQiJLQ7n4314=
578578
github.com/cockroachdb/redact v1.1.6/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg=
579579
github.com/cockroachdb/returncheck v0.0.0-20200612231554-92cdbca611dd h1:KFOt5I9nEKZgCnOSmy8r4Oykh8BYQO8bFOTgHDS8YZA=

pkg/kv/kvserver/metrics.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3125,6 +3125,9 @@ var (
31253125
Measurement: "Events",
31263126
Unit: metric.Unit_COUNT,
31273127
}
3128+
// metaStorageFsyncLatency tracks the primary WAL device latency.
3129+
// For secondary WAL device latency (when WAL failover is configured),
3130+
// see metaStorageWALSecondaryFileOpLatency.
31283131
metaStorageFsyncLatency = metric.Metadata{
31293132
Name: "storage.wal.fsync.latency",
31303133
Help: "The fsync latency to the Write-Ahead Log device.",
@@ -3141,6 +3144,14 @@ var (
31413144
device.
31423145
`),
31433146
}
3147+
metaStorageWALSecondaryFileOpLatency = metric.Metadata{
3148+
Name: "storage.wal.secondary.file_op.latency",
3149+
Help: "The latency of file operations on the secondary Write-Ahead Log device.",
3150+
Measurement: "File Op Latency",
3151+
Unit: metric.Unit_NANOSECONDS,
3152+
Category: metric.Metadata_STORAGE,
3153+
HowToUse: "Only populated when WAL failover is configured. This metric tracks file operation latency specifically on the secondary WAL device.",
3154+
}
31443155
metaStorageWALFailoverSwitchCount = metric.Metadata{
31453156
Name: "storage.wal.failover.switch.count",
31463157
Help: crstrings.UnwrapText(`
@@ -3753,7 +3764,12 @@ type StoreMetrics struct {
37533764
SubsumeLocksWritten *metric.Counter
37543765

37553766
FlushUtilization *metric.GaugeFloat64
3756-
FsyncLatency *metric.ManualWindowHistogram
3767+
// FsyncLatency tracks file operation latency for the primary WAL device.
3768+
// When WAL failover is configured, see also WALSecondaryFileOpLatency.
3769+
FsyncLatency *metric.ManualWindowHistogram
3770+
// WALSecondaryFileOpLatency tracks file operation latency for the secondary
3771+
// WAL device. Only populated when WAL failover is configured.
3772+
WALSecondaryFileOpLatency *metric.ManualWindowHistogram
37573773

37583774
// Disk metrics
37593775
DiskReadBytes *metric.Counter
@@ -4567,6 +4583,11 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
45674583
pebble.FsyncLatencyBuckets,
45684584
false, /* withRotate */
45694585
),
4586+
WALSecondaryFileOpLatency: metric.NewManualWindowHistogram(
4587+
metaStorageWALSecondaryFileOpLatency,
4588+
pebble.FsyncLatencyBuckets,
4589+
false, /* withRotate */
4590+
),
45704591

45714592
ReplicaReadBatchDroppedLatchesBeforeEval: metric.NewCounter(metaReplicaReadBatchDroppedLatchesBeforeEval),
45724593
ReplicaReadBatchWithoutInterleavingIter: metric.NewCounter(metaReplicaReadBatchWithoutInterleavingIter),

pkg/kv/kvserver/metrics_test.go

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ func TestPebbleDiskWriteMetrics(t *testing.T) {
102102
defer cleanup()
103103

104104
ctx := context.Background()
105-
ts, _, kvDB := serverutils.StartServer(t, base.TestServerArgs{
105+
ts := serverutils.StartServerOnly(t, base.TestServerArgs{
106106
DefaultTestTenant: base.TestControlsTenantsExplicitly,
107107
StoreSpecs: []base.StoreSpec{
108108
{Size: storageconfig.BytesSize(storageconfig.MinimumStoreSize), Path: tmpDir},
@@ -111,7 +111,7 @@ func TestPebbleDiskWriteMetrics(t *testing.T) {
111111
defer ts.Stopper().Stop(ctx)
112112

113113
// Force a WAL write.
114-
require.NoError(t, kvDB.Put(ctx, "kev", "value"))
114+
require.NoError(t, ts.DB().Put(ctx, "kev", "value"))
115115

116116
if err := ts.GetStores().(*Stores).VisitStores(func(s *Store) error {
117117
testutils.SucceedsSoon(t, func() error {
@@ -125,3 +125,36 @@ func TestPebbleDiskWriteMetrics(t *testing.T) {
125125
t.Fatal(err)
126126
}
127127
}
128+
129+
// TestWALSecondaryFileOpLatencyMetric verifies that the secondary WAL file
130+
// operation latency metric is properly registered and accessible.
131+
func TestWALSecondaryFileOpLatencyMetric(t *testing.T) {
132+
defer leaktest.AfterTest(t)()
133+
defer log.Scope(t).Close(t)
134+
135+
tmpDir, cleanup := testutils.TempDir(t)
136+
defer cleanup()
137+
138+
ctx := context.Background()
139+
ts := serverutils.StartServerOnly(t, base.TestServerArgs{
140+
DefaultTestTenant: base.TestControlsTenantsExplicitly,
141+
StoreSpecs: []base.StoreSpec{
142+
{Size: storageconfig.BytesSize(storageconfig.MinimumStoreSize), Path: tmpDir},
143+
},
144+
})
145+
defer ts.Stopper().Stop(ctx)
146+
147+
// Verify the secondary WAL file operation latency metric is registered.
148+
if err := ts.GetStores().(*Stores).VisitStores(func(s *Store) error {
149+
if ok := s.Registry().Contains("storage.wal.secondary.file_op.latency"); !ok {
150+
return fmt.Errorf("missing secondary WAL file operation latency metric")
151+
}
152+
// Verify the metric is non-nil in the store metrics.
153+
if s.metrics.WALSecondaryFileOpLatency == nil {
154+
return fmt.Errorf("WALSecondaryFileOpLatency metric is nil")
155+
}
156+
return nil
157+
}); err != nil {
158+
t.Fatal(err)
159+
}
160+
}

pkg/kv/kvserver/raftstorebench/stats.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,22 @@ func newAggStats() *aggStats {
9090
func printLogWriterMetrics(t T, prefix string, m storage.Metrics) {
9191
var b strings.Builder
9292
_, _ = fmt.Fprintf(&b, "%s logWriter: rate(peak): %s(%s) len(pending,sync): (%.2f,%.2f)",
93-
prefix, humanize.IBytes(uint64(m.LogWriter.WriteThroughput.Rate())),
94-
humanize.IBytes(uint64(m.LogWriter.WriteThroughput.PeakRate())),
95-
m.LogWriter.PendingBufferLen.Mean(), m.LogWriter.SyncQueueLen.Mean())
96-
qs := calculateQuantiles(m.LogWriter.FsyncLatency, 0.5, 0.9, 0.99)
97-
_, _ = fmt.Fprintf(&b, " sync: p50,p90,p99: %.2fms,%.2fms,%.2fms", qs[0]/1e6, qs[1]/1e6, qs[2]/1e6)
93+
prefix, humanize.IBytes(uint64(m.Metrics.WALMetrics.WriteThroughput.Rate())),
94+
humanize.IBytes(uint64(m.Metrics.WALMetrics.WriteThroughput.PeakRate())),
95+
m.Metrics.WALMetrics.PendingBufferLen.Mean(), m.Metrics.WALMetrics.SyncQueueLen.Mean())
96+
97+
// Log primary file operation latency
98+
qsPrimary := calculateQuantiles(m.Metrics.WALMetrics.PrimaryFileOpLatency, 0.5, 0.9, 0.99)
99+
_, _ = fmt.Fprintf(&b, " primary: p50,p90,p99: %.2fms,%.2fms,%.2fms",
100+
qsPrimary[0]/1e6, qsPrimary[1]/1e6, qsPrimary[2]/1e6)
101+
102+
// Log secondary file operation latency if available (WAL failover scenarios)
103+
if m.Metrics.WALMetrics.SecondaryFileOpLatency != nil {
104+
qsSecondary := calculateQuantiles(m.Metrics.WALMetrics.SecondaryFileOpLatency, 0.5, 0.9, 0.99)
105+
_, _ = fmt.Fprintf(&b, " secondary: p50,p90,p99: %.2fms,%.2fms,%.2fms",
106+
qsSecondary[0]/1e6, qsSecondary[1]/1e6, qsSecondary[2]/1e6)
107+
}
108+
98109
logf(t, "%s", &b)
99110
}
100111

pkg/kv/kvserver/store.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3805,9 +3805,15 @@ func (s *Store) ComputeMetricsPeriodically(
38053805
wt.Subtract(prevMetrics.FlushWriteThroughput)
38063806

38073807
if err := updateWindowedHistogram(
3808-
prevMetrics.WALFsyncLatency, m.LogWriter.FsyncLatency, s.metrics.FsyncLatency); err != nil {
3808+
prevMetrics.WALFsyncLatency, m.Metrics.WALMetrics.PrimaryFileOpLatency, s.metrics.FsyncLatency); err != nil {
38093809
return m, err
38103810
}
3811+
if m.Metrics.WALMetrics.SecondaryFileOpLatency != nil {
3812+
if err := updateWindowedHistogram(
3813+
prevMetrics.WALSecondaryFileOpLatency, m.Metrics.WALMetrics.SecondaryFileOpLatency, s.metrics.WALSecondaryFileOpLatency); err != nil {
3814+
return m, err
3815+
}
3816+
}
38113817
if m.WAL.Failover.FailoverWriteAndSyncLatency != nil {
38123818
if err := updateWindowedHistogram(prevMetrics.WALFailoverWriteAndSyncLatency,
38133819
m.WAL.Failover.FailoverWriteAndSyncLatency, s.metrics.WALFailoverWriteAndSyncLatency); err != nil {

0 commit comments

Comments
 (0)