Skip to content

Commit 798c8dd

Browse files
craig[bot] and herkolategan committed
Merge #158781
158781: drtprod: 300 node scale test config for 2025-12 r=shailendra-patel a=herkolategan

Created a config for the 300 node scale test Dec 2025.

Release note: None
Epic: None

Co-authored-by: Herko Lategan <herko@cockroachlabs.com>
2 parents a76eb5d + 413181b commit 798c8dd

File tree

4 files changed: +219 additions, -36 deletions
Lines changed: 171 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,171 @@
1+
# Yaml for creating and configuring the drt-scale cluster. This also configures Datadog.
2+
# Build the roachprod and roachtest binaries (using --cross) before running this script
3+
environment:
4+
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: 622274581499-compute@developer.gserviceaccount.com
5+
ROACHPROD_DNS: drt.crdb.io
6+
ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
7+
ROACHPROD_GCE_DNS_ZONE: drt
8+
ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
9+
CLUSTER: drt-scale-300
10+
WORKLOAD_CLUSTER: workload-scale-300
11+
CLUSTER_NODES: 300
12+
TOTAL_PARTITIONS: 15
13+
WORKLOAD_NODES: 15
14+
15+
dependent_file_locations:
16+
- pkg/cmd/drtprod/scripts/setup_datadog_cluster
17+
- pkg/cmd/drtprod/scripts/setup_datadog_workload
18+
- pkg/cmd/drtprod/scripts/tpcc_init.sh
19+
- pkg/cmd/drtprod/scripts/generate_tpcc_run.sh
20+
- artifacts/roachtest
21+
- artifacts/drtprod
22+
23+
targets:
24+
# crdb cluster specs
25+
- target_name: $CLUSTER
26+
steps:
27+
- command: create
28+
args:
29+
- $CLUSTER
30+
flags:
31+
clouds: gce
32+
gce-managed: true
33+
gce-enable-multiple-stores: true
34+
gce-zones: "us-central1-a:30,us-central1-b:30,us-central1-c:30"
35+
nodes: $CLUSTER_NODES
36+
gce-machine-type: n2-standard-16
37+
local-ssd: false
38+
gce-pd-volume-size: 2048
39+
gce-pd-volume-type: pd-ssd
40+
gce-pd-volume-count: 2
41+
os-volume-size: 100
42+
username: drt
43+
lifetime: 8760h
44+
gce-image: "ubuntu-2204-jammy-v20250112"
45+
- command: sync
46+
flags:
47+
clouds: gce
48+
- command: stage
49+
args:
50+
- $CLUSTER
51+
- release
52+
- v25.2.0-rc.1 # for libgeos
53+
- command: stage
54+
args:
55+
- $CLUSTER
56+
- cockroach
57+
- release-25.2.1-rc
58+
- script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
59+
- command: start
60+
args:
61+
- $CLUSTER
62+
- "--binary"
63+
- "./cockroach"
64+
flags:
65+
# add flag to set provisioned throughput on each store according to their cloud provider limits
66+
enable-fluent-sink: true
67+
store-count: 2
68+
args: --wal-failover=among-stores
69+
restart: false
70+
sql-port: 26257
71+
- command: run
72+
args:
73+
- $CLUSTER
74+
- --
75+
- "sudo systemctl unmask cron.service ; sudo systemctl enable cron.service ; echo \"crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -\" > t.sh ; sh t.sh ; rm t.sh"
76+
- command: sql
77+
args:
78+
- $CLUSTER:1
79+
- --
80+
- -e
81+
- "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='256 MB'"
82+
# workload cluster specs
83+
- target_name: $WORKLOAD_CLUSTER
84+
steps:
85+
- command: create
86+
args:
87+
- $WORKLOAD_CLUSTER
88+
flags:
89+
clouds: gce
90+
gce-zones: "us-central1-a"
91+
nodes: $WORKLOAD_NODES
92+
gce-machine-type: n2-standard-8
93+
os-volume-size: 100
94+
username: workload
95+
lifetime: 8760h
96+
gce-image: "ubuntu-2204-jammy-v20250112"
97+
on_rollback:
98+
- command: destroy
99+
args:
100+
- $WORKLOAD_CLUSTER
101+
- command: sync
102+
flags:
103+
clouds: gce
104+
- command: stage
105+
args:
106+
- $WORKLOAD_CLUSTER
107+
- cockroach
108+
- command: put
109+
args:
110+
- $WORKLOAD_CLUSTER
111+
- artifacts/roachtest
112+
- roachtest-operations
113+
- command: put
114+
args:
115+
- $WORKLOAD_CLUSTER
116+
- artifacts/drtprod
117+
- script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
118+
- target_name: post_tasks
119+
dependent_targets:
120+
- $CLUSTER
121+
- $WORKLOAD_CLUSTER
122+
steps:
123+
- script: rm
124+
args:
125+
- -rf
126+
- certs-$CLUSTER
127+
- command: get
128+
args:
129+
- $CLUSTER:1
130+
- certs
131+
- certs-$CLUSTER
132+
- command: put
133+
args:
134+
- $WORKLOAD_CLUSTER
135+
- certs-$CLUSTER
136+
- certs
137+
- command: ssh
138+
args:
139+
- $WORKLOAD_CLUSTER
140+
- --
141+
- chmod
142+
- 600
143+
- './certs/*'
144+
- script: "pkg/cmd/drtprod/scripts/tpcc_init.sh"
145+
args:
146+
- cct_tpcc
147+
- false
148+
flags:
149+
warehouses: 4000000
150+
db: cct_tpcc
151+
- script: pkg/cmd/drtprod/scripts/populate_workload_keys.sh
152+
- target_name: tpcc_run
153+
dependent_targets:
154+
- $CLUSTER
155+
- $WORKLOAD_CLUSTER
156+
steps:
157+
- script: "pkg/cmd/drtprod/scripts/generate_tpcc_run.sh"
158+
args:
159+
- cct_tpcc
160+
- false
161+
flags:
162+
db: cct_tpcc
163+
warehouses: 4000000
164+
active-warehouses: 500000
165+
active-workers: 2000
166+
conns: 2000
167+
max-rate: 2500
168+
workers: 500000
169+
duration: 12h
170+
ramp: 1h
171+
wait: 0

pkg/cmd/drtprod/configs/drt_scale_300_bench.yaml renamed to pkg/cmd/drtprod/configs/archived/2025_07_drt_scale_300_bench.yaml

File renamed without changes.

pkg/cmd/drtprod/configs/drt_scale_300.yaml

Lines changed: 47 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
1-
# Yaml for creating and configuring the drt-scale cluster. This also configures Datadog.
2-
# Build the roachprod and roachtest binaries (using --cross) before running this script
1+
# YAML for creating and configuring the drt-scale cluster. This also configures Datadog.
2+
# Build the drtprod and roachtest binaries (using --cross=linux) before running this script
3+
#
4+
# Planned Execution Date: 2025-12
35
environment:
46
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: 622274581499-compute@developer.gserviceaccount.com
57
ROACHPROD_DNS: drt.crdb.io
@@ -9,20 +11,28 @@ environment:
911
CLUSTER: drt-scale-300
1012
WORKLOAD_CLUSTER: workload-scale-300
1113
CLUSTER_NODES: 300
12-
TOTAL_PARTITIONS: 15
14+
RACKS: 300
15+
NODES_PER_ZONE: 100
16+
TOTAL_PARTITIONS: 300
17+
PARTITION_TYPE: partitions
1318
WORKLOAD_NODES: 15
19+
VERSION: v25.4.1 # TBD - update once a custom binary is available (also update stage command)
20+
WAREHOUSES: 4000000
1421

1522
dependent_file_locations:
23+
- pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller
1624
- pkg/cmd/drtprod/scripts/setup_datadog_cluster
1725
- pkg/cmd/drtprod/scripts/setup_datadog_workload
1826
- pkg/cmd/drtprod/scripts/tpcc_init.sh
1927
- pkg/cmd/drtprod/scripts/generate_tpcc_run.sh
28+
- pkg/cmd/drtprod/scripts/populate_workload_keys.sh
2029
- artifacts/roachtest
2130
- artifacts/drtprod
2231

2332
targets:
2433
# crdb cluster specs
25-
- target_name: $CLUSTER
34+
- target_name: $CLUSTER cluster initialisation
35+
notify_progress: true
2636
steps:
2737
- command: create
2838
args:
@@ -31,7 +41,7 @@ targets:
3141
clouds: gce
3242
gce-managed: true
3343
gce-enable-multiple-stores: true
34-
gce-zones: "us-central1-a:30,us-central1-b:30,us-central1-c:30"
44+
gce-zones: "us-central1-a:$NODES_PER_ZONE,us-central1-b:$NODES_PER_ZONE,us-central1-c:$NODES_PER_ZONE"
3545
nodes: $CLUSTER_NODES
3646
gce-machine-type: n2-standard-16
3747
local-ssd: false
@@ -43,36 +53,38 @@ targets:
4353
lifetime: 8760h
4454
gce-image: "ubuntu-2204-jammy-v20250112"
4555
- command: sync
56+
skip_notification: true
4657
flags:
4758
clouds: gce
59+
- script: "pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller"
60+
skip_notification: true
4861
- command: stage
62+
skip_notification: true
4963
args:
5064
- $CLUSTER
5165
- release
52-
- v25.2.0-rc.1 # for libgeos
53-
- command: stage
54-
args:
55-
- $CLUSTER
56-
- cockroach
57-
- release-25.2.1-rc
66+
- $VERSION
5867
- script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
68+
skip_notification: true
5969
- command: start
6070
args:
6171
- $CLUSTER
6272
- "--binary"
6373
- "./cockroach"
74+
- "--env" # from defaults
75+
- "COCKROACH_TESTING_FORCE_RELEASE_BRANCH=true"
76+
- "--env" # from defaults
77+
- "COCKROACH_INTERNAL_DISABLE_METAMORPHIC_TESTING=true"
78+
- "--env" # for MMA test case scenario
79+
- "COCKROACH_ALLOW_MMA=true"
6480
flags:
6581
# add flag to set provisioned throughput on each store according to their cloud provider limits
6682
enable-fluent-sink: true
6783
store-count: 2
6884
args: --wal-failover=among-stores
6985
restart: false
7086
sql-port: 26257
71-
- command: run
72-
args:
73-
- $CLUSTER
74-
- --
75-
- "sudo systemctl unmask cron.service ; sudo systemctl enable cron.service ; echo \"crontab -l ; echo '@reboot sleep 100 && ~/cockroach.sh' | crontab -\" > t.sh ; sh t.sh ; rm t.sh"
87+
racks: $RACKS
7688
- command: sql
7789
args:
7890
- $CLUSTER:1
@@ -104,7 +116,8 @@ targets:
104116
- command: stage
105117
args:
106118
- $WORKLOAD_CLUSTER
107-
- cockroach
119+
- release
120+
- $VERSION
108121
- command: put
109122
args:
110123
- $WORKLOAD_CLUSTER
@@ -116,42 +129,42 @@ targets:
116129
- artifacts/drtprod
117130
- script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
118131
- target_name: post_tasks
132+
notify_progress: true
119133
dependent_targets:
120-
- $CLUSTER
134+
- $CLUSTER cluster initialisation
121135
- $WORKLOAD_CLUSTER
122136
steps:
123137
- script: rm
138+
skip_notification: true
124139
args:
125140
- -rf
126141
- certs-$CLUSTER
127-
- command: get
142+
- command: fetch-certs
143+
skip_notification: true
128144
args:
129145
- $CLUSTER:1
130-
- certs
131146
- certs-$CLUSTER
132147
- command: put
148+
skip_notification: true
133149
args:
134150
- $WORKLOAD_CLUSTER
135151
- certs-$CLUSTER
136152
- certs
137-
- command: ssh
138-
args:
139-
- $WORKLOAD_CLUSTER
140-
- --
141-
- chmod
142-
- 600
143-
- './certs/*'
144153
- script: "pkg/cmd/drtprod/scripts/tpcc_init.sh"
145154
args:
146155
- cct_tpcc
147156
- false
148157
flags:
149-
warehouses: 4000000
158+
partitions: $TOTAL_PARTITIONS
159+
replicate-static-columns: true
160+
partition-strategy: leases
161+
warehouses: $WAREHOUSES
150162
db: cct_tpcc
151163
- script: pkg/cmd/drtprod/scripts/populate_workload_keys.sh
152164
- target_name: tpcc_run
165+
notify_progress: true
153166
dependent_targets:
154-
- $CLUSTER
167+
- $CLUSTER cluster initialisation
155168
- $WORKLOAD_CLUSTER
156169
steps:
157170
- script: "pkg/cmd/drtprod/scripts/generate_tpcc_run.sh"
@@ -160,12 +173,11 @@ targets:
160173
- false
161174
flags:
162175
db: cct_tpcc
163-
warehouses: 4000000
164-
active-warehouses: 500000
165-
active-workers: 2000
166-
conns: 2000
167-
max-rate: 2500
168-
workers: 500000
176+
warehouses: $WAREHOUSES
177+
active-warehouses: 266666
178+
workers: 266666
179+
conns: 1000
180+
active-workers: 1000
169181
duration: 12h
170182
ramp: 1h
171183
wait: 0

pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22

3-
# Sets up datadog for the drt clusters.
3+
# Sets up disk staller support (dmsetup) for /mnt/data1.
44
# NOTE - This uses CLUSTER environment variable, if not set the script fails
55

66
if [ -z "${CLUSTER}" ]; then

0 commit comments

Comments
 (0)