Skip to content

Commit ca91a97

Browse files
craig[bot] and golgeek
committed
Merge #156821
156821: roachprod/roachtest: uniform storage capabilities r=DarrylWong a=golgeek Until now, each cloud provider implementation had its own capabilities with regard to storage options. GCE was the only first-class citizen with the most available options exposed in roachtest. This patch attempts to bridge the feature parity gap between the cloud providers (up to what's exposed by each provider), bringing support for the following options in roachprod and roachtest: - GCE: - local SSD - network disk size - network disk type (pd-standard, pd-ssd) - network disk count - RAID0 or multiple stores - AWS: - local SSD - network disk size - network disk throughput - network disk IOPS - NEW: network disk type (gp2, gp3, io1, io2, st1, sc1, standard) - NEW: network disk count - NEW: RAID0 or multiple stores - Azure: - local SSD - network disk size - NEW: network disk IOPS (ultra-disk only) - NEW: network disk type (standard-ssd, premium-ssd, premium-ssd-v2, ultra-disk) - NEW: network disk count - NEW: RAID0 or multiple stores - IBM: - network disk size - network disk IOPS - network disk type (general-purpose, 5iops-tier, 10iops-tier, custom) - network disk count - RAID0 or multiple stores This patch also splits the disk setup startup script snippets, with: - a provider-specific way of detecting the attached disks - common logic to mount, format and aggregate the disks This allows offering the following filesystems across the board in roachprod (and roachtest): - Ext4 - ZFS - XFS - F2FS (available for AWS, Azure and GCP, pending newer kernel for IBM) - Btrfs Epic: none Closes: #123775 Informs: #146661, #113869 Release note: None Co-authored-by: Ludovic Leroux <ludo.leroux@cockroachlabs.com>
2 parents 42643f5 + 92339d6 commit ca91a97

35 files changed

+2221
-916
lines changed

pkg/cmd/roachprod/cli/flags.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,8 @@ func initCreateCmdFlags(createCmd *cobra.Command) {
137137
"lifetime", "l", 12*time.Hour, "Lifetime of the cluster")
138138
createCmd.Flags().BoolVar(&createVMOpts.SSDOpts.UseLocalSSD,
139139
"local-ssd", true, "Use local SSD")
140-
createCmd.Flags().StringVar(&createVMOpts.SSDOpts.FileSystem,
141-
"filesystem", vm.Ext4, "The underlying file system(ext4/zfs). ext4 is used by default.")
140+
createCmd.Flags().StringVar((*string)(&createVMOpts.SSDOpts.FileSystem),
141+
"filesystem", string(vm.Ext4), "The underlying file system(ext4/zfs/xfs/f2fs/btrfs). ext4 is used by default.")
142142
createCmd.Flags().BoolVar(&createVMOpts.SSDOpts.NoExt4Barrier,
143143
"local-ssd-no-ext4-barrier", true,
144144
`Mount the local SSD with the "-o nobarrier" flag. Ignored if --local-ssd=false is specified.`)

pkg/cmd/roachtest/cluster.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1909,7 +1909,7 @@ func (c *clusterImpl) CreateSnapshot(
19091909
func (c *clusterImpl) ApplySnapshots(ctx context.Context, snapshots []vm.VolumeSnapshot) error {
19101910
opts := vm.VolumeCreateOpts{
19111911
Size: c.spec.VolumeSize,
1912-
Type: c.spec.GCE.VolumeType, // TODO(irfansharif): This is only applicable to GCE. Change that.
1912+
Type: c.spec.VolumeType,
19131913
Labels: map[string]string{
19141914
vm.TagUsage: "roachtest",
19151915
},

pkg/cmd/roachtest/roachtestflags/flags.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -523,10 +523,10 @@ var (
523523
Usage: `Override use of local SSD`,
524524
})
525525

526-
OverrideFilesystem string
526+
OverrideFilesystem vm.Filesystem
527527
_ = registerRunFlag(&OverrideFilesystem, FlagInfo{
528528
Name: "filesystem",
529-
Usage: `Override the underlying file system(ext4/zfs)`,
529+
Usage: `Override the underlying file system(ext4/zfs/xfs/f2fs/btrfs)`,
530530
})
531531

532532
OverrideNoExt4Barrier bool

pkg/cmd/roachtest/roachtestflags/manager.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"time"
1212

1313
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
14+
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
1415
"github.com/cockroachdb/errors"
1516
"github.com/spf13/pflag"
1617
)
@@ -74,6 +75,8 @@ func (m *manager) AddFlagsToCommand(cmd cmdID, cmdFlags *pflag.FlagSet) {
7475
cmdFlags.StringToStringVarP(p, f.Name, f.Shorthand, *p, usage)
7576
case *spec.Cloud:
7677
cmdFlags.VarP(&cloudValue{val: p}, f.Name, f.Shorthand, usage)
78+
case *vm.Filesystem:
79+
cmdFlags.VarP(&filesystemValue{val: p}, f.Name, f.Shorthand, usage)
7780
default:
7881
panic(fmt.Sprintf("unsupported pointer type %T", p))
7982
}
@@ -146,3 +149,26 @@ func (cv *cloudValue) Set(str string) error {
146149
*cv.val = val
147150
return nil
148151
}
152+
153+
type filesystemValue struct {
154+
val *vm.Filesystem
155+
}
156+
157+
var _ pflag.Value = (*filesystemValue)(nil)
158+
159+
func (fv *filesystemValue) String() string {
160+
return string(*fv.val)
161+
}
162+
163+
func (fv *filesystemValue) Type() string {
164+
return "string"
165+
}
166+
167+
func (fv *filesystemValue) Set(str string) error {
168+
val, err := vm.ParseFilesystemString(str)
169+
if err != nil {
170+
return err
171+
}
172+
*fv.val = val
173+
return nil
174+
}

pkg/cmd/roachtest/spec/cluster_spec.go

Lines changed: 77 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,11 @@ import (
2525
type fileSystemType int
2626

2727
const (
28-
// Since ext4 is the default of 0, it isn't being
29-
// used anywhere in the code. Therefore, it isn't
30-
// added as a const here since it is unused, and
31-
// leads to a lint error.
32-
33-
// Zfs file system.
34-
Zfs fileSystemType = 1
28+
Ext4 fileSystemType = iota
29+
Zfs
30+
Xfs
31+
F2fs
32+
Btrfs
3533

3634
// Extra labels added by roachtest
3735
RoachtestBranch = "roachtest-branch"
@@ -205,6 +203,8 @@ type ClusterSpec struct {
205203
SSDs int
206204
RAID0 bool
207205
VolumeSize int
206+
VolumeType string
207+
VolumeCount int
208208
LocalSSD LocalSSDSetting
209209
Geo bool
210210
Lifetime time.Duration
@@ -224,8 +224,6 @@ type ClusterSpec struct {
224224
GCE struct {
225225
MachineType string
226226
MinCPUPlatform string
227-
VolumeType string
228-
VolumeCount int // volume count is only supported for GCE. This can be moved up if we start supporting other clouds
229227
Zones string
230228
} `cloud:"gce"`
231229

@@ -242,13 +240,13 @@ type ClusterSpec struct {
242240
// Azure-specific arguments. These values apply only on clusters instantiated on Azure.
243241
Azure struct {
244242
Zones string
243+
// VolumeIOPS is the provisioned IOPS for ultra-disks.
244+
VolumeIOPS int
245245
} `cloud:"azure"`
246246
// IBM-specific arguments. These values apply only on clusters instantiated on IBM.
247247
IBM struct {
248248
MachineType string
249-
VolumeType string
250249
VolumeIOPS int
251-
VolumeCount int
252250
Zones string
253251
} `cloud:"ibm"`
254252
}
@@ -319,16 +317,24 @@ func awsMachineSupportsSSD(machineType string) bool {
319317

320318
func getAWSOpts(
321319
machineType string,
322-
volumeSize, ebsThroughput int,
320+
volumeSize, volumeCount, ebsThroughput int,
321+
volumeType string,
323322
ebsIOPS int,
324323
localSSD bool,
324+
RAID0 bool,
325325
useSpotVMs bool,
326326
bootDiskOnly bool,
327327
) vm.ProviderOpts {
328328
opts := aws.DefaultProviderOpts()
329329
if volumeSize != 0 {
330330
opts.DefaultEBSVolume.Disk.VolumeSize = volumeSize
331331
}
332+
if volumeType != "" {
333+
opts.DefaultEBSVolume.Disk.VolumeType = volumeType
334+
}
335+
if volumeCount != 0 {
336+
opts.EBSVolumeCount = volumeCount
337+
}
332338
if ebsIOPS != 0 {
333339
opts.DefaultEBSVolume.Disk.IOPs = ebsIOPS
334340
}
@@ -340,6 +346,7 @@ func getAWSOpts(
340346
} else {
341347
opts.MachineType = machineType
342348
}
349+
opts.UseMultipleDisks = !RAID0
343350
opts.UseSpot = useSpotVMs
344351
opts.BootDiskOnly = bootDiskOnly
345352
return opts
@@ -389,13 +396,31 @@ func getGCEOpts(
389396
return opts
390397
}
391398

392-
func getAzureOpts(machineType string, volumeSize int, bootDiskOnly bool) vm.ProviderOpts {
399+
func getAzureOpts(
400+
machineType string,
401+
volumeSize int,
402+
volumeType string,
403+
volumeCount int,
404+
volumeIOPS int,
405+
RAID0 bool,
406+
bootDiskOnly bool,
407+
) vm.ProviderOpts {
393408
opts := azure.DefaultProviderOpts()
394409
opts.MachineType = machineType
395410
if volumeSize != 0 {
396411
opts.NetworkDiskSize = int32(volumeSize)
397412
}
398413
opts.BootDiskOnly = bootDiskOnly
414+
if volumeType != "" {
415+
opts.NetworkDiskType = volumeType
416+
}
417+
if volumeCount != 0 {
418+
opts.NetworkDiskCount = volumeCount
419+
}
420+
if volumeIOPS != 0 {
421+
opts.UltraDiskIOPS = int64(volumeIOPS)
422+
}
423+
opts.UseMultipleDisks = !RAID0
399424
return opts
400425
}
401426

@@ -405,7 +430,7 @@ func getIBMOpts(
405430
volumeSize int,
406431
volumeType string,
407432
volumeIOPS int,
408-
extraVolumeCount int,
433+
volumeCount int,
409434
RAID0 bool,
410435
bootDiskOnly bool,
411436
) vm.ProviderOpts {
@@ -424,17 +449,10 @@ func getIBMOpts(
424449
}
425450

426451
// We reuse the parameters of the default data volume for extra volumes.
427-
opts.AttachedVolumes = make(ibm.IbmVolumeList, 0)
428-
if extraVolumeCount > 0 {
429-
for i := 0; i < extraVolumeCount; i++ {
430-
opts.AttachedVolumes = append(opts.AttachedVolumes, &ibm.IbmVolume{
431-
VolumeType: opts.DefaultVolume.VolumeType,
432-
VolumeSize: opts.DefaultVolume.VolumeSize,
433-
IOPS: opts.DefaultVolume.IOPS,
434-
})
435-
}
436-
opts.UseMultipleDisks = !RAID0
452+
if volumeCount != 0 {
453+
opts.AttachedVolumesCount = volumeCount
437454
}
455+
opts.UseMultipleDisks = !RAID0
438456
opts.BootDiskOnly = bootDiskOnly
439457

440458
return opts
@@ -594,18 +612,25 @@ func (s *ClusterSpec) RoachprodOpts(
594612
}
595613
}
596614

597-
if s.FileSystem == Zfs {
598-
if cloud != GCE && cloud != IBM {
599-
return vm.CreateOpts{}, nil, nil, "", errors.Errorf(
600-
"node creation with zfs file system not yet supported on %s", cloud,
601-
)
615+
switch s.FileSystem {
616+
case Ext4:
617+
// ext4 is the default, do nothing unless we randomly want to use zfs
618+
if s.RandomlyUseZfs {
619+
rng, _ := randutil.NewPseudoRand()
620+
if rng.Float64() <= 0.2 {
621+
createVMOpts.SSDOpts.FileSystem = vm.Zfs
622+
}
602623
}
624+
case Zfs:
603625
createVMOpts.SSDOpts.FileSystem = vm.Zfs
604-
} else if s.RandomlyUseZfs && (cloud == GCE || cloud == IBM) {
605-
rng, _ := randutil.NewPseudoRand()
606-
if rng.Float64() <= 0.2 {
607-
createVMOpts.SSDOpts.FileSystem = vm.Zfs
608-
}
626+
case Xfs:
627+
createVMOpts.SSDOpts.FileSystem = vm.Xfs
628+
case F2fs:
629+
createVMOpts.SSDOpts.FileSystem = vm.F2fs
630+
case Btrfs:
631+
createVMOpts.SSDOpts.FileSystem = vm.Btrfs
632+
default:
633+
return vm.CreateOpts{}, nil, nil, "", errors.Errorf("unknown file system type: %v", s.FileSystem)
609634
}
610635

611636
var workloadMachineType string
@@ -633,30 +658,34 @@ func (s *ClusterSpec) RoachprodOpts(
633658
var workloadProviderOpts vm.ProviderOpts
634659
switch cloud {
635660
case AWS:
636-
providerOpts = getAWSOpts(machineType, s.VolumeSize, s.AWS.VolumeThroughput, s.AWS.VolumeIOPS,
637-
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs, false)
638-
workloadProviderOpts = getAWSOpts(workloadMachineType, s.VolumeSize, s.AWS.VolumeThroughput, s.AWS.VolumeIOPS,
639-
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs, !s.WorkloadRequiresDisk)
661+
providerOpts = getAWSOpts(machineType, s.VolumeSize, s.VolumeCount, s.AWS.VolumeThroughput, s.VolumeType, s.AWS.VolumeIOPS,
662+
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.UseSpotVMs, false)
663+
workloadProviderOpts = getAWSOpts(workloadMachineType, s.VolumeSize, s.VolumeCount, s.AWS.VolumeThroughput, s.VolumeType, s.AWS.VolumeIOPS,
664+
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.UseSpotVMs, !s.WorkloadRequiresDisk)
640665
case GCE:
641666
providerOpts = getGCEOpts(machineType, s.VolumeSize, ssdCount,
642667
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
643-
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType,
644-
s.GCE.VolumeCount, s.UseSpotVMs, false,
668+
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.VolumeType,
669+
s.VolumeCount, s.UseSpotVMs, false,
645670
)
646671
workloadProviderOpts = getGCEOpts(workloadMachineType, s.VolumeSize, ssdCount,
647672
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
648-
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType,
649-
s.GCE.VolumeCount, s.UseSpotVMs, !s.WorkloadRequiresDisk,
673+
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.VolumeType,
674+
s.VolumeCount, s.UseSpotVMs, !s.WorkloadRequiresDisk,
650675
)
651676
case Azure:
652-
providerOpts = getAzureOpts(machineType, s.VolumeSize, false)
653-
workloadProviderOpts = getAzureOpts(workloadMachineType, s.VolumeSize, true)
677+
providerOpts = getAzureOpts(machineType,
678+
s.VolumeSize, s.VolumeType, s.VolumeCount, s.Azure.VolumeIOPS, s.RAID0, false,
679+
)
680+
workloadProviderOpts = getAzureOpts(workloadMachineType,
681+
s.VolumeSize, s.VolumeType, s.VolumeCount, s.Azure.VolumeIOPS, s.RAID0, true,
682+
)
654683
case IBM:
655-
providerOpts = getIBMOpts(machineType, s.TerminateOnMigration, s.VolumeSize,
656-
s.IBM.VolumeType, s.IBM.VolumeIOPS, s.IBM.VolumeCount, s.RAID0, false,
684+
providerOpts = getIBMOpts(machineType, s.TerminateOnMigration,
685+
s.VolumeSize, s.VolumeType, s.IBM.VolumeIOPS, s.VolumeCount, s.RAID0, false,
657686
)
658-
workloadProviderOpts = getIBMOpts(workloadMachineType, s.TerminateOnMigration, s.VolumeSize,
659-
s.IBM.VolumeType, s.IBM.VolumeIOPS, s.IBM.VolumeCount, s.RAID0, true,
687+
workloadProviderOpts = getIBMOpts(workloadMachineType, s.TerminateOnMigration,
688+
s.VolumeSize, s.VolumeType, s.IBM.VolumeIOPS, s.VolumeCount, s.RAID0, true,
660689
)
661690
}
662691

pkg/cmd/roachtest/spec/cluster_spec_test.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,22 @@ func TestClustersCompatible(t *testing.T) {
2525
t.Run("spec has different GCE spec with cloud as GCE", func(t *testing.T) {
2626
s1 := ClusterSpec{NodeCount: 5}
2727
s2 := ClusterSpec{NodeCount: 5}
28-
s1.GCE.VolumeType = "mock_volume1"
29-
s2.GCE.VolumeType = "mock_volume2"
28+
s1.VolumeType = "mock_volume1"
29+
s2.VolumeType = "mock_volume2"
3030
require.False(t, ClustersCompatible(s1, s2, GCE))
3131
})
3232
t.Run("spec has different GCE spec with cloud as AWS", func(t *testing.T) {
3333
s1 := ClusterSpec{NodeCount: 5}
3434
s2 := ClusterSpec{NodeCount: 5}
35-
s1.GCE.VolumeType = "mock_volume1"
36-
s2.GCE.VolumeType = "mock_volume2"
35+
s1.VolumeType = "mock_volume1"
36+
s2.VolumeType = "mock_volume2"
37+
require.False(t, ClustersCompatible(s1, s2, AWS))
38+
})
39+
t.Run("spec has different spec with cloud as AWS", func(t *testing.T) {
40+
s1 := ClusterSpec{NodeCount: 5}
41+
s2 := ClusterSpec{NodeCount: 5}
42+
s1.GCE.MinCPUPlatform = "mock_platform1"
43+
s2.GCE.MinCPUPlatform = "mock_platform2"
3744
require.True(t, ClustersCompatible(s1, s2, AWS))
3845
})
3946
}

0 commit comments

Comments
 (0)