Merge pull request #7992 from voelzmo/enh/concurrent-recommender

k8s-ci-robot · web-flow · commit d0a297372ad4 · 2025-05-21T01:18:34.000-07:00
Make VPA and Checkpoint updates concurrent
diff --git a/vertical-pod-autoscaler/common/flags.go b/vertical-pod-autoscaler/common/flags.go
@@ -39,8 +39,8 @@ type CommonFlags struct {
 func InitCommonFlags() *CommonFlags {
 	cf := &CommonFlags{}
 	flag.StringVar(&cf.KubeConfig, "kubeconfig", "", "Path to a kubeconfig. Only required if out-of-cluster.")
-	flag.Float64Var(&cf.KubeApiQps, "kube-api-qps", 5.0, "QPS limit when making requests to Kubernetes apiserver")
-	flag.Float64Var(&cf.KubeApiBurst, "kube-api-burst", 10.0, "QPS burst limit when making requests to Kubernetes apiserver")
+	flag.Float64Var(&cf.KubeApiQps, "kube-api-qps", 50.0, "QPS limit when making requests to Kubernetes apiserver")
+	flag.Float64Var(&cf.KubeApiBurst, "kube-api-burst", 100.0, "QPS burst limit when making requests to Kubernetes apiserver")
 	flag.BoolVar(&cf.EnableProfiling, "profiling", false, "Is debug/pprof endpoint enabled")
 	flag.StringVar(&cf.VpaObjectNamespace, "vpa-object-namespace", apiv1.NamespaceAll, "Specifies the namespace to search for VPA objects. Leave empty to include all namespaces. If provided, the garbage collector will only clean this namespace.")
 	flag.StringVar(&cf.IgnoredVpaObjectNamespaces, "ignored-vpa-object-namespaces", "", "A comma-separated list of namespaces to ignore when searching for VPA objects. Leave empty to avoid ignoring any namespaces. These namespaces will not be cleaned by the garbage collector.")
diff --git a/vertical-pod-autoscaler/docs/flags.md b/vertical-pod-autoscaler/docs/flags.md
@@ -16,8 +16,8 @@ This document is auto-generated from the flag definitions in the VPA admission-c
 | `--client-ca-file` | "/etc/tls-certs/caCert.pem" |                  Path to CA PEM file. |
 | `--feature-gates` |  |            A set of key=value pairs that describe feature gates for alpha/experimental features. Options are: |
 | `--ignored-vpa-object-namespaces` |  |   A comma-separated list of namespaces to ignore when searching for VPA objects. Leave empty to avoid ignoring any namespaces. These namespaces will not be cleaned by the garbage collector. |
-| `--kube-api-burst` | 10 |                   QPS burst limit when making requests to Kubernetes apiserver |
-| `--kube-api-qps` | 5 |                     QPS limit when making requests to Kubernetes apiserver |
+| `--kube-api-burst` | 100 |                   QPS burst limit when making requests to Kubernetes apiserver |
+| `--kube-api-qps` | 50 |                     QPS limit when making requests to Kubernetes apiserver |
 | `--kubeconfig` |  |                      Path to a kubeconfig. Only required if out-of-cluster. |
 | `--log-backtrace-at` | :0 |         when logging hits line file:N, emit a stack trace |
 | `--log-dir` |  |                         If non-empty, write log files in this directory (no effect when -logtostderr=true) |
@@ -73,8 +73,8 @@ This document is auto-generated from the flag definitions in the VPA recommender
 | `--history-resolution` | "1h" |                              Resolution at which Prometheus is queried for historical metrics |
 | `--humanize-memory` |  |                                        Convert memory values in recommendations to the highest appropriate SI unit with up to 2 decimal places for better readability. |
 | `--ignored-vpa-object-namespaces` |  |                   A comma-separated list of namespaces to ignore when searching for VPA objects. Leave empty to avoid ignoring any namespaces. These namespaces will not be cleaned by the garbage collector. |
-| `--kube-api-burst` | 10 |                                   QPS burst limit when making requests to Kubernetes apiserver |
-| `--kube-api-qps` | 5 |                                     QPS limit when making requests to Kubernetes apiserver |
+| `--kube-api-burst` | 100 |                                   QPS burst limit when making requests to Kubernetes apiserver |
+| `--kube-api-qps` | 50 |                                     QPS limit when making requests to Kubernetes apiserver |
 | `--kubeconfig` |  |                                      Path to a kubeconfig. Only required if out-of-cluster. |
 | `--leader-elect` |  |                                           Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability. |
 | `--leader-elect-lease-duration` | 15s |                   The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. This is only applicable if leader election is enabled. |
@@ -93,7 +93,7 @@ This document is auto-generated from the flag definitions in the VPA recommender
 | `--memory-histogram-decay-half-life` | 24h0m0s |              The amount of time it takes a historical memory usage sample to lose half of its weight. In other words, a fresh usage sample is twice as 'important' as one with age equal to the half life period. |
 | `--memory-saver` |  |                                           If true, only track pods which have an associated VPA |
 | `--metric-for-pod-labels` | "up{job=\"kubernetes-pods\"}" |                           Which metric to look for pod labels in metrics |
-| `--min-checkpoints` | 10 |                                    Minimum number of checkpoints to write per recommender's main loop |
+| `--min-checkpoints` | 10 |                                    Minimum number of checkpoints to write per recommender's main loop. WARNING: this flag is deprecated and doesn't have any effect. It will be removed in a future release. Refer to update-worker-count to influence the minimum number of checkpoints written per loop. |
 | `--one-output` |  |                                             If true, only write logs to their native severity level (vs also writing to each lower severity level; no effect when -logtostderr=true) |
 | `--oom-bump-up-ratio` | 1.2 |                                The memory bump up ratio when OOM occurred, default is 1.2. |
 | `--oom-min-bump-up-bytes` | 1.048576e+08 |                            The minimal increase of memory when OOM occurred in bytes, default is 100 * 1024 * 1024 |
@@ -121,6 +121,7 @@ This document is auto-generated from the flag definitions in the VPA recommender
 | `--storage` |  |                                         Specifies storage mode. Supported values: prometheus, checkpoint |
 | `--target-cpu-percentile` | 0.9 |                            CPU usage percentile that will be used as a base for CPU target recommendation. Doesn't affect CPU lower bound, CPU upper bound nor memory recommendations. |
 | `--target-memory-percentile` | 0.9 |                         Memory usage percentile that will be used as a base for memory target recommendation. Doesn't affect memory lower bound nor memory upper bound. |
+| `--update-worker-count` | 10 |                       Number of concurrent workers to update VPA recommendations and checkpoints. When increasing this setting, make sure the client-side rate limits (kube-api-qps and `kube-api-burst`) are either increased or turned off as well. Determines the minimum number of VPA checkpoints written per recommender loop. |
 | `--use-external-metrics` |  |                                   ALPHA.  Use an external metrics provider instead of metrics_server. |
 | `--username` |  |                                        The username used in the prometheus server basic auth |
 | `--v` | 4 | Set the log level verbosity |
@@ -142,8 +143,8 @@ This document is auto-generated from the flag definitions in the VPA updater cod
 | `--feature-gates` |  |                                     A set of key=value pairs that describe feature gates for alpha/experimental features. Options are: |
 | `--ignored-vpa-object-namespaces` |  |                            A comma-separated list of namespaces to ignore when searching for VPA objects. Leave empty to avoid ignoring any namespaces. These namespaces will not be cleaned by the garbage collector. |
 | `--in-recommendation-bounds-eviction-lifetime-threshold` | 12h0m0s |   Pods that live for at least that long can be evicted even if their request is within the [MinRecommended...MaxRecommended] range |
-| `--kube-api-burst` | 10 |                                            QPS burst limit when making requests to Kubernetes apiserver |
-| `--kube-api-qps` | 5 |                                              QPS limit when making requests to Kubernetes apiserver |
+| `--kube-api-burst` | 100 |                                            QPS burst limit when making requests to Kubernetes apiserver |
+| `--kube-api-qps` | 50 |                                              QPS limit when making requests to Kubernetes apiserver |
 | `--kubeconfig` |  |                                               Path to a kubeconfig. Only required if out-of-cluster. |
 | `--leader-elect` |  |                                                    Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability. |
 | `--leader-elect-lease-duration` | 15s |                            The duration that non-leader candidates will wait after observing a leadership renewal until attempting to acquire leadership of a led but unrenewed leader slot. This is effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. This is only applicable if leader election is enabled. |
diff --git a/vertical-pod-autoscaler/pkg/recommender/checkpoint/checkpoint_writer.go b/vertical-pod-autoscaler/pkg/recommender/checkpoint/checkpoint_writer.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"sync"
 	"time"
 
 	v1 "k8s.io/api/core/v1"
@@ -35,9 +36,9 @@ import (
 // CheckpointWriter persistently stores aggregated historical usage of containers
 // controlled by VPA objects. This state can be restored to initialize the model after restart.
 type CheckpointWriter interface {
-	// StoreCheckpoints writes at least minCheckpoints if there are more checkpoints to write.
+	// StoreCheckpoints writes checkpoints for at least `concurrentWorkers` VPAs (or for all of them, if there are fewer), even when ctx is already done. Failures are not returned to the caller.
 	// Checkpoints are written until ctx permits or all checkpoints are written.
-	StoreCheckpoints(ctx context.Context, now time.Time, minCheckpoints int) error
+	StoreCheckpoints(ctx context.Context, concurrentWorkers int)
 }
 
 type checkpointWriter struct {
@@ -76,48 +77,72 @@ func getVpasToCheckpoint(clusterVpas map[model.VpaID]*model.Vpa) []*model.Vpa {
 	return vpas
 }
 
-func (writer *checkpointWriter) StoreCheckpoints(ctx context.Context, now time.Time, minCheckpoints int) error {
+// processCheckpointUpdateForVPA serializes every aggregated container state of vpa and saves each as a checkpoint; failures are logged and skipped.
+func processCheckpointUpdateForVPA(vpa *model.Vpa, writer *checkpointWriter) {
+	now := time.Now()
+	for containerName, state := range buildAggregateContainerStateMap(vpa, writer.cluster, now) {
+		status, err := state.SaveToCheckpoint()
+		if err != nil {
+			klog.ErrorS(err, "Cannot serialize checkpoint", "vpa", klog.KRef(vpa.ID.Namespace, vpa.ID.VpaName), "container", containerName)
+			continue
+		}
+		checkpoint := vpa_types.VerticalPodAutoscalerCheckpoint{
+			ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-%s", vpa.ID.VpaName, containerName)},
+			Spec: vpa_types.VerticalPodAutoscalerCheckpointSpec{
+				ContainerName: containerName,
+				VPAObjectName: vpa.ID.VpaName,
+			},
+			Status: *status,
+		}
+		checkpointClient := writer.vpaCheckpointClient.VerticalPodAutoscalerCheckpoints(vpa.ID.Namespace)
+		if err := api_util.CreateOrUpdateVpaCheckpoint(checkpointClient, &checkpoint); err != nil {
+			klog.ErrorS(err, "Cannot save checkpoint for VPA", "vpa", klog.KRef(vpa.ID.Namespace, checkpoint.Spec.VPAObjectName), "container", checkpoint.Spec.ContainerName)
+			continue
+		}
+		klog.V(3).InfoS("Saved checkpoint for VPA", "vpa", klog.KRef(vpa.ID.Namespace, checkpoint.Spec.VPAObjectName), "container", checkpoint.Spec.ContainerName)
+		// now was captured before serialization, so this timestamp slightly predates the write itself.
+		vpa.CheckpointWritten = now
+	}
+}
+
+// StoreCheckpoints hands the VPAs from getVpasToCheckpoint to `concurrentWorkers` goroutines and blocks until all of them have exited; a worker stops once ctx is done, but only after finishing the VPA it picked up.
+func (writer *checkpointWriter) StoreCheckpoints(ctx context.Context, concurrentWorkers int) {
 	vpas := getVpasToCheckpoint(writer.cluster.VPAs())
+	// Buffer the channel for all VPAs so that queueing them below never blocks, even if workers stop early.
+	vpaCheckpointUpdates := make(chan *model.Vpa, len(vpas))
+
+	// Create a wait group to wait for all workers to finish
+	var wg sync.WaitGroup
+	// Start workers. Each worker finishes the VPA it picked up before checking for a cancelled context.
+	for i := 0; i < concurrentWorkers; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for vpaToCheckpoint := range vpaCheckpointUpdates {
+				processCheckpointUpdateForVPA(vpaToCheckpoint, writer)
+				select {
+				case <-ctx.Done():
+					return
+				default:
+				}
+			}
+		}()
+	}
+
+	// Send VPA Checkpoint updates to the workers
 	for _, vpa := range vpas {
+		vpaCheckpointUpdates <- vpa
+	}
 
-		// Draining ctx.Done() channel. ctx.Err() will be checked if timeout occurred, but minCheckpoints have
-		// to be written before return from this function.
-		select {
-		case <-ctx.Done():
-		default:
-		}
+	// Close the channel to signal workers to stop after draining the channel
+	close(vpaCheckpointUpdates)
 
-		if ctx.Err() != nil && minCheckpoints <= 0 {
-			return ctx.Err()
-		}
+	// Wait for all workers to finish; VPAs still buffered in the closed channel afterwards were not checkpointed.
+	wg.Wait()
 
-		aggregateContainerStateMap := buildAggregateContainerStateMap(vpa, writer.cluster, now)
-		for container, aggregatedContainerState := range aggregateContainerStateMap {
-			containerCheckpoint, err := aggregatedContainerState.SaveToCheckpoint()
-			if err != nil {
-				klog.ErrorS(err, "Cannot serialize checkpoint", "vpa", klog.KRef(vpa.ID.Namespace, vpa.ID.VpaName), "container", container)
-				continue
-			}
-			checkpointName := fmt.Sprintf("%s-%s", vpa.ID.VpaName, container)
-			vpaCheckpoint := vpa_types.VerticalPodAutoscalerCheckpoint{
-				ObjectMeta: metav1.ObjectMeta{Name: checkpointName},
-				Spec: vpa_types.VerticalPodAutoscalerCheckpointSpec{
-					ContainerName: container,
-					VPAObjectName: vpa.ID.VpaName,
-				},
-				Status: *containerCheckpoint,
-			}
-			err = api_util.CreateOrUpdateVpaCheckpoint(writer.vpaCheckpointClient.VerticalPodAutoscalerCheckpoints(vpa.ID.Namespace), &vpaCheckpoint)
-			if err != nil {
-				klog.ErrorS(err, "Cannot save checkpoint for VPA", "vpa", klog.KRef(vpa.ID.Namespace, vpaCheckpoint.Spec.VPAObjectName), "container", vpaCheckpoint.Spec.ContainerName)
-			} else {
-				klog.V(3).InfoS("Saved checkpoint for VPA", "vpa", klog.KRef(vpa.ID.Namespace, vpaCheckpoint.Spec.VPAObjectName), "container", vpaCheckpoint.Spec.ContainerName)
-				vpa.CheckpointWritten = now
-			}
-			minCheckpoints--
-		}
+	if remaining := len(vpaCheckpointUpdates); ctx.Err() != nil && remaining > 0 {
+		klog.V(0).InfoS("Failed to store all checkpoints within the configured `checkpoints-timeout`", "err", ctx.Err(), "remainingVPAs", remaining)
 	}
-	return nil
 }
 
 // Build the AggregateContainerState for the purpose of the checkpoint. This is an aggregation of state of all
diff --git a/vertical-pod-autoscaler/pkg/recommender/checkpoint/checkpoint_writer_test.go b/vertical-pod-autoscaler/pkg/recommender/checkpoint/checkpoint_writer_test.go
@@ -17,17 +17,26 @@ limitations under the License.
 package checkpoint
 
 import (
+	"bytes"
+	"context"
 	"fmt"
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/assert"
+	autoscalingv1 "k8s.io/api/autoscaling/v1"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/runtime"
+	core "k8s.io/client-go/testing"
+	"k8s.io/klog/v2"
+	klogtest "k8s.io/klog/v2/test"
 
 	vpa_types "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
+	fakeautoscalingv1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/client/clientset/versioned/typed/autoscaling.k8s.io/v1/fake"
 	"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/model"
-
-	"github.com/stretchr/testify/assert"
+	"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/utils/test"
 )
 
 // TODO: Extract these constants to a common test module.
@@ -171,3 +180,69 @@ func TestGetVpasToCheckpointSorts(t *testing.T) {
 	assert.Equal(t, genVpaID(2), result[2].ID)
 
 }
+
+// TestStoreCheckpointsMakesProgressEvenForCancelledContext checks that StoreCheckpoints, given an already-cancelled context, still writes the checkpoints of one VPA per worker and logs the context error.
+func TestStoreCheckpointsMakesProgressEvenForCancelledContext(t *testing.T) {
+	klogtest.InitKlog(t)
+	tmpLogBuffer := bytes.NewBuffer(nil)
+	klog.SetOutput(tmpLogBuffer)
+
+	concurrentWorkers := 2
+
+	// Cancel the context up front: each worker must still finish the one VPA it picks up before it notices the cancellation.
+	ctx, cancelFunc := context.WithCancel(context.Background())
+	cancelFunc()
+	clusterState := model.NewClusterState(testGcPeriod)
+
+	// Prepare ClusterState with 5 VPAs referencing Pods. NOTE(review): the VPA names container-1/container-2, while the pods below get container-0/container-1 — confirm the mismatch is intended.
+	vpaBuilder := test.VerticalPodAutoscaler().WithContainer("container-1").WithContainer("container-2").WithNamespace("test-namespace")
+
+	for i := 0; i < 5; i++ {
+		targetRef := &autoscalingv1.CrossVersionObjectReference{
+			Kind:       "Pod",
+			Name:       fmt.Sprintf("pod-%d", i),
+			APIVersion: "apps/v1",
+		}
+		labelSelector, _ := labels.Parse(fmt.Sprintf("app=pod-%d", i))
+		vpa := vpaBuilder.WithName(fmt.Sprintf("vpa-%d", i)).WithTargetRef(targetRef).Get()
+		err := clusterState.AddOrUpdateVpa(vpa, labelSelector)
+		assert.NoError(t, err)
+	}
+
+	// Prepare ClusterState with 5 pods that have 2 containers each; pod i carries the label app=pod-i used by VPA i's selector.
+	for i := 0; i < 5; i++ {
+		podID := model.PodID{
+			Namespace: "test-namespace",
+			PodName:   fmt.Sprintf("pod-%d", i),
+		}
+		podLabels := map[string]string{"app": fmt.Sprintf("pod-%d", i)}
+		clusterState.AddOrUpdatePod(podID, podLabels, v1.PodRunning)
+		for j := 0; j < 2; j++ {
+			containerID := model.ContainerID{
+				PodID:         podID,
+				ContainerName: fmt.Sprintf("container-%d", j),
+			}
+			err := clusterState.AddOrUpdateContainer(containerID, testRequest)
+			assert.NoError(t, err)
+		}
+	}
+
+	patchedCheckpoints := []string{}
+	checkpointClient := &fakeautoscalingv1.FakeAutoscalingV1{Fake: &core.Fake{}}
+	checkpointClient.Fake.AddReactor("patch", "verticalpodautoscalercheckpoints", func(action core.Action) (handled bool, ret runtime.Object, err error) {
+		patchAction := action.(core.PatchAction)
+		name := patchAction.GetName()
+		time.Sleep(2 * time.Millisecond) // Simulate some latency per patch call. NOTE(review): the context is cancelled up front, so no timeout actually elapses in this test.
+		patchedCheckpoints = append(patchedCheckpoints, name)
+
+		return true, nil, nil
+	})
+
+	writer := NewCheckpointWriter(clusterState, checkpointClient)
+	writer.StoreCheckpoints(ctx, concurrentWorkers)
+
+	// Because we have 2 concurrent workers, expect 2 VPAs to get processed. Each worker picks a VPA to process before checking if the context has been cancelled.
+	// Each VPA has 2 Containers, therefore we expect 4 Checkpoints to be written
+	assert.Equal(t, 4, len(patchedCheckpoints), "Expected 4 checkpoints to be written, but got %d", len(patchedCheckpoints))
+	assert.Contains(t, tmpLogBuffer.String(), "context canceled")
+}
diff --git a/vertical-pod-autoscaler/pkg/recommender/main.go b/vertical-pod-autoscaler/pkg/recommender/main.go
@@ -65,6 +65,7 @@ var (
 	address                = flag.String("address", ":8942", "The address to expose Prometheus metrics.")
 	storage                = flag.String("storage", "", `Specifies storage mode. Supported values: prometheus, checkpoint (default)`)
 	memorySaver            = flag.Bool("memory-saver", false, `If true, only track pods which have an associated VPA`)
+	updateWorkerCount      = flag.Int("update-worker-count", 10, "Number of concurrent workers to update VPA recommendations and checkpoints. When increasing this setting, make sure the client-side rate limits ('kube-api-qps' and 'kube-api-burst') are either increased or turned off as well. Determines the minimum number of VPA checkpoints written per recommender loop.") // No back quotes in usage text: package flag treats the first back-quoted word as the value placeholder and strips its quotes.
 )
 
 // Prometheus history provider flags
@@ -142,6 +143,11 @@ func main() {
 		klog.ErrorS(nil, "--vpa-object-namespace and --ignored-vpa-object-namespaces are mutually exclusive and can't be set together.")
 		os.Exit(255)
 	}
+
+	if *routines.MinCheckpointsPerRun != 10 { // Default value is 10
+		klog.InfoS("DEPRECATION WARNING: The 'min-checkpoints' flag is deprecated and has no effect. It will be removed in a future release.")
+	}
+
 	ctx := context.Background()
 
 	healthCheck := metrics.NewHealthCheck(*metricsFetcherInterval * 5)
@@ -284,6 +290,7 @@ func run(ctx context.Context, healthCheck *metrics.HealthCheck, commonFlag *comm
 		RecommendationPostProcessors: postProcessors,
 		CheckpointsGCInterval:        *checkpointsGCInterval,
 		UseCheckpoints:               useCheckpoints,
+		UpdateWorkerCount:            *updateWorkerCount,
 	}.Make()
 
 	promQueryTimeout, err := time.ParseDuration(*queryTimeout)
diff --git a/vertical-pod-autoscaler/pkg/recommender/routines/recommender.go b/vertical-pod-autoscaler/pkg/recommender/routines/recommender.go