Skip to content

Commit bf74f7a

Browse files
authored
Merge pull request #8136 from omerap12/add-metric-for-failed-in-place
Add metric for failed in-place update attempt
2 parents f302346 + b1ed5ce commit bf74f7a

File tree

3 files changed

+17
-3
lines changed

3 files changed

+17
-3
lines changed

vertical-pod-autoscaler/docs/features.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,4 +126,5 @@ VPA provides metrics to track in-place update operations:
126126
* `vpa_in_place_updatable_pods_total`: Number of pods matching in-place update criteria
127127
* `vpa_in_place_updated_pods_total`: Number of pods successfully updated in-place
128128
* `vpa_vpas_with_in_place_updatable_pods_total`: Number of VPAs with pods eligible for in-place updates
129-
* `vpa_vpas_with_in_place_updated_pods_total`: Number of VPAs with successfully in-place updated pods
129+
* `vpa_vpas_with_in_place_updated_pods_total`: Number of VPAs with successfully in-place updated pods
130+
* `vpa_updater_failed_in_place_update_attempts_total`: Number of failed attempts to update pods in-place

vertical-pod-autoscaler/pkg/updater/logic/updater.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ func (u *updater) RunOnce(ctx context.Context) {
284284
err := inPlaceLimiter.InPlaceUpdate(pod, vpa, u.eventRecorder)
285285
if err != nil {
286286
klog.V(0).InfoS("In-place update failed", "error", err, "pod", klog.KObj(pod))
287+
metrics_updater.RecordFailedInPlaceUpdate(vpaSize, "InPlaceUpdateError")
287288
continue
288289
}
289290
withInPlaceUpdated = true

vertical-pod-autoscaler/pkg/utils/metrics/updater/updater.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,21 @@ var (
108108
}, []string{"vpa_size_log2"},
109109
)
110110

111-
// TODO: Add metrics for failed in-place update attempts
111+
failedInPlaceUpdateAttempts = prometheus.NewCounterVec(
112+
prometheus.CounterOpts{
113+
Namespace: metricsNamespace,
114+
Name: "failed_in_place_update_attempts_total",
115+
Help: "Number of failed attempts to update Pods in-place.",
116+
}, []string{"vpa_size_log2", "reason"},
117+
)
112118

113119
functionLatency = metrics.CreateExecutionTimeMetric(metricsNamespace,
114120
"Time spent in various parts of VPA Updater main loop.")
115121
)
116122

117123
// Register initializes all metrics for VPA Updater
118124
func Register() {
119-
prometheus.MustRegister(controlledCount, evictableCount, evictedCount, vpasWithEvictablePodsCount, vpasWithEvictedPodsCount, inPlaceUpdatableCount, inPlaceUpdatedCount, vpasWithInPlaceUpdatablePodsCount, vpasWithInPlaceUpdatedPodsCount, functionLatency)
125+
prometheus.MustRegister(controlledCount, evictableCount, evictedCount, vpasWithEvictablePodsCount, vpasWithEvictedPodsCount, inPlaceUpdatableCount, inPlaceUpdatedCount, vpasWithInPlaceUpdatablePodsCount, vpasWithInPlaceUpdatedPodsCount, failedInPlaceUpdateAttempts, functionLatency)
120126
}
121127

122128
// NewExecutionTimer provides a timer for Updater's RunOnce execution
@@ -179,6 +185,12 @@ func AddInPlaceUpdatedPod(vpaSize int) {
179185
inPlaceUpdatedCount.WithLabelValues(strconv.Itoa(log2)).Inc()
180186
}
181187

188+
// RecordFailedInPlaceUpdate increments the counter of failed in-place update attempts, labeled with the log2 bucket of the given VPA size and the given reason.
189+
func RecordFailedInPlaceUpdate(vpaSize int, reason string) {
190+
log2 := metrics.GetVpaSizeLog2(vpaSize)
191+
failedInPlaceUpdateAttempts.WithLabelValues(strconv.Itoa(log2), reason).Inc()
192+
}
193+
182194
// Add increases the counter for the given VPA size
183195
func (g *SizeBasedGauge) Add(vpaSize int, value int) {
184196
log2 := metrics.GetVpaSizeLog2(vpaSize)

0 commit comments

Comments
 (0)