diff --git a/src/go/plugin/go.d/collector/mssql/collector.go b/src/go/plugin/go.d/collector/mssql/collector.go index 4c6f3fe5d60ceb..22aa0112206a22 100644 --- a/src/go/plugin/go.d/collector/mssql/collector.go +++ b/src/go/plugin/go.d/collector/mssql/collector.go @@ -7,6 +7,7 @@ import ( "database/sql" _ "embed" "errors" + "strings" "sync" "time" @@ -72,6 +73,26 @@ type Config struct { // Default: true - MSSQL Query Store may contain unmasked PII in query text QueryStoreFunctionEnabled *bool `yaml:"query_store_function_enabled,omitempty" json:"query_store_function_enabled"` + // DeadlockInfoFunctionEnabled controls whether the deadlock-info function is available + // Uses pointer to distinguish "unset" from explicit "false": + // - nil (unset): Apply default of true (enabled) + // - false: Explicitly disabled + // - true: Explicitly enabled + // Default: true + DeadlockInfoFunctionEnabled *bool `yaml:"deadlock_info_function_enabled,omitempty" json:"deadlock_info_function_enabled"` + + // ErrorInfoFunctionEnabled controls whether the error-info function is available + // Uses pointer to distinguish "unset" from explicit "false": + // - nil (unset): Apply default of true (enabled) + // - false: Explicitly disabled + // - true: Explicitly enabled + // Default: true + ErrorInfoFunctionEnabled *bool `yaml:"error_info_function_enabled,omitempty" json:"error_info_function_enabled"` + + // ErrorInfoSessionName sets the Extended Events session name for error-info + // Default: "netdata_errors" + ErrorInfoSessionName string `yaml:"error_info_session_name,omitempty" json:"error_info_session_name,omitempty"` + // TopQueriesLimit is the maximum number of queries to return TopQueriesLimit int `yaml:"top_queries_limit,omitempty" json:"top_queries_limit,omitempty"` } @@ -92,6 +113,30 @@ func (c *Config) GetQueryStoreFunctionEnabled() bool { return *c.QueryStoreFunctionEnabled } +// GetDeadlockInfoFunctionEnabled returns whether the deadlock-info function is enabled (default: true) 
+func (c *Config) GetDeadlockInfoFunctionEnabled() bool { + if c.DeadlockInfoFunctionEnabled == nil { + return true + } + return *c.DeadlockInfoFunctionEnabled +} + +// GetErrorInfoFunctionEnabled returns whether the error-info function is enabled (default: true) +func (c *Config) GetErrorInfoFunctionEnabled() bool { + if c.ErrorInfoFunctionEnabled == nil { + return true + } + return *c.ErrorInfoFunctionEnabled +} + +// GetErrorInfoSessionName returns the Extended Events session name for error-info. +func (c *Config) GetErrorInfoSessionName() string { + if strings.TrimSpace(c.ErrorInfoSessionName) == "" { + return "netdata_errors" + } + return c.ErrorInfoSessionName +} + type Collector struct { module.Base Config `yaml:",inline" json:""` diff --git a/src/go/plugin/go.d/collector/mssql/config_schema.json b/src/go/plugin/go.d/collector/mssql/config_schema.json index a31fb499af1d31..0b1da76af82524 100644 --- a/src/go/plugin/go.d/collector/mssql/config_schema.json +++ b/src/go/plugin/go.d/collector/mssql/config_schema.json @@ -50,6 +50,24 @@ "minimum": 1, "maximum": 5000, "default": 500 + }, + "deadlock_info_function_enabled": { + "title": "Enable Deadlock Info Function", + "description": "Enable the deadlock-info function. WARNING: query text may contain unmasked sensitive literals (PII). This function reads deadlock graphs from the system_health session and requires VIEW SERVER STATE. Grant with: GRANT VIEW SERVER STATE TO [netdata_user];", + "type": "boolean", + "default": true + }, + "error_info_function_enabled": { + "title": "Enable Error Info Function", + "description": "Enable the error-info function. WARNING: error messages and query text may contain unmasked sensitive literals (PII). 
This function reads from a user-managed Extended Events session and requires VIEW SERVER STATE.", + "type": "boolean", + "default": true + }, + "error_info_session_name": { + "title": "Error Info Session Name", + "description": "Name of the Extended Events session that captures error_reported events for error-info. The session must be created by an administrator and include a ring_buffer target.", + "type": "string", + "default": "netdata_errors" } }, "required": [ @@ -79,6 +97,15 @@ "query_store_time_window_days": { "ui:help": "Limits Query Store data to recent days. Lower values improve performance on busy servers." }, + "deadlock_info_function_enabled": { + "ui:help": "When enabled, the deadlock-info function becomes available in the Netdata dashboard. WARNING: query text may contain unmasked sensitive literals and requires VIEW SERVER STATE permission." + }, + "error_info_function_enabled": { + "ui:help": "When enabled, the error-info function becomes available in the Netdata dashboard. WARNING: error messages and query text may include sensitive literals." + }, + "error_info_session_name": { + "ui:help": "The Extended Events session must be created by an administrator and include a ring_buffer target capturing sqlserver.error_reported with sql_text action." 
+ }, "ui:flavour": "tabs", "ui:options": { "tabs": [ @@ -88,7 +115,8 @@ "update_every", "dsn", "timeout", - "vnode" + "vnode", + "deadlock_info_function_enabled" ] }, { diff --git a/src/go/plugin/go.d/collector/mssql/deadlock_info.go b/src/go/plugin/go.d/collector/mssql/deadlock_info.go new file mode 100644 index 00000000000000..1ae4e6c58d603c --- /dev/null +++ b/src/go/plugin/go.d/collector/mssql/deadlock_info.go @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package mssql + +import ( + "context" + "database/sql" + "encoding/xml" + "errors" + "fmt" + "sort" + "strconv" + "strings" + "time" + + mssqlDriver "github.com/microsoft/go-mssqldb" + + "github.com/netdata/netdata/go/plugins/pkg/funcapi" + "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" + "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/strmutil" +) + +const ( + deadlockInfoHelp = "Latest deadlock from the system_health Extended Events ring buffer. WARNING: query text may include unmasked sensitive literals; restrict dashboard access." 
+ deadlockParseErrorStatus = 561 +) + +const ( + deadlockIdxRowID = iota + deadlockIdxDeadlockID + deadlockIdxTimestamp + deadlockIdxProcessID + deadlockIdxSpid + deadlockIdxEcid + deadlockIdxIsVictim + deadlockIdxQueryText + deadlockIdxLockMode + deadlockIdxLockStatus + deadlockIdxWaitResource + deadlockIdxDatabase + deadlockColumnCount +) + +type mssqlDeadlockTxn struct { + processID string + spid string + ecid string + dbid string + queryText string + lockMode string + lockStatus string + waitResource string +} + +type mssqlDeadlockParseResult struct { + deadlockTime time.Time + transactions []*mssqlDeadlockTxn + victimProcessID string + parseErr error + found bool +} + +type mssqlDeadlockGraph struct { + XMLName xml.Name `xml:"deadlock"` + VictimList mssqlDeadlockVictimList `xml:"victim-list"` + ProcessList mssqlDeadlockProcessList `xml:"process-list"` + ResourceList mssqlDeadlockResourceList `xml:"resource-list"` +} + +type mssqlDeadlockResourceList struct { + Resources []mssqlDeadlockResource `xml:",any"` +} + +type mssqlDeadlockVictimList struct { + Victims []mssqlDeadlockVictim `xml:"victimProcess"` +} + +type mssqlDeadlockVictim struct { + ID string `xml:"id,attr"` +} + +type mssqlDeadlockProcessList struct { + Processes []mssqlDeadlockProcess `xml:"process"` +} + +type mssqlDeadlockProcess struct { + ID string `xml:"id,attr"` + SPID string `xml:"spid,attr"` + ECID string `xml:"ecid,attr"` + DBID string `xml:"dbid,attr"` + LockMode string `xml:"lockMode,attr"` + WaitResource string `xml:"waitresource,attr"` + InputBuf string `xml:"inputbuf"` +} + +type mssqlDeadlockResource struct { + XMLName xml.Name + DBID string `xml:"dbid,attr"` + OwnerList mssqlDeadlockOwnerList `xml:"owner-list"` + WaiterList mssqlDeadlockWaiterList `xml:"waiter-list"` +} + +type mssqlDeadlockOwnerList struct { + Owners []mssqlDeadlockResourceEntry `xml:"owner"` +} + +type mssqlDeadlockWaiterList struct { + Waiters []mssqlDeadlockResourceEntry `xml:"waiter"` +} + +type 
mssqlDeadlockResourceEntry struct { + ID string `xml:"id,attr"` + Mode string `xml:"mode,attr"` +} + +func (c *Collector) deadlockInfoParams(context.Context) ([]funcapi.ParamConfig, error) { + if !c.Config.GetDeadlockInfoFunctionEnabled() { + return nil, fmt.Errorf("deadlock-info function disabled in configuration") + } + return []funcapi.ParamConfig{}, nil +} + +func (c *Collector) collectDeadlockInfo(ctx context.Context) *module.FunctionResponse { + if !c.Config.GetDeadlockInfoFunctionEnabled() { + return &module.FunctionResponse{ + Status: 503, + Message: "deadlock-info function has been disabled in configuration. " + + "To enable, set deadlock_info_function_enabled: true in the MSSQL collector config.", + } + } + + deadlockTime, deadlockXML, err := c.queryLatestDeadlock(ctx) + if err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return c.deadlockInfoResponse(504, "deadlock query timed out", nil) + } + if isDeadlockPermissionError(err) { + return c.deadlockInfoResponse(403, deadlockPermissionMessage(), nil) + } + c.Warningf("deadlock-info: query failed: %v", err) + return c.deadlockInfoResponse(500, fmt.Sprintf("deadlock query failed: %v", err), nil) + } + + if deadlockXML == "" { + return c.deadlockInfoResponse(200, "no deadlock found in system_health ring buffer", nil) + } + + dbNames, dbErr := c.queryDatabaseNames(ctx) + if dbErr != nil { + c.Debugf("deadlock-info: database name mapping failed: %v", dbErr) + dbNames = map[int]string{} + } + + parseRes := parseDeadlockGraph(deadlockXML, deadlockTime) + if parseRes.parseErr != nil { + c.Warningf("deadlock-info: parse failed: %v", parseRes.parseErr) + return c.deadlockInfoResponse(deadlockParseErrorStatus, "deadlock graph could not be parsed", nil) + } + if !parseRes.found { + return c.deadlockInfoResponse(200, "no deadlock found in system_health ring buffer", nil) + } + + deadlockID := generateDeadlockID(parseRes.deadlockTime) + rows := buildDeadlockRows(parseRes, deadlockID, dbNames) + if 
len(rows) == 0 { + return c.deadlockInfoResponse(200, "deadlock detected but no processes could be parsed", nil) + } + + return c.deadlockInfoResponse(200, "latest detected deadlock", rows) +} + +func (c *Collector) deadlockInfoResponse(status int, message string, data [][]any) *module.FunctionResponse { + if data == nil { + data = make([][]any, 0) + } + return &module.FunctionResponse{ + Status: status, + Help: deadlockInfoHelp, + Message: message, + Columns: c.buildDeadlockColumns(), + Data: data, + DefaultSortColumn: "timestamp", + } +} + +func (c *Collector) queryLatestDeadlock(ctx context.Context) (time.Time, string, error) { + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + var deadlockTime sql.NullTime + var deadlockXML sql.NullString + err := c.db.QueryRowContext(qctx, querySystemHealthLatestDeadlock).Scan(&deadlockTime, &deadlockXML) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return time.Time{}, "", nil + } + return time.Time{}, "", err + } + + if !deadlockXML.Valid || strings.TrimSpace(deadlockXML.String) == "" { + return time.Time{}, "", nil + } + + if deadlockTime.Valid { + return deadlockTime.Time, deadlockXML.String, nil + } + return time.Now().UTC(), deadlockXML.String, nil +} + +func (c *Collector) queryDatabaseNames(ctx context.Context) (map[int]string, error) { + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + rows, err := c.db.QueryContext(qctx, queryDatabaseNamesByID) + if err != nil { + return nil, err + } + defer rows.Close() + + names := make(map[int]string) + for rows.Next() { + var id int + var name string + if err := rows.Scan(&id, &name); err != nil { + return nil, err + } + names[id] = name + } + if err := rows.Err(); err != nil { + return nil, err + } + return names, nil +} + +func (c *Collector) buildDeadlockColumns() map[string]any { + const ( + ftString = funcapi.FieldTypeString + ftInteger = funcapi.FieldTypeInteger + ftTimestamp = 
funcapi.FieldTypeTimestamp + + trNone = funcapi.FieldTransformNone + trNumber = funcapi.FieldTransformNumber + trDatetime = funcapi.FieldTransformDatetime + + visValue = funcapi.FieldVisualValue + visPill = funcapi.FieldVisualPill + + sortAsc = funcapi.FieldSortAscending + sortDesc = funcapi.FieldSortDescending + + summaryCount = funcapi.FieldSummaryCount + summaryMax = funcapi.FieldSummaryMax + + filterMulti = funcapi.FieldFilterMultiselect + filterRange = funcapi.FieldFilterRange + ) + + return map[string]any{ + "row_id": funcapi.Column{ + Index: deadlockIdxRowID, + Name: "Row ID", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + UniqueKey: true, + Visible: false, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "deadlock_id": funcapi.Column{ + Index: deadlockIdxDeadlockID, + Name: "Deadlock ID", + Type: ftString, + Visualization: visValue, + Sort: sortDesc, + Sortable: true, + Sticky: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "timestamp": funcapi.Column{ + Index: deadlockIdxTimestamp, + Name: "Timestamp", + Type: ftTimestamp, + Units: "", + Visualization: visValue, + Sort: sortDesc, + Sortable: true, + Sticky: true, + Summary: summaryMax, + Filter: filterRange, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trDatetime, + }, + }.BuildColumn(), + "process_id": funcapi.Column{ + Index: deadlockIdxProcessID, + Name: "Process ID", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Sticky: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "spid": funcapi.Column{ + Index: deadlockIdxSpid, + Name: "SPID", + Type: ftInteger, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + 
Sticky: false, + Summary: summaryCount, + Filter: filterRange, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNumber, + }, + }.BuildColumn(), + "ecid": funcapi.Column{ + Index: deadlockIdxEcid, + Name: "ECID", + Type: ftInteger, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Sticky: false, + Summary: summaryCount, + Filter: filterRange, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNumber, + }, + }.BuildColumn(), + "is_victim": funcapi.Column{ + Index: deadlockIdxIsVictim, + Name: "Victim", + Type: ftString, + Visualization: visPill, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "query_text": funcapi.Column{ + Index: deadlockIdxQueryText, + Name: "Query", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: false, + Sticky: false, + Summary: summaryCount, + Filter: filterMulti, + FullWidth: true, + Wrap: true, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "lock_mode": funcapi.Column{ + Index: deadlockIdxLockMode, + Name: "Lock Mode", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Sticky: false, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "lock_status": funcapi.Column{ + Index: deadlockIdxLockStatus, + Name: "Lock Status", + Type: ftString, + Visualization: visPill, + Sort: sortAsc, + Sortable: true, + Sticky: false, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "wait_resource": funcapi.Column{ + Index: deadlockIdxWaitResource, + Name: "Wait Resource", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: false, + Sticky: false, + 
Summary: summaryCount, + Filter: filterMulti, + FullWidth: true, + Wrap: true, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "database": funcapi.Column{ + Index: deadlockIdxDatabase, + Name: "Database", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Sticky: false, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + } +} + +func parseDeadlockGraph(deadlockXML string, deadlockTime time.Time) mssqlDeadlockParseResult { + now := time.Now().UTC() + result := mssqlDeadlockParseResult{ + deadlockTime: now, + found: strings.TrimSpace(deadlockXML) != "", + } + if result.found && !deadlockTime.IsZero() { + result.deadlockTime = deadlockTime.UTC() + } + + if !result.found { + return result + } + + var graph mssqlDeadlockGraph + if err := xml.Unmarshal([]byte(deadlockXML), &graph); err != nil { + result.parseErr = fmt.Errorf("failed to parse deadlock XML: %w", err) + return result + } + + if len(graph.VictimList.Victims) > 0 { + result.victimProcessID = strings.TrimSpace(graph.VictimList.Victims[0].ID) + } + + txnByID := make(map[string]*mssqlDeadlockTxn) + ensureTxn := func(id string) *mssqlDeadlockTxn { + if id == "" { + return &mssqlDeadlockTxn{} + } + if txn, ok := txnByID[id]; ok { + return txn + } + txn := &mssqlDeadlockTxn{processID: id} + txnByID[id] = txn + return txn + } + + for _, proc := range graph.ProcessList.Processes { + processID := strings.TrimSpace(proc.ID) + if processID == "" { + continue + } + txn := ensureTxn(processID) + txn.spid = strings.TrimSpace(proc.SPID) + txn.ecid = strings.TrimSpace(proc.ECID) + txn.dbid = strings.TrimSpace(proc.DBID) + txn.queryText = strings.TrimSpace(proc.InputBuf) + txn.lockMode = strings.TrimSpace(proc.LockMode) + txn.waitResource = strings.TrimSpace(proc.WaitResource) + if txn.waitResource != "" { + txn.lockStatus = "WAITING" + } else if 
txn.lockStatus == "" { + txn.lockStatus = "GRANTED" + } + } + + for _, resource := range graph.ResourceList.Resources { + resourceDBID := strings.TrimSpace(resource.DBID) + for _, owner := range resource.OwnerList.Owners { + id := strings.TrimSpace(owner.ID) + if id == "" { + continue + } + txn := ensureTxn(id) + if txn.dbid == "" && resourceDBID != "" { + txn.dbid = resourceDBID + } + if txn.lockStatus != "WAITING" { + if txn.lockStatus == "" { + txn.lockStatus = "GRANTED" + } + mode := strings.TrimSpace(owner.Mode) + if mode != "" { + txn.lockMode = mode + } + } + } + for _, waiter := range resource.WaiterList.Waiters { + id := strings.TrimSpace(waiter.ID) + if id == "" { + continue + } + txn := ensureTxn(id) + if txn.dbid == "" && resourceDBID != "" { + txn.dbid = resourceDBID + } + txn.lockStatus = "WAITING" + mode := strings.TrimSpace(waiter.Mode) + if mode != "" { + txn.lockMode = mode + } + } + } + + if len(txnByID) == 0 { + result.parseErr = fmt.Errorf("deadlock graph detected but no processes could be parsed") + return result + } + + result.transactions = make([]*mssqlDeadlockTxn, 0, len(txnByID)) + for _, txn := range txnByID { + if txn.processID == "" { + continue + } + if txn.lockStatus == "" { + if strings.TrimSpace(txn.waitResource) != "" { + txn.lockStatus = "WAITING" + } else { + txn.lockStatus = "GRANTED" + } + } + result.transactions = append(result.transactions, txn) + } + + sort.Slice(result.transactions, func(i, j int) bool { + return result.transactions[i].processID < result.transactions[j].processID + }) + + if len(result.transactions) == 0 { + result.parseErr = fmt.Errorf("deadlock graph detected but no valid processes could be parsed") + } + + return result +} + +func buildDeadlockRows(parseRes mssqlDeadlockParseResult, deadlockID string, dbNames map[int]string) [][]any { + timestamp := parseRes.deadlockTime.UTC().Format(time.RFC3339Nano) + rows := make([][]any, 0, len(parseRes.transactions)) + + for _, txn := range parseRes.transactions { 
+ processID := strings.TrimSpace(txn.processID) + if processID == "" { + continue + } + + spid := parseOptionalInt(txn.spid) + ecid := parseOptionalInt(txn.ecid) + dbidInt, hasDBID := parseIntString(txn.dbid) + + var database any + if hasDBID { + if name, ok := dbNames[dbidInt]; ok { + database = name + } + } + + isVictim := "false" + if parseRes.victimProcessID != "" && processID == parseRes.victimProcessID { + isVictim = "true" + } + + queryText := strmutil.TruncateText(strings.TrimSpace(txn.queryText), maxQueryTextLength) + lockMode := strings.TrimSpace(txn.lockMode) + lockStatus := strings.TrimSpace(txn.lockStatus) + waitResource := strmutil.TruncateText(strings.TrimSpace(txn.waitResource), maxQueryTextLength) + + row := make([]any, deadlockColumnCount) + row[deadlockIdxRowID] = fmt.Sprintf("%s:%s", deadlockID, processID) + row[deadlockIdxDeadlockID] = deadlockID + row[deadlockIdxTimestamp] = timestamp + row[deadlockIdxProcessID] = processID + row[deadlockIdxSpid] = spid + row[deadlockIdxEcid] = ecid + row[deadlockIdxIsVictim] = isVictim + row[deadlockIdxQueryText] = queryText + row[deadlockIdxLockMode] = lockMode + row[deadlockIdxLockStatus] = lockStatus + row[deadlockIdxWaitResource] = waitResource + row[deadlockIdxDatabase] = database + + rows = append(rows, row) + } + + return rows +} + +func parseIntString(s string) (int, bool) { + s = strings.TrimSpace(s) + if s == "" { + return 0, false + } + n, err := strconv.Atoi(s) + if err != nil { + return 0, false + } + return n, true +} + +func parseOptionalInt(s string) any { + if n, ok := parseIntString(s); ok { + return n + } + return nil +} + +func generateDeadlockID(t time.Time) string { + if t.IsZero() { + t = time.Now().UTC() + } + t = t.UTC() + micros := t.Nanosecond() / 1000 + return t.Format("20060102150405") + fmt.Sprintf("%06d", micros) +} + +func isDeadlockPermissionError(err error) bool { + var sqlErr mssqlDriver.Error + if errors.As(err, &sqlErr) { + if sqlErr.Number == 297 || sqlErr.Number == 229 { 
+ return true + } + if permissionMessage := strings.ToLower(sqlErr.Message); permissionMessage != "" { + if strings.Contains(permissionMessage, "view server state") || + strings.Contains(permissionMessage, "permission") || + strings.Contains(permissionMessage, "denied") { + return true + } + } + } + + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "view server state") || + strings.Contains(msg, "permission") || + strings.Contains(msg, "denied") +} + +func deadlockPermissionMessage() string { + return "deadlock info requires VIEW SERVER STATE permission. Grant with: GRANT VIEW SERVER STATE TO [netdata_user];" +} diff --git a/src/go/plugin/go.d/collector/mssql/deadlock_info_test.go b/src/go/plugin/go.d/collector/mssql/deadlock_info_test.go new file mode 100644 index 00000000000000..cc2885d8219914 --- /dev/null +++ b/src/go/plugin/go.d/collector/mssql/deadlock_info_test.go @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package mssql + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + mssqlDriver "github.com/microsoft/go-mssqldb" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestConfig_GetDeadlockInfoFunctionEnabled(t *testing.T) { + tests := []struct { + name string + cfg Config + want bool + }{ + { + name: "default enabled when unset", + cfg: Config{}, + want: true, + }, + { + name: "explicitly enabled", + cfg: Config{ + DeadlockInfoFunctionEnabled: boolPtr(true), + }, + want: true, + }, + { + name: "explicitly disabled", + cfg: Config{ + DeadlockInfoFunctionEnabled: boolPtr(false), + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, tt.cfg.GetDeadlockInfoFunctionEnabled()) + }) + } +} + +func TestConfig_GetErrorInfoFunctionEnabled(t *testing.T) { + tests := []struct { + name string + cfg Config + want bool + }{ + { + name: "default enabled when unset", + 
cfg: Config{}, + want: true, + }, + { + name: "explicitly enabled", + cfg: Config{ + ErrorInfoFunctionEnabled: boolPtr(true), + }, + want: true, + }, + { + name: "explicitly disabled", + cfg: Config{ + ErrorInfoFunctionEnabled: boolPtr(false), + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, tt.cfg.GetErrorInfoFunctionEnabled()) + }) + } +} + +func TestConfig_GetErrorInfoSessionName(t *testing.T) { + tests := []struct { + name string + cfg Config + want string + }{ + { + name: "default session name", + cfg: Config{}, + want: "netdata_errors", + }, + { + name: "explicit session name", + cfg: Config{ + ErrorInfoSessionName: "custom_errors", + }, + want: "custom_errors", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, tt.cfg.GetErrorInfoSessionName()) + }) + } +} + +func TestParseDeadlockGraph_WithDeadlock(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 123456000, time.UTC) + res := parseDeadlockGraph(sampleDeadlockGraph, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + require.Equal(t, now.UTC(), res.deadlockTime) + require.Equal(t, "process1", res.victimProcessID) + require.Len(t, res.transactions, 2) + + txn1 := findTxn(res.transactions, "process1") + txn2 := findTxn(res.transactions, "process2") + + require.NotNil(t, txn1) + require.NotNil(t, txn2) + + assert.Equal(t, "WAITING", txn1.lockStatus) + assert.Equal(t, "X", txn1.lockMode) + assert.Contains(t, txn1.queryText, "deadlock_a") + + assert.Equal(t, "WAITING", txn2.lockStatus) + assert.Equal(t, "X", txn2.lockMode) + assert.Contains(t, txn2.queryText, "deadlock_b") +} + +func TestParseDeadlockGraph_ThreeWayDeadlock(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 222000000, time.UTC) + res := parseDeadlockGraph(sampleDeadlockGraphThreeWay, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + 
require.Equal(t, "process2", res.victimProcessID)
+	require.Len(t, res.transactions, 3)
+
+	deadlockID := generateDeadlockID(now)
+	rows := buildDeadlockRows(res, deadlockID, map[int]string{5: "netdata"})
+	require.Len(t, rows, 3)
+
+	victimCount := 0
+	for _, row := range rows {
+		assert.Equal(t, deadlockID, row[deadlockIdxDeadlockID])
+		assert.True(t, strings.HasPrefix(row[deadlockIdxRowID].(string), deadlockID+":"))
+		if row[deadlockIdxIsVictim] == "true" {
+			victimCount++
+		}
+	}
+	assert.Equal(t, 1, victimCount)
+}
+
+func TestParseDeadlockGraph_WaitingWinsOverOwner(t *testing.T) {
+	now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC)
+	res := parseDeadlockGraph(sampleDeadlockOwnerAfterWaiter, now)
+
+	require.True(t, res.found)
+	require.NoError(t, res.parseErr)
+
+	txn := findTxn(res.transactions, "process1")
+	require.NotNil(t, txn)
+	assert.Equal(t, "WAITING", txn.lockStatus)
+	assert.Equal(t, "X", txn.lockMode)
+}
+
+func TestParseDeadlockGraph_NoDeadlock(t *testing.T) {
+	now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC)
+	res := parseDeadlockGraph("", now)
+
+	assert.False(t, res.found)
+	assert.NoError(t, res.parseErr)
+	assert.Len(t, res.transactions, 0)
+}
+
+func TestParseDeadlockGraph_Malformed(t *testing.T) {
+	now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC)
+	res := parseDeadlockGraph("<deadlock><victim-list>", now)
+
+	assert.True(t, res.found)
+	assert.Error(t, res.parseErr)
+}
+
+func TestCollectDeadlockInfo_ParseError(t *testing.T) {
+	db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp))
+	require.NoError(t, err)
+	defer db.Close()
+
+	deadlockTime := time.Date(2026, time.January, 25, 12, 34, 56, 0, time.UTC)
+	deadlockRows := sqlmock.NewRows([]string{"deadlock_time", "deadlock_xml"}).
+		AddRow(deadlockTime, "<deadlock><process-list>")
+	mock.ExpectQuery("WITH xevents").WillReturnRows(deadlockRows)
+
+	dbNameRows := sqlmock.NewRows([]string{"database_id", "name"})
+	mock.ExpectQuery("SELECT\\s+database_id").WillReturnRows(dbNameRows)
+
+	c := New()
+	c.db = db
+
+	resp := c.collectDeadlockInfo(context.Background())
+	require.Equal(t, deadlockParseErrorStatus, resp.Status)
+	assert.Contains(t, strings.ToLower(resp.Message), "could not be parsed")
+	require.NoError(t, mock.ExpectationsWereMet())
+}
+
+func TestCollectDeadlockInfo_QueryError(t *testing.T) {
+	db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp))
+	require.NoError(t, err)
+	defer db.Close()
+
+	mock.ExpectQuery("WITH xevents").
+		WillReturnError(errors.New("boom"))
+
+	c := New()
+	c.db = db
+
+	resp := c.collectDeadlockInfo(context.Background())
+	require.Equal(t, 500, resp.Status)
+	assert.Contains(t, strings.ToLower(resp.Message), "deadlock query failed")
+	require.NoError(t, mock.ExpectationsWereMet())
+}
+
+func TestBuildDeadlockRows(t *testing.T) {
+	now := time.Date(2026, time.January, 25, 12, 0, 0, 654321000, time.UTC)
+	res := parseDeadlockGraph(sampleDeadlockGraph, now)
+	require.NoError(t, res.parseErr)
+
+	deadlockID := generateDeadlockID(now)
+	dbNames := map[int]string{5: "netdata"}
+	rows := buildDeadlockRows(res, deadlockID, dbNames)
+
+	require.Len(t, rows, 2)
+
+	row := rows[0]
+	assert.Equal(t, deadlockID, row[deadlockIdxDeadlockID])
+	assert.Equal(t, deadlockID+":"+row[deadlockIdxProcessID].(string), row[deadlockIdxRowID])
+	assert.Equal(t, "netdata", row[deadlockIdxDatabase])
+}
+
+func TestDeadlockPermissionErrorDetection(t *testing.T) {
+	err := mssqlDriver.Error{Number: 297, Message: "VIEW SERVER STATE permission was denied"}
+	assert.True(t, isDeadlockPermissionError(err))
+}
+
+func TestCollectDeadlockInfo_PermissionDenied(t *testing.T) {
+	db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp))
+	require.NoError(t, err)
+
defer db.Close() + + mock.ExpectQuery("WITH xevents"). + WillReturnError(mssqlDriver.Error{Number: 297, Message: "VIEW SERVER STATE permission was denied"}) + + c := New() + c.db = db + + resp := c.collectDeadlockInfo(context.Background()) + require.Equal(t, 403, resp.Status) + assert.Contains(t, strings.ToLower(resp.Message), "view server state") + require.NoError(t, mock.ExpectationsWereMet()) +} + +func TestCollectDeadlockInfo_Disabled(t *testing.T) { + c := New() + c.Config.DeadlockInfoFunctionEnabled = boolPtr(false) + + resp := c.collectDeadlockInfo(context.Background()) + require.Equal(t, 503, resp.Status) + assert.Contains(t, strings.ToLower(resp.Message), "disabled") +} + +func TestCollectDeadlockInfo_Timeout(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp)) + require.NoError(t, err) + defer db.Close() + + mock.ExpectQuery("WITH xevents"). + WillReturnError(context.DeadlineExceeded) + + c := New() + c.db = db + + resp := c.collectDeadlockInfo(context.Background()) + require.Equal(t, 504, resp.Status) + assert.Contains(t, strings.ToLower(resp.Message), "timed out") + require.NoError(t, mock.ExpectationsWereMet()) +} + +func TestCollectDeadlockInfo_Success(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp)) + require.NoError(t, err) + defer db.Close() + + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + + mock.ExpectQuery("WITH xevents"). + WillReturnRows( + sqlmock.NewRows([]string{"deadlock_time", "deadlock_xml"}). + AddRow(now, sampleDeadlockGraph), + ) + + mock.ExpectQuery("SELECT database_id, name"). + WillReturnRows( + sqlmock.NewRows([]string{"database_id", "name"}). 
+			AddRow(5, "netdata"),
+		)
+
+	c := New()
+	c.db = db
+
+	resp := c.collectDeadlockInfo(context.Background())
+	require.Equal(t, 200, resp.Status)
+	require.NotEmpty(t, resp.Data)
+	require.NoError(t, mock.ExpectationsWereMet())
+}
+
+func findTxn(txns []*mssqlDeadlockTxn, id string) *mssqlDeadlockTxn {
+	for _, txn := range txns {
+		if txn.processID == id {
+			return txn
+		}
+	}
+	return nil
+}
+
+func boolPtr(v bool) *bool { return &v }
+
+const sampleDeadlockGraph = `
+<deadlock>
+ <victim-list>
+  <victimProcess id="process1"/>
+ </victim-list>
+ <process-list>
+  <process id="process1" spid="51" ecid="0" dbid="5" lockMode="X" waitresource="KEY: 5:72057594046185472 (8194443284a0)">
+   <inputbuf>UPDATE dbo.deadlock_a SET value = value + 1 WHERE id = 1</inputbuf>
+  </process>
+  <process id="process2" spid="52" ecid="0" dbid="5" lockMode="X" waitresource="KEY: 5:72057594046251008 (8194443284a0)">
+   <inputbuf>UPDATE dbo.deadlock_b SET value = value + 1 WHERE id = 1</inputbuf>
+  </process>
+ </process-list>
+ <resource-list>
+  <keylock dbid="5">
+   <owner-list>
+    <owner id="process2" mode="X"/>
+   </owner-list>
+   <waiter-list>
+    <waiter id="process1" mode="X"/>
+   </waiter-list>
+  </keylock>
+  <keylock dbid="5">
+   <owner-list>
+    <owner id="process1" mode="X"/>
+   </owner-list>
+   <waiter-list>
+    <waiter id="process2" mode="X"/>
+   </waiter-list>
+  </keylock>
+ </resource-list>
+</deadlock>`
+
+const sampleDeadlockOwnerAfterWaiter = `
+<deadlock>
+ <victim-list>
+  <victimProcess id="process1"/>
+ </victim-list>
+ <process-list>
+  <process id="process1" spid="51" ecid="0" dbid="5">
+   <inputbuf>UPDATE dbo.deadlock_a SET value = value + 1 WHERE id = 1</inputbuf>
+  </process>
+ </process-list>
+ <resource-list>
+  <keylock dbid="5">
+   <owner-list>
+    <owner id="process2" mode="S"/>
+   </owner-list>
+   <waiter-list>
+    <waiter id="process1" mode="X"/>
+   </waiter-list>
+  </keylock>
+  <keylock dbid="5">
+   <owner-list>
+    <owner id="process1" mode="S"/>
+   </owner-list>
+   <waiter-list>
+    <waiter id="process2" mode="X"/>
+   </waiter-list>
+  </keylock>
+ </resource-list>
+</deadlock>`
+
+const sampleDeadlockGraphThreeWay = `
+<deadlock>
+ <victim-list>
+  <victimProcess id="process2"/>
+ </victim-list>
+ <process-list>
+  <process id="process1" spid="51" ecid="0" dbid="5" lockMode="X" waitresource="KEY: 5:72057594046185472 (1)">
+   <inputbuf>UPDATE dbo.deadlock_a SET value = value + 1 WHERE id = 1</inputbuf>
+  </process>
+  <process id="process2" spid="52" ecid="0" dbid="5" lockMode="X" waitresource="KEY: 5:72057594046251008 (2)">
+   <inputbuf>UPDATE dbo.deadlock_b SET value = value + 1 WHERE id = 1</inputbuf>
+  </process>
+  <process id="process3" spid="53" ecid="0" dbid="5" lockMode="X" waitresource="KEY: 5:72057594046316544 (3)">
+   <inputbuf>UPDATE dbo.deadlock_c SET value = value + 1 WHERE id = 1</inputbuf>
+  </process>
+ </process-list>
+ <resource-list>
+  <keylock dbid="5">
+   <owner-list>
+    <owner id="process2" mode="X"/>
+   </owner-list>
+   <waiter-list>
+    <waiter id="process1" mode="X"/>
+   </waiter-list>
+  </keylock>
+  <keylock dbid="5">
+   <owner-list>
+    <owner id="process3" mode="X"/>
+   </owner-list>
+   <waiter-list>
+    <waiter id="process2" mode="X"/>
+   </waiter-list>
+  </keylock>
+  <keylock dbid="5">
+   <owner-list>
+    <owner id="process1" mode="X"/>
+   </owner-list>
+   <waiter-list>
+    <waiter id="process3" mode="X"/>
+   </waiter-list>
+  </keylock>
+ </resource-list>
+</deadlock>`
diff --git a/src/go/plugin/go.d/collector/mssql/error_info.go b/src/go/plugin/go.d/collector/mssql/error_info.go
new file mode 100644
index 00000000000000..c83d41fbdb87c9
--- /dev/null
+++ b/src/go/plugin/go.d/collector/mssql/error_info.go
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package mssql
+
+import (
+	"context"
+	"database/sql"
+	"encoding/xml"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/netdata/netdata/go/plugins/pkg/funcapi"
+	"github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module"
+)
+
+const (
+	mssqlErrorAttrEnabled      = "enabled"
+	mssqlErrorAttrNotEnabled   = "not_enabled"
+	mssqlErrorAttrNotSupported = "not_supported"
+	mssqlErrorAttrNoData       = "no_data"
+)
+
+type mssqlErrorRow struct {
+	Time        time.Time
+	ErrorNumber *int64
+	Message     string
+	Query       string
+	QueryHash   string
+}
+
+type mssqlPlanOps struct {
+	HashMatch   int64
+	MergeJoin
int64 + NestedLoops int64 + Sorts int64 +} + +func (c *Collector) errorInfoParams(context.Context) ([]funcapi.ParamConfig, error) { + if !c.Config.GetErrorInfoFunctionEnabled() { + return nil, fmt.Errorf("error-info function disabled in configuration") + } + return []funcapi.ParamConfig{}, nil +} + +func (c *Collector) collectErrorInfo(ctx context.Context) *module.FunctionResponse { + if !c.Config.GetErrorInfoFunctionEnabled() { + return &module.FunctionResponse{ + Status: 503, + Message: "error-info not enabled: function disabled in configuration. " + + "To enable, set error_info_function_enabled: true in the MSSQL collector config.", + } + } + + sessionName := c.Config.GetErrorInfoSessionName() + status, rows, err := c.fetchMSSQLErrorRows(ctx, sessionName, c.TopQueriesLimit) + if err != nil { + if isDeadlockPermissionError(err) { + return &module.FunctionResponse{Status: 403, Message: deadlockPermissionMessage()} + } + if status == mssqlErrorAttrNotEnabled { + return &module.FunctionResponse{Status: 503, Message: "error-info not enabled: Extended Events session not found or ring_buffer target missing"} + } + return &module.FunctionResponse{Status: 500, Message: fmt.Sprintf("error-info query failed: %v", err)} + } + + data := make([][]any, 0, len(rows)) + for _, row := range rows { + var errNo any + if row.ErrorNumber != nil { + errNo = *row.ErrorNumber + } + data = append(data, []any{ + row.Time, + errNo, + row.Message, + row.Query, + row.QueryHash, + }) + } + + return &module.FunctionResponse{ + Status: 200, + Help: "Recent SQL errors from Extended Events error_reported", + Columns: buildMSSQLErrorInfoColumns(), + Data: data, + DefaultSortColumn: "timestamp", + } +} + +func buildMSSQLErrorInfoColumns() map[string]any { + columns := map[string]any{ + "timestamp": funcapi.Column{ + Index: 0, + Name: "Timestamp", + Type: funcapi.FieldTypeTimestamp, + Sortable: true, + ValueOptions: funcapi.ValueOptions{Transform: funcapi.FieldTransformDatetime}, + }.BuildColumn(), 
+ "errorNumber": funcapi.Column{ + Index: 1, + Name: "Error Number", + Type: ftInteger, + Sortable: true, + ValueOptions: funcapi.ValueOptions{Transform: trNumber}, + }.BuildColumn(), + "errorMessage": funcapi.Column{ + Index: 2, + Name: "Error Message", + Type: ftString, + Sortable: false, + FullWidth: true, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + "query": funcapi.Column{ + Index: 3, + Name: "Query", + Type: ftString, + Sortable: false, + FullWidth: true, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + "queryHash": funcapi.Column{ + Index: 4, + Name: "Query Hash", + Type: ftString, + Sortable: true, + Visible: false, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + } + return columns +} + +func mssqlErrorAttributionColumns() []mssqlColumnMeta { + return []mssqlColumnMeta{ + { + uiKey: "errorAttribution", + displayName: "Error Attribution", + dataType: ftString, + visible: true, + transform: trNone, + sortDir: sortAsc, + summary: summaryCount, + filter: filterMulti, + }, + { + uiKey: "errorNumber", + displayName: "Error Number", + dataType: ftInteger, + visible: true, + transform: trNumber, + sortDir: sortDesc, + summary: summaryMax, + filter: filterRange, + }, + { + uiKey: "sqlState", + displayName: "SQL State", + dataType: ftString, + visible: false, + transform: trNone, + sortDir: sortAsc, + summary: summaryCount, + filter: filterMulti, + }, + { + uiKey: "errorMessage", + displayName: "Error Message", + dataType: ftString, + visible: true, + transform: trNone, + sortDir: sortAsc, + summary: summaryCount, + filter: filterMulti, + fullWidth: true, + }, + } +} + +func mssqlPlanAttributionColumns() []mssqlColumnMeta { + return []mssqlColumnMeta{ + { + uiKey: "hashMatch", + displayName: "Hash Match Joins", + dataType: ftInteger, + visible: true, + transform: trNumber, + sortDir: sortDesc, + summary: summarySum, + filter: filterRange, + }, + { + uiKey: "mergeJoin", + 
displayName: "Merge Joins", + dataType: ftInteger, + visible: true, + transform: trNumber, + sortDir: sortDesc, + summary: summarySum, + filter: filterRange, + }, + { + uiKey: "nestedLoops", + displayName: "Nested Loops", + dataType: ftInteger, + visible: true, + transform: trNumber, + sortDir: sortDesc, + summary: summarySum, + filter: filterRange, + }, + { + uiKey: "sorts", + displayName: "Sorts", + dataType: ftInteger, + visible: true, + transform: trNumber, + sortDir: sortDesc, + summary: summarySum, + filter: filterRange, + }, + } +} + +func normalizeSQLText(text string) string { + if strings.TrimSpace(text) == "" { + return "" + } + fields := strings.Fields(text) + normalized := strings.Join(fields, " ") + normalized = strings.TrimSpace(normalized) + normalized = strings.TrimRight(normalized, ";") + return strings.TrimSpace(normalized) +} + +func rowString(value any) string { + switch v := value.(type) { + case nil: + return "" + case string: + return v + case []byte: + return string(v) + default: + return fmt.Sprint(v) + } +} + +func nullableString(value string) any { + if strings.TrimSpace(value) == "" { + return nil + } + return value +} + +func (c *Collector) collectMSSQLErrorDetails(ctx context.Context) (string, map[string]mssqlErrorRow) { + status, rows, err := c.fetchMSSQLErrorRows(ctx, c.Config.GetErrorInfoSessionName(), c.TopQueriesLimit) + if err != nil { + if status == mssqlErrorAttrNotEnabled { + return mssqlErrorAttrNotEnabled, nil + } + mapped := classifyMSSQLErrorAttrError(err) + c.Debugf("error attribution query failed: %v (status=%s)", err, mapped) + return mapped, nil + } + + if len(rows) == 0 { + return mssqlErrorAttrNoData, nil + } + + out := make(map[string]mssqlErrorRow, len(rows)*2) + for _, row := range rows { + if row.QueryHash != "" { + if _, ok := out[row.QueryHash]; !ok { + out[row.QueryHash] = row + } + } + key := normalizeSQLText(row.Query) + if key != "" { + if _, ok := out[key]; !ok { + out[key] = row + } + } + } + return 
mssqlErrorAttrEnabled, out +} + +func classifyMSSQLErrorAttrError(err error) string { + if err == nil { + return mssqlErrorAttrNotEnabled + } + if isDeadlockPermissionError(err) { + return mssqlErrorAttrNotEnabled + } + msg := strings.ToLower(err.Error()) + if strings.Contains(msg, "permission") || strings.Contains(msg, "denied") { + return mssqlErrorAttrNotEnabled + } + if strings.Contains(msg, "invalid column") || + strings.Contains(msg, "invalid object") || + strings.Contains(msg, "could not find") { + return mssqlErrorAttrNotSupported + } + return mssqlErrorAttrNotEnabled +} + +func (c *Collector) collectMSSQLPlanOps(ctx context.Context, data [][]any, cols []mssqlColumnMeta) map[string]map[string]mssqlPlanOps { + dbIdx := -1 + hashIdx := -1 + for i, col := range cols { + switch col.uiKey { + case "database": + dbIdx = i + case "queryHash": + hashIdx = i + } + } + if dbIdx < 0 || hashIdx < 0 { + return map[string]map[string]mssqlPlanOps{} + } + + hashesByDB := make(map[string][]string) + seen := make(map[string]map[string]bool) + for _, row := range data { + if dbIdx >= len(row) || hashIdx >= len(row) { + continue + } + dbName := rowString(row[dbIdx]) + queryHash := rowString(row[hashIdx]) + if dbName == "" || queryHash == "" { + continue + } + if seen[dbName] == nil { + seen[dbName] = make(map[string]bool) + } + if seen[dbName][queryHash] { + continue + } + seen[dbName][queryHash] = true + hashesByDB[dbName] = append(hashesByDB[dbName], queryHash) + } + + out := make(map[string]map[string]mssqlPlanOps) + for dbName, hashes := range hashesByDB { + ops, err := c.fetchMSSQLPlanOpsForDB(ctx, dbName, hashes) + if err != nil { + c.Debugf("plan attribution query failed for %s: %v", dbName, err) + continue + } + out[dbName] = ops + } + return out +} + +func (c *Collector) fetchMSSQLPlanOpsForDB(ctx context.Context, dbName string, hashes []string) (map[string]mssqlPlanOps, error) { + if len(hashes) == 0 { + return map[string]mssqlPlanOps{}, nil + } + + validHashes := 
make([]string, 0, len(hashes)) + for _, hash := range hashes { + if strings.HasPrefix(hash, "0x") { + validHashes = append(validHashes, hash) + } + } + if len(validHashes) == 0 { + return map[string]mssqlPlanOps{}, nil + } + + escapedDB := strings.ReplaceAll(dbName, "]", "]]") + query := fmt.Sprintf(` +SELECT + CONVERT(VARCHAR(64), q.query_hash, 1) AS query_hash, + CAST(p.query_plan AS NVARCHAR(MAX)) AS query_plan +FROM [%s].sys.query_store_query q +INNER JOIN [%s].sys.query_store_plan p ON q.query_id = p.query_id +WHERE q.query_hash IN (%s); +`, escapedDB, escapedDB, strings.Join(validHashes, ",")) + + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + rows, err := c.db.QueryContext(qctx, query) + if err != nil { + return nil, err + } + defer rows.Close() + + out := make(map[string]mssqlPlanOps) + for rows.Next() { + var hash sql.NullString + var plan sql.NullString + if err := rows.Scan(&hash, &plan); err != nil { + return nil, err + } + if !hash.Valid || !plan.Valid { + continue + } + ops := countPlanOperators(plan.String) + current := out[hash.String] + current.HashMatch += ops.HashMatch + current.MergeJoin += ops.MergeJoin + current.NestedLoops += ops.NestedLoops + current.Sorts += ops.Sorts + out[hash.String] = current + } + if err := rows.Err(); err != nil { + return nil, err + } + + return out, nil +} + +func (c *Collector) fetchMSSQLErrorRows(ctx context.Context, sessionName string, limit int) (string, []mssqlErrorRow, error) { + if limit <= 0 { + limit = 500 + } + + sessionExists, err := c.mssqlErrorSessionAvailable(ctx, sessionName) + if err != nil { + return mssqlErrorAttrNotEnabled, nil, err + } + if !sessionExists { + return mssqlErrorAttrNotEnabled, nil, fmt.Errorf("session not found") + } + + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + rows, err := c.db.QueryContext(qctx, queryMSSQLErrorInfo, sql.Named("sessionName", sessionName), sql.Named("limit", limit)) + if err != nil { + 
return mssqlErrorAttrNotSupported, nil, err + } + defer rows.Close() + + var results []mssqlErrorRow + for rows.Next() { + var ( + ts sql.NullTime + errNo sql.NullInt64 + message sql.NullString + sqlText sql.NullString + queryHash sql.NullString + errNoPtr *int64 + ) + if err := rows.Scan(&ts, &errNo, &message, &sqlText, &queryHash); err != nil { + return mssqlErrorAttrNotSupported, nil, err + } + if errNo.Valid { + val := errNo.Int64 + errNoPtr = &val + } + results = append(results, mssqlErrorRow{ + Time: ts.Time, + ErrorNumber: errNoPtr, + Message: message.String, + Query: sqlText.String, + QueryHash: queryHash.String, + }) + } + if err := rows.Err(); err != nil { + return mssqlErrorAttrNotSupported, nil, err + } + + return mssqlErrorAttrEnabled, results, nil +} + +func (c *Collector) mssqlErrorSessionAvailable(ctx context.Context, sessionName string) (bool, error) { + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + var count int + err := c.db.QueryRowContext(qctx, queryMSSQLErrorSessionExists, sql.Named("sessionName", sessionName)).Scan(&count) + if err != nil { + return false, err + } + if count == 0 { + return false, nil + } + + err = c.db.QueryRowContext(qctx, queryMSSQLErrorSessionHasRingBuffer, sql.Named("sessionName", sessionName)).Scan(&count) + if err != nil { + return false, err + } + return count > 0, nil +} + +func countPlanOperators(planXML string) mssqlPlanOps { + var ops mssqlPlanOps + if strings.TrimSpace(planXML) == "" { + return ops + } + decoder := xml.NewDecoder(strings.NewReader(planXML)) + for { + tok, err := decoder.Token() + if err != nil { + break + } + start, ok := tok.(xml.StartElement) + if !ok || start.Name.Local != "RelOp" { + continue + } + for _, attr := range start.Attr { + if attr.Name.Local != "PhysicalOp" { + continue + } + switch strings.ToLower(strings.TrimSpace(attr.Value)) { + case "hash match": + ops.HashMatch++ + case "merge join": + ops.MergeJoin++ + case "nested loops": + 
ops.NestedLoops++ + case "sort": + ops.Sorts++ + } + break + } + } + return ops +} diff --git a/src/go/plugin/go.d/collector/mssql/functions.go b/src/go/plugin/go.d/collector/mssql/functions.go index f168dc32975a29..0d081d471b22cd 100644 --- a/src/go/plugin/go.d/collector/mssql/functions.go +++ b/src/go/plugin/go.d/collector/mssql/functions.go @@ -179,6 +179,8 @@ var mssqlChartGroups = []mssqlChartGroup{ {key: "Rows", title: "Rows", columns: []string{"avgRows", "lastRows", "minRows", "maxRows", "stdevRows"}}, {key: "LogBytes", title: "Log Bytes", columns: []string{"avgLogBytes", "lastLogBytes", "minLogBytes", "maxLogBytes", "stdevLogBytes"}}, {key: "TempDB", title: "TempDB Usage", columns: []string{"avgTempdb", "lastTempdb", "minTempdb", "maxTempdb", "stdevTempdb"}}, + {key: "Joins", title: "Join Operators", columns: []string{"hashMatch", "mergeJoin", "nestedLoops"}}, + {key: "Sorts", title: "Sort Operations", columns: []string{"sorts"}}, } var mssqlLabelColumns = map[string]bool{ @@ -224,6 +226,20 @@ func mssqlMethods() []module.MethodConfig { }, }, }, + { + UpdateEvery: 10, + ID: "deadlock-info", + Name: "Deadlock Info", + Help: deadlockInfoHelp, + RequireCloud: true, + }, + { + UpdateEvery: 10, + ID: "error-info", + Name: "Error Info", + Help: "Recent SQL errors from Extended Events error_reported", + RequireCloud: true, + }, } } @@ -238,6 +254,10 @@ func mssqlMethodParams(ctx context.Context, job *module.Job, method string) ([]f switch method { case "top-queries": return collector.topQueriesParams(ctx) + case "deadlock-info": + return collector.deadlockInfoParams(ctx) + case "error-info": + return collector.errorInfoParams(ctx) default: return nil, fmt.Errorf("unknown method: %s", method) } @@ -261,6 +281,10 @@ func mssqlHandleMethod(ctx context.Context, job *module.Job, method string, para switch method { case "top-queries": return collector.collectTopQueries(ctx, params.Column(paramSort)) + case "deadlock-info": + return collector.collectDeadlockInfo(ctx) + 
case "error-info": + return collector.collectErrorInfo(ctx) default: return &module.FunctionResponse{Status: 404, Message: fmt.Sprintf("unknown method: %s", method)} } @@ -738,6 +762,89 @@ func (c *Collector) collectTopQueries(ctx context.Context, sortColumn string) *m return &module.FunctionResponse{Status: 500, Message: err.Error()} } + errorStatus, errorDetails := c.collectMSSQLErrorDetails(ctx) + planOpsByDB := c.collectMSSQLPlanOps(ctx, data, cols) + extraCols := append(mssqlErrorAttributionColumns(), mssqlPlanAttributionColumns()...) + + queryIdx := -1 + queryHashIdx := -1 + dbIdx := -1 + for i, col := range cols { + switch col.uiKey { + case "query": + queryIdx = i + case "queryHash": + queryHashIdx = i + case "database": + dbIdx = i + } + } + + if len(extraCols) > 0 { + for i := range data { + status := errorStatus + var errRow mssqlErrorRow + if errorStatus == mssqlErrorAttrEnabled { + found := false + if queryHashIdx >= 0 && queryHashIdx < len(data[i]) { + queryHash := rowString(data[i][queryHashIdx]) + if queryHash != "" { + if row, ok := errorDetails[queryHash]; ok { + status = mssqlErrorAttrEnabled + errRow = row + found = true + } + } + } + if !found && queryIdx >= 0 && queryIdx < len(data[i]) { + queryText := normalizeSQLText(rowString(data[i][queryIdx])) + if queryText != "" { + if row, ok := errorDetails[queryText]; ok { + status = mssqlErrorAttrEnabled + errRow = row + found = true + } + } + } + if !found { + status = mssqlErrorAttrNoData + } + } + + var hashMatch, mergeJoin, nestedLoops, sorts any + if dbIdx >= 0 && dbIdx < len(data[i]) && queryHashIdx >= 0 && queryHashIdx < len(data[i]) { + dbName := rowString(data[i][dbIdx]) + queryHash := rowString(data[i][queryHashIdx]) + if dbName != "" && queryHash != "" { + if opsByHash, ok := planOpsByDB[dbName]; ok { + if ops, ok := opsByHash[queryHash]; ok { + hashMatch = ops.HashMatch + mergeJoin = ops.MergeJoin + nestedLoops = ops.NestedLoops + sorts = ops.Sorts + } + } + } + } + + var errNo any + if 
errRow.ErrorNumber != nil { + errNo = *errRow.ErrorNumber + } + data[i] = append(data[i], + status, + errNo, + nil, // SQL State is not available in MSSQL + nullableString(rowString(errRow.Message)), + hashMatch, + mergeJoin, + nestedLoops, + sorts, + ) + } + cols = append(cols, extraCols...) + } + // Build dynamic sort options from available columns (only those actually detected) sortParam, sortOptions := c.topQueriesSortParam(cols) diff --git a/src/go/plugin/go.d/collector/mssql/functions_test.go b/src/go/plugin/go.d/collector/mssql/functions_test.go index 48e6eecdbecf25..e513a0939c58a9 100644 --- a/src/go/plugin/go.d/collector/mssql/functions_test.go +++ b/src/go/plugin/go.d/collector/mssql/functions_test.go @@ -13,16 +13,43 @@ func TestMssqlMethods(t *testing.T) { methods := mssqlMethods() require := assert.New(t) - require.Len(methods, 1) - require.Equal("top-queries", methods[0].ID) - require.Equal("Top Queries", methods[0].Name) - require.NotEmpty(methods[0].RequiredParams) + require.Len(methods, 3) + + topIdx := -1 + deadlockIdx := -1 + errorIdx := -1 + for i := range methods { + switch methods[i].ID { + case "top-queries": + topIdx = i + case "deadlock-info": + deadlockIdx = i + case "error-info": + errorIdx = i + } + } + + require.NotEqual(-1, topIdx, "expected top-queries method") + require.NotEqual(-1, deadlockIdx, "expected deadlock-info method") + require.NotEqual(-1, errorIdx, "expected error-info method") + + topMethod := methods[topIdx] + require.Equal("Top Queries", topMethod.Name) + require.NotEmpty(topMethod.RequiredParams) + + deadlockMethod := methods[deadlockIdx] + require.Equal("Deadlock Info", deadlockMethod.Name) + require.Empty(deadlockMethod.RequiredParams) + + errorMethod := methods[errorIdx] + require.Equal("Error Info", errorMethod.Name) + require.Empty(errorMethod.RequiredParams) // Verify at least one default sort option exists var sortParam *funcapi.ParamConfig - for i := range methods[0].RequiredParams { - if 
methods[0].RequiredParams[i].ID == "__sort" { - sortParam = &methods[0].RequiredParams[i] + for i := range topMethod.RequiredParams { + if topMethod.RequiredParams[i].ID == "__sort" { + sortParam = &topMethod.RequiredParams[i] break } } @@ -260,7 +287,12 @@ func TestCollector_buildMSSQLDynamicColumns(t *testing.T) { func TestMssqlMethods_SortOptionsHaveLabels(t *testing.T) { methods := mssqlMethods() + foundTopQueries := false for _, method := range methods { + if method.ID != "top-queries" { + continue + } + foundTopQueries = true var sortParam *funcapi.ParamConfig for i := range method.RequiredParams { if method.RequiredParams[i].ID == "__sort" { @@ -275,6 +307,7 @@ func TestMssqlMethods_SortOptionsHaveLabels(t *testing.T) { assert.Contains(t, opt.Name, "Top queries by", "label should have standard prefix") } } + assert.True(t, foundTopQueries, "expected top-queries method") } // TestMapAndValidateMSSQLSortColumn_NoSortOptions verifies fallback when no sort columns exist diff --git a/src/go/plugin/go.d/collector/mssql/integrations/microsoft_sql_server.md b/src/go/plugin/go.d/collector/mssql/integrations/microsoft_sql_server.md index 3e9cc9f543744c..689a792d37c4c2 100644 --- a/src/go/plugin/go.d/collector/mssql/integrations/microsoft_sql_server.md +++ b/src/go/plugin/go.d/collector/mssql/integrations/microsoft_sql_server.md @@ -28,7 +28,7 @@ It collects metrics from: - Performance counters (buffer manager, memory manager, SQL statistics) - Dynamic management views (DMVs) for wait statistics, locks, and sessions - Per-database transaction and lock statistics -- SQL Server Agent job status (if permissions allow) +- SQL Server Agent job status It connects to the SQL Server instance via TCP using the go-mssqldb driver and executes queries against: @@ -41,7 +41,7 @@ It connects to the SQL Server instance via TCP using the go-mssqldb driver and e - `sys.dm_os_process_memory` - SQL Server process memory - `sys.dm_os_sys_memory` - OS physical memory and page file - 
`sys.master_files` - Database file sizes -- `msdb.dbo.sysjobs` - SQL Agent job status (optional) +- `msdb.dbo.sysjobs` - SQL Agent job status This collector is supported on all platforms. @@ -49,7 +49,8 @@ This collector is supported on all platforms. This collector supports collecting metrics from multiple instances of this integration, including remote instances. The monitoring user requires the VIEW SERVER STATE permission to access DMVs. -For SQL Agent job monitoring, access to the msdb database is required. +For SQL Agent job monitoring (queried during collector startup), access to +`msdb.dbo.sysjobs` is required. ### Default Behavior @@ -316,6 +317,14 @@ Aggregated query execution statistics from Query Store runtime views, providing | Query | string | | | The SQL query text with literal values truncated at 4096 characters. Use this to identify the actual SQL being executed and spot parameterized queries or injection risks. | | Database | string | | | Database name where the query was executed. Essential for multi-database analysis to identify which database is experiencing query load. | | Calls | integer | | | Total number of times this query pattern has been executed. High values indicate frequently run queries that may impact server performance significantly. | +| Error Attribution | string | | | Status of error detail attribution for this query. Values: enabled, no_data, not_enabled, not_supported. | +| Error Number | integer | | | Most recent error number observed for this query (when error attribution is enabled). | +| SQL State | string | | hidden | SQLSTATE code (not available for SQL Server; usually empty). | +| Error Message | string | | | Most recent error message for this query (when error attribution is enabled). | +| Hash Match Joins | integer | | | Count of Hash Match join operators across all stored plans for this query. | +| Merge Joins | integer | | | Count of Merge Join operators across all stored plans for this query. 
| +| Nested Loops | integer | | | Count of Nested Loops operators across all stored plans for this query. | +| Sorts | integer | | | Count of Sort operators across all stored plans for this query. | | Total Time | duration | milliseconds | | Cumulative execution time across all query executions. This is a key metric for identifying the most resource-intensive queries in terms of total server time consumption. | | Avg Time | duration | milliseconds | | Average execution time per query run, calculated as weighted average when execution count is greater than zero. Compare with Total Time to determine if individual executions or high frequency drives resource usage. | | Last Time | duration | milliseconds | hidden | Execution time of the most recent execution for this query pattern. Useful for identifying recent performance changes or individual outlier executions. | @@ -374,6 +383,126 @@ Aggregated query execution statistics from Query Store runtime views, providing | StdDev TempDB | float | | hidden | Standard deviation of tempdb space usage. High variability suggests inconsistent temporary object usage patterns, potentially varying by query complexity, parameter types, or different data access patterns affecting temporary object creation. | +### Deadlock Info + +Retrieves the most recent deadlock event from SQL Server's `system_health` Extended Events ring buffer (`xml_deadlock_report`). + +The deadlock graph XML is parsed to attribute the deadlock to the participating processes and their query text, lock mode, lock status, and wait resource. + +Use cases: +- Identify which process was chosen as the deadlock victim +- Inspect the waiting resource and lock mode involved in the deadlock +- Correlate deadlocks with recent application changes or deployments + +Query text and wait resource strings are truncated at 4096 characters for display purposes. 
+ + +| Aspect | Description | +|:-------|:------------| +| Name | `Mssql:deadlock-info` | +| Require Cloud | yes | +| Performance | Executes on-demand queries against the `system_health` ring buffer:
• Not part of regular metric collection
• Overhead is limited to function execution time and XML parsing | +| Security | Query text and wait resource strings may include unmasked literal values including sensitive data (PII/secrets):
• SQL literals such as emails, IDs, or tokens
• Schema and table names that may be sensitive in some environments
• Restrict dashboard access to authorized personnel only | +| Availability | Available when:
• The collector has successfully connected to SQL Server
• `deadlock_info_function_enabled` is true
• The account has `VIEW SERVER STATE` permission
• Returns HTTP 200 with empty data when no deadlock is found
• Returns HTTP 403 when permission is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 561 when the deadlock graph cannot be parsed
• Returns HTTP 503 if the collector is still initializing or the function is disabled
• Returns HTTP 504 if the query times out | + +#### Prerequisites + +1. Ensure the account has the required permission: + ```sql + GRANT VIEW SERVER STATE TO [netdata]; + ``` +2. Enable the function in Netdata collector config: + ```yaml + jobs: + - name: local + dsn: "sqlserver://user:pass@localhost:1433" + deadlock_info_function_enabled: true + ``` +3. Verify the deadlock source is accessible: + ```sql + SELECT name + FROM sys.dm_xe_sessions + WHERE name = 'system_health'; + ``` + +#### Parameters + +This function has no parameters. + +#### Returns + +Parsed deadlock participants from the latest detected deadlock event. Each row represents one process involved in the deadlock. + +| Column | Type | Unit | Visibility | Description | +|:-------|:-----|:-----|:-----------|:------------| +| Row ID | string | | hidden | Unique row identifier composed of deadlock ID and process ID. | +| Deadlock ID | string | | | Identifier for the deadlock event, derived from the deadlock timestamp to group participating processes. | +| Timestamp | timestamp | | | Timestamp of the deadlock event from the ring buffer when available; otherwise the function execution time. | +| Process ID | string | | | Deadlock graph process identifier for the process involved in the deadlock. | +| SPID | integer | | | SQL Server session ID (SPID) for the process when available. | +| ECID | integer | | | Execution context ID (ECID) for parallel execution contexts when available. | +| Victim | string | | | "true" when the process was chosen as the deadlock victim and rolled back; otherwise "false". | +| Query | string | | | SQL query text for the process involved in the deadlock. Truncated to 4096 characters. | +| Lock Mode | string | | | Lock mode reported for the process within the deadlock graph (for example X or S). | +| Lock Status | string | | | Lock status for the process. WAITING indicates the process was waiting on a lock. 
| +| Wait Resource | string | | | Lock resource identifier from the deadlock graph showing what the process was waiting on. | +| Database | string | | | Database name mapped from the deadlock graph database ID when available. | + +### Error Info + +Retrieves recent SQL errors from a user-managed Extended Events session that captures `sqlserver.error_reported` +with the `sql_text` and `query_hash` actions (query_hash enables reliable mapping to top-queries). + +| Aspect | Description | +|:-------|:------------| +| Name | `Mssql:error-info` | +| Require Cloud | yes | +| Performance | Executes on-demand queries against the configured Extended Events ring buffer:
• Not part of regular metric collection
• Overhead is limited to function execution time and XML parsing | +| Security | Error messages and query text may include unmasked literal values including sensitive data (PII/secrets):
• Restrict dashboard access to authorized personnel only | +| Availability | Available when:
• The collector has successfully connected to SQL Server
• `error_info_function_enabled` is true
• The Extended Events session exists and has a ring_buffer target
• The account has `VIEW SERVER STATE` permission
• Returns HTTP 200 with empty data when no errors are found
• Returns HTTP 403 when permission is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 503 if the session is not enabled or the function is disabled
• Returns HTTP 504 if the query times out | + +#### Prerequisites + +1. Create an Extended Events session (admin-controlled) that captures `sqlserver.error_reported` with `sql_text` and `query_hash`: + ```sql + CREATE EVENT SESSION [netdata_errors] ON SERVER + ADD EVENT sqlserver.error_reported( + ACTION(sqlserver.sql_text, sqlserver.query_hash) + ) + ADD TARGET package0.ring_buffer; + GO + ALTER EVENT SESSION [netdata_errors] ON SERVER STATE = START; + ``` +2. Ensure the account has the required permission: + ```sql + GRANT VIEW SERVER STATE TO [netdata]; + ``` +3. Enable the function and (optionally) set the session name in Netdata config: + ```yaml + jobs: + - name: local + dsn: "sqlserver://user:pass@localhost:1433" + error_info_function_enabled: true + error_info_session_name: netdata_errors + ``` + +#### Parameters + +This function has no parameters. + +#### Returns + +Recent error events from the configured Extended Events session. + +| Column | Type | Unit | Visibility | Description | +|:-------|:-----|:-----|:-----------|:------------| +| Timestamp | timestamp | | | Timestamp of the error event. | +| Error Number | integer | | | SQL Server error number. | +| Error Message | string | | | Error message text. | +| Query | string | | | SQL text captured with the error event. | +| Query Hash | string | | hidden | Query hash captured with the error event (used for mapping into top-queries). | +| Query Hash | string | | hidden | Query hash captured with the error event (used for mapping into top-queries). 
| + ## Alerts @@ -410,7 +539,7 @@ CREATE LOGIN netdata_user WITH PASSWORD = 'YourStrongPassword!'; -- Grant VIEW SERVER STATE (required for DMVs) GRANT VIEW SERVER STATE TO netdata_user; --- Optional: Grant access to msdb for SQL Agent job monitoring +-- Grant access to msdb for SQL Agent job monitoring (required) USE msdb; CREATE USER netdata_user FOR LOGIN netdata_user; GRANT SELECT ON dbo.sysjobs TO netdata_user; @@ -426,9 +555,9 @@ GRANT SELECT ON dbo.MSsubscriptions TO netdata_user; **Required permissions:** - `VIEW SERVER STATE` - Access to dynamic management views +- `SELECT on msdb.dbo.sysjobs` - SQL Agent job status monitoring **Optional permissions:** -- `SELECT on msdb.dbo.sysjobs` - SQL Agent job status monitoring - `SELECT on distribution.dbo.MSreplication_monitordata` - Replication monitoring - `SELECT on distribution.dbo.MSpublications` - Publication information - `SELECT on distribution.dbo.MSsubscriptions` - Subscription counts @@ -678,6 +807,3 @@ Ensure SQL Server is configured for mixed mode authentication if using SQL login The monitoring user needs VIEW SERVER STATE permission. 
Grant it with: `GRANT VIEW SERVER STATE TO netdata_user;` - - - diff --git a/src/go/plugin/go.d/collector/mssql/metadata.yaml b/src/go/plugin/go.d/collector/mssql/metadata.yaml index dc4dd9ee7edbe7..02f41bc6733a10 100644 --- a/src/go/plugin/go.d/collector/mssql/metadata.yaml +++ b/src/go/plugin/go.d/collector/mssql/metadata.yaml @@ -37,7 +37,7 @@ modules: - Performance counters (buffer manager, memory manager, SQL statistics) - Dynamic management views (DMVs) for wait statistics, locks, and sessions - Per-database transaction and lock statistics - - SQL Server Agent job status (if permissions allow) + - SQL Server Agent job status method_description: | It connects to the SQL Server instance via TCP using the go-mssqldb driver and executes queries against: @@ -49,7 +49,7 @@ modules: - `sys.dm_os_process_memory` - SQL Server process memory - `sys.dm_os_sys_memory` - OS physical memory and page file - `sys.master_files` - Database file sizes - - `msdb.dbo.sysjobs` - SQL Agent job status (optional) + - `msdb.dbo.sysjobs` - SQL Agent job status default_behavior: auto_detection: description: | @@ -64,7 +64,8 @@ modules: additional_permissions: description: | The monitoring user requires the VIEW SERVER STATE permission to access DMVs. - For SQL Agent job monitoring, access to the msdb database is required. + SQL Agent job monitoring is part of collector startup, so access to + `msdb.dbo.sysjobs` is required. 
supported_platforms: include: [] exclude: [] @@ -82,7 +83,7 @@ modules: -- Grant VIEW SERVER STATE (required for DMVs) GRANT VIEW SERVER STATE TO netdata_user; - -- Optional: Grant access to msdb for SQL Agent job monitoring + -- Grant access to msdb for SQL Agent job monitoring (required) USE msdb; CREATE USER netdata_user FOR LOGIN netdata_user; GRANT SELECT ON dbo.sysjobs TO netdata_user; @@ -98,9 +99,9 @@ modules: **Required permissions:** - `VIEW SERVER STATE` - Access to dynamic management views + - `SELECT on msdb.dbo.sysjobs` - SQL Agent job status monitoring **Optional permissions:** - - `SELECT on msdb.dbo.sysjobs` - SQL Agent job status monitoring - `SELECT on distribution.dbo.MSreplication_monitordata` - Replication monitoring - `SELECT on distribution.dbo.MSpublications` - Publication information - `SELECT on distribution.dbo.MSsubscriptions` - Subscription counts @@ -271,6 +272,39 @@ modules: type: integer unit: "" description: Total number of times this query pattern has been executed. High values indicate frequently run queries that may impact server performance significantly. + - name: Error Attribution + type: string + unit: "" + description: "Status of error detail attribution for this query. Values: enabled, no_data, not_enabled, not_supported." + - name: Error Number + type: integer + unit: "" + description: "Most recent error number observed for this query (when error attribution is enabled)." + - name: SQL State + type: string + unit: "" + visibility: hidden + description: "SQLSTATE code (not available for SQL Server; usually empty)." + - name: Error Message + type: string + unit: "" + description: "Most recent error message for this query (when error attribution is enabled)." + - name: Hash Match Joins + type: integer + unit: "" + description: "Count of Hash Match join operators across all stored plans for this query." 
+ - name: Merge Joins + type: integer + unit: "" + description: "Count of Merge Join operators across all stored plans for this query." + - name: Nested Loops + type: integer + unit: "" + description: "Count of Nested Loops operators across all stored plans for this query." + - name: Sorts + type: integer + unit: "" + description: "Count of Sort operators across all stored plans for this query." - name: Total Time type: duration unit: "milliseconds" @@ -582,6 +616,123 @@ modules: availability: | Available when:
• The collector has successfully connected to SQL Server
• Query Store is enabled on at least one user database
• Returns HTTP 503 if collector is still initializing
• Returns HTTP 500 if the query fails
• Returns HTTP 504 if the query times out require_cloud: true + - id: deadlock-info + name: Deadlock Info + description: | + Retrieves the most recent deadlock event from SQL Server's `system_health` Extended Events ring buffer (`xml_deadlock_report`). + + The deadlock graph XML is parsed to attribute the deadlock to the participating processes and their query text, lock mode, lock status, and wait resource. + + Use cases: + - Identify which process was chosen as the deadlock victim + - Inspect the waiting resource and lock mode involved in the deadlock + - Correlate deadlocks with recent application changes or deployments + + Query text and wait resource strings are truncated at 4096 characters for display purposes. + returns: + description: Parsed deadlock participants from the latest detected deadlock event. Each row represents one process involved in the deadlock. + columns: + - name: Row ID + type: string + unit: "" + visibility: hidden + description: "Unique row identifier composed of deadlock ID and process ID." + - name: Deadlock ID + type: string + unit: "" + description: "Identifier for the deadlock event, derived from the deadlock timestamp to group participating processes." + - name: Timestamp + type: timestamp + unit: "" + description: "Timestamp of the deadlock event from the ring buffer when available; otherwise the function execution time." + - name: Process ID + type: string + unit: "" + description: "Deadlock graph process identifier for the process involved in the deadlock." + - name: SPID + type: integer + unit: "" + description: "SQL Server session ID (SPID) for the process when available." + - name: ECID + type: integer + unit: "" + description: "Execution context ID (ECID) for parallel execution contexts when available." + - name: Victim + type: string + unit: "" + description: "\"true\" when the process was chosen as the deadlock victim and rolled back; otherwise \"false\"." 
+ - name: Query + type: string + unit: "" + description: "SQL query text for the process involved in the deadlock. Truncated to 4096 characters." + - name: Lock Mode + type: string + unit: "" + description: "Lock mode reported for the process within the deadlock graph (for example X or S)." + - name: Lock Status + type: string + unit: "" + description: "Lock status for the process. WAITING indicates the process was waiting on a lock." + - name: Wait Resource + type: string + unit: "" + description: "Lock resource identifier from the deadlock graph showing what the process was waiting on." + - name: Database + type: string + unit: "" + description: "Database name mapped from the deadlock graph database ID when available." + performance: | + Executes on-demand queries against the `system_health` ring buffer:
• Not part of regular metric collection
• Overhead is limited to function execution time and XML parsing + security: | + Query text and wait resource strings may include unmasked literal values including sensitive data (PII/secrets):
• SQL literals such as emails, IDs, or tokens
• Schema and table names that may be sensitive in some environments
• Restrict dashboard access to authorized personnel only + availability: | + Available when:
• The collector has successfully connected to SQL Server
• `deadlock_info_function_enabled` is true
• The account has `VIEW SERVER STATE` permission
• Returns HTTP 200 with empty data when no deadlock is found
• Returns HTTP 403 when permission is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 561 when the deadlock graph cannot be parsed
• Returns HTTP 503 if the collector is still initializing or the function is disabled
• Returns HTTP 504 if the query times out + require_cloud: true + - id: error-info + name: Error Info + description: | + Retrieves recent SQL errors from a user-managed Extended Events session that captures `sqlserver.error_reported` + with both the `sql_text` and `query_hash` actions. + + The session must be created by an administrator and include a `ring_buffer` target. Netdata reads the ring buffer + and returns recent error events with error number, message, and SQL text. The `query_hash` action is required for + reliable mapping into `top-queries` (query text fallback is best-effort). + + Use cases: + - Identify recent query errors and their messages + - Correlate errors to query text + - Validate error rates seen in top-queries + returns: + description: Recent error events from the configured Extended Events session. + columns: + - name: Timestamp + type: timestamp + unit: "" + description: "Timestamp of the error event." + - name: Error Number + type: integer + unit: "" + description: "SQL Server error number." + - name: Error Message + type: string + unit: "" + description: "Error message text." + - name: Query + type: string + unit: "" + description: "SQL text captured with the error event." + - name: Query Hash + type: string + unit: "" + visibility: hidden + description: "Query hash captured with the error event (used for mapping into top-queries)." + performance: | + Executes on-demand queries against the configured Extended Events ring buffer:
• Not part of regular metric collection
• Overhead is limited to function execution time and XML parsing + security: | + Error messages and query text may include unmasked literal values including sensitive data (PII/secrets):
• Restrict dashboard access to authorized personnel only + availability: | + Available when:
• The collector has successfully connected to SQL Server
• `error_info_function_enabled` is true
• The Extended Events session exists and has a ring_buffer target
• The account has `VIEW SERVER STATE` permission
• Returns HTTP 200 with empty data when no errors are found
• Returns HTTP 403 when permission is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 503 if the session is not enabled or the function is disabled
• Returns HTTP 504 if the query times out + require_cloud: true metrics: folding: title: Metrics diff --git a/src/go/plugin/go.d/collector/mssql/queries.go b/src/go/plugin/go.d/collector/mssql/queries.go index fb4852127431c9..d64e2399a4dcfe 100644 --- a/src/go/plugin/go.d/collector/mssql/queries.go +++ b/src/go/plugin/go.d/collector/mssql/queries.go @@ -201,12 +201,72 @@ WHERE object_name LIKE '%SQL Errors%' AND counter_name = 'Errors/sec'; ` +// querySystemHealthLatestDeadlock retrieves the latest xml_deadlock_report event +// from the system_health Extended Events ring buffer. +const querySystemHealthLatestDeadlock = ` +WITH xevents AS ( + SELECT CAST(xet.target_data AS XML) AS target_data + FROM sys.dm_xe_session_targets AS xet + JOIN sys.dm_xe_sessions AS xs ON xs.address = xet.event_session_address + WHERE xs.name = 'system_health' + AND xet.target_name = 'ring_buffer' +) +SELECT TOP (1) + xevent.value('@timestamp', 'datetime2(7)') AS deadlock_time, + CONVERT(nvarchar(max), xevent.query('(data/value/deadlock)[1]')) AS deadlock_xml +FROM xevents +CROSS APPLY target_data.nodes('RingBufferTarget/event[@name="xml_deadlock_report"]') AS T(xevent) +ORDER BY deadlock_time DESC; +` + // queryDatabaseStatus gets database state and read-only status const queryDatabaseStatus = ` SELECT name, state, is_read_only FROM sys.databases; ` +// queryDatabaseNamesByID retrieves database_id to name mappings. +const queryDatabaseNamesByID = ` +SELECT database_id, name +FROM sys.databases; +` + +// queryMSSQLErrorSessionExists checks for the configured Extended Events session. +const queryMSSQLErrorSessionExists = ` +SELECT COUNT(*) +FROM sys.dm_xe_sessions +WHERE name = @sessionName; +` + +// queryMSSQLErrorSessionHasRingBuffer verifies that the session has a ring_buffer target. 
+const queryMSSQLErrorSessionHasRingBuffer = ` +SELECT COUNT(*) +FROM sys.dm_xe_session_targets AS xet +JOIN sys.dm_xe_sessions AS xs ON xs.address = xet.event_session_address +WHERE xs.name = @sessionName + AND xet.target_name = 'ring_buffer'; +` + +// queryMSSQLErrorInfo reads recent error_reported events from the ring_buffer target. +const queryMSSQLErrorInfo = ` +WITH xevents AS ( + SELECT CAST(xet.target_data AS XML) AS target_data + FROM sys.dm_xe_session_targets AS xet + JOIN sys.dm_xe_sessions AS xs ON xs.address = xet.event_session_address + WHERE xs.name = @sessionName + AND xet.target_name = 'ring_buffer' +) +SELECT TOP (@limit) + xevent.value('@timestamp', 'datetime2(7)') AS event_time, + xevent.value('(data[@name="error_number"]/value)[1]', 'int') AS error_number, + xevent.value('(data[@name="message"]/value)[1]', 'nvarchar(max)') AS message, + xevent.value('(action[@name="sql_text"]/value)[1]', 'nvarchar(max)') AS sql_text, + CONVERT(VARCHAR(64), xevent.value('(action[@name="query_hash"]/value)[1]', 'varbinary(8)'), 1) AS query_hash +FROM xevents +CROSS APPLY target_data.nodes('RingBufferTarget/event[@name="error_reported"]') AS T(xevent) +ORDER BY event_time DESC; +` + // queryReplicationStatus gets replication publication status (if configured) // Groups by publication to aggregate across agent types and excludes 'ALL' placeholder const queryReplicationStatus = ` diff --git a/src/go/plugin/go.d/collector/mysql/collector.go b/src/go/plugin/go.d/collector/mysql/collector.go index e3fbab536253be..4e69c879576c83 100644 --- a/src/go/plugin/go.d/collector/mysql/collector.go +++ b/src/go/plugin/go.d/collector/mysql/collector.go @@ -64,13 +64,15 @@ func New() *Collector { } type Config struct { - Vnode string `yaml:"vnode,omitempty" json:"vnode"` - UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"` - AutoDetectionRetry int `yaml:"autodetection_retry,omitempty" json:"autodetection_retry"` - DSN string `yaml:"dsn" json:"dsn"` - MyCNF string 
`yaml:"my.cnf,omitempty" json:"my.cnf"` - Timeout confopt.Duration `yaml:"timeout,omitempty" json:"timeout"` - TopQueriesLimit int `yaml:"top_queries_limit,omitempty" json:"top_queries_limit,omitempty"` + Vnode string `yaml:"vnode,omitempty" json:"vnode"` + UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"` + AutoDetectionRetry int `yaml:"autodetection_retry,omitempty" json:"autodetection_retry"` + DSN string `yaml:"dsn" json:"dsn"` + MyCNF string `yaml:"my.cnf,omitempty" json:"my.cnf"` + Timeout confopt.Duration `yaml:"timeout,omitempty" json:"timeout"` + TopQueriesLimit int `yaml:"top_queries_limit,omitempty" json:"top_queries_limit,omitempty"` + DeadlockInfoFunctionEnabled *bool `yaml:"deadlock_info_function_enabled,omitempty" json:"deadlock_info_function_enabled,omitempty"` + ErrorInfoFunctionEnabled *bool `yaml:"error_info_function_enabled,omitempty" json:"error_info_function_enabled,omitempty"` } type Collector struct { @@ -119,6 +121,22 @@ func (c *Collector) Configuration() any { return c.Config } +// GetDeadlockInfoFunctionEnabled returns whether the deadlock-info function is enabled (default: true). +func (c *Config) GetDeadlockInfoFunctionEnabled() bool { + if c.DeadlockInfoFunctionEnabled == nil { + return true + } + return *c.DeadlockInfoFunctionEnabled +} + +// GetErrorInfoFunctionEnabled returns whether the error-info function is enabled (default: true). 
+func (c *Config) GetErrorInfoFunctionEnabled() bool { + if c.ErrorInfoFunctionEnabled == nil { + return true + } + return *c.ErrorInfoFunctionEnabled +} + func (c *Collector) Init(context.Context) error { if c.MyCNF != "" { dsn, err := dsnFromFile(c.MyCNF) diff --git a/src/go/plugin/go.d/collector/mysql/config_schema.json b/src/go/plugin/go.d/collector/mysql/config_schema.json index e01643a68e213d..092a733ced3922 100644 --- a/src/go/plugin/go.d/collector/mysql/config_schema.json +++ b/src/go/plugin/go.d/collector/mysql/config_schema.json @@ -48,6 +48,18 @@ "minimum": 1, "maximum": 5000, "default": 500 + }, + "deadlock_info_function_enabled": { + "title": "Enable Deadlock Info Function", + "description": "Enable the deadlock-info function. WARNING: query text may contain unmasked sensitive literals (PII). Only enable after ensuring proper access controls to the Netdata dashboard. This function reads SHOW ENGINE INNODB STATUS and may require PROCESS privilege.", + "type": "boolean", + "default": true + }, + "error_info_function_enabled": { + "title": "Enable Error Info Function", + "description": "Enable the error-info function. WARNING: error messages and query text may contain unmasked sensitive literals (PII). This function reads Performance Schema statement history tables; ensure proper access controls to the Netdata dashboard.", + "type": "boolean", + "default": true } }, "required": [ @@ -69,6 +81,12 @@ }, "timeout": { "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)." + }, + "deadlock_info_function_enabled": { + "ui:help": "When enabled, the deadlock-info function becomes available in the Netdata dashboard. WARNING: query text may contain unmasked sensitive literals; restrict dashboard access." + }, + "error_info_function_enabled": { + "ui:help": "When enabled, the error-info function becomes available in the Netdata dashboard. WARNING: error messages and query text may include sensitive literals." 
} } } diff --git a/src/go/plugin/go.d/collector/mysql/deadlock_info.go b/src/go/plugin/go.d/collector/mysql/deadlock_info.go new file mode 100644 index 00000000000000..08e37dfaf59be9 --- /dev/null +++ b/src/go/plugin/go.d/collector/mysql/deadlock_info.go @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package mysql + +import ( + "bufio" + "context" + "database/sql" + "errors" + "fmt" + "regexp" + "sort" + "strconv" + "strings" + "time" + + mysqlDriver "github.com/go-sql-driver/mysql" + + "github.com/netdata/netdata/go/plugins/pkg/funcapi" + "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" + "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/strmutil" +) + +const ( + deadlockIdxRowID = iota + deadlockIdxDeadlockID + deadlockIdxTimestamp + deadlockIdxProcessID + deadlockIdxSpid + deadlockIdxEcid + deadlockIdxIsVictim + deadlockIdxQueryText + deadlockIdxLockMode + deadlockIdxLockStatus + deadlockIdxWaitResource + deadlockIdxDatabase + deadlockColumnCount +) + +const ( + deadlockSectionWaiting = "waiting" + deadlockSectionHolds = "holds" +) + +var ( + reDeadlockHeader = regexp.MustCompile(`LATEST DETECTED DEADLOCK`) + reDeadlockTxn = regexp.MustCompile(`(?i)^\*\*\* \((\d+)\) TRANSACTION:?`) + reDeadlockWait = regexp.MustCompile(`(?i)^\*\*\* \((\d+)\) WAITING FOR THIS LOCK TO BE GRANTED:?`) + reDeadlockHolds = regexp.MustCompile(`(?i)^\*\*\* \((\d+)\) HOLDS THE LOCK\(S\):?`) + reDeadlockWaitNoTxn = regexp.MustCompile(`(?i)^\*\*\*\s*WAITING FOR THIS LOCK TO BE GRANTED:?`) + reDeadlockHoldsNoTxn = regexp.MustCompile(`(?i)^\*\*\*\s*HOLDS THE LOCK\(S\):?`) + reDeadlockVictim = regexp.MustCompile(`(?i)^\*\*\* WE ROLL BACK TRANSACTION \((\d+)\)`) + reDeadlockThread = regexp.MustCompile(`(?i)\b(?:mysql|mariadb)?\s*thread id\s+(\d+)`) + reDeadlockMode = regexp.MustCompile(`(?i)lock[_ ]mode\s+([A-Z0-9_-]+)`) + reDeadlockTS = regexp.MustCompile(`\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\b`) + reDeadlockTable = 
regexp.MustCompile(`(?i)\bof\s+table\s+` + "`?" + `([-\w$]+)` + "`?" + `\.` + "`?" + `([-\w$]+)` + "`?") + reQueryTableRef = regexp.MustCompile(`(?i)\b(?:from|update|into|join)\s+` + "`?" + `([-\w$]+)` + "`?" + `\.` + "`?" + `([-\w$]+)` + "`?") + reSQLStatement = regexp.MustCompile(`(?i)^(?:/\*.*\*/\s*)?(SELECT|UPDATE|INSERT|DELETE|REPLACE|WITH|ALTER|CREATE|DROP|TRUNCATE|LOCK|UNLOCK|SET|SHOW|CALL|EXEC|EXECUTE|DO|BEGIN|COMMIT|ROLLBACK|MERGE)\b`) +) + +const ( + deadlockInfoHelp = "Latest detected deadlock from SHOW ENGINE INNODB STATUS. WARNING: query text may include unmasked sensitive literals; restrict dashboard access." + deadlockParseErrorStatus = 561 +) + +type mysqlDeadlockTxn struct { + txnNum int + threadID string + queryText string + lockMode string + lockStatus string + waitResource string +} + +type mysqlDeadlockParseResult struct { + found bool + deadlockTime time.Time + victimTxnNum int + transactions []*mysqlDeadlockTxn + parseErr error +} + +func (c *Collector) deadlockInfoParams(context.Context) ([]funcapi.ParamConfig, error) { + if !c.Config.GetDeadlockInfoFunctionEnabled() { + return nil, fmt.Errorf("deadlock-info function disabled in configuration") + } + return []funcapi.ParamConfig{}, nil +} + +func (c *Collector) collectDeadlockInfo(ctx context.Context) *module.FunctionResponse { + if !c.Config.GetDeadlockInfoFunctionEnabled() { + return &module.FunctionResponse{ + Status: 503, + Message: "deadlock-info function has been disabled in configuration. " + + "To enable, set deadlock_info_function_enabled: true in the MySQL collector config.", + } + } + + statusText, err := c.queryInnoDBStatus(ctx) + if err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return c.deadlockInfoResponse(504, "deadlock query timed out", nil) + } + if isMySQLPermissionError(err) { + return c.deadlockInfoResponse( + 403, + "Deadlock info requires permission to run SHOW ENGINE INNODB STATUS. 
"+ + "Grant with: GRANT USAGE, REPLICATION CLIENT, PROCESS ON *.* TO 'netdata'@'%';", + nil, + ) + } + c.Warningf("deadlock-info: query failed: %v", err) + return c.deadlockInfoResponse(500, fmt.Sprintf("deadlock query failed: %v", err), nil) + } + + parseRes := parseInnoDBDeadlock(statusText, time.Now().UTC()) + if parseRes.parseErr != nil { + c.Warningf("deadlock-info: parse failed: %v", parseRes.parseErr) + return c.deadlockInfoResponse(deadlockParseErrorStatus, "deadlock section could not be parsed", nil) + } + if !parseRes.found { + return c.deadlockInfoResponse(200, "no deadlock found in SHOW ENGINE INNODB STATUS", nil) + } + + deadlockID := generateDeadlockID(parseRes.deadlockTime) + rows := buildDeadlockRows(parseRes, deadlockID) + if len(rows) == 0 { + return c.deadlockInfoResponse(200, "deadlock detected but no transactions could be parsed", nil) + } + + return c.deadlockInfoResponse(200, "latest detected deadlock", rows) +} + +func (c *Collector) deadlockInfoResponse(status int, message string, data [][]any) *module.FunctionResponse { + if data == nil { + data = make([][]any, 0) + } + return &module.FunctionResponse{ + Status: status, + Help: deadlockInfoHelp, + Message: message, + Columns: c.buildDeadlockColumns(), + Data: data, + DefaultSortColumn: "timestamp", + } +} + +func (c *Collector) queryInnoDBStatus(ctx context.Context) (string, error) { + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + var typ, name, status sql.NullString + if err := c.db.QueryRowContext(qctx, queryShowEngineInnoDBStatus).Scan(&typ, &name, &status); err != nil { + return "", err + } + if !status.Valid { + return "", fmt.Errorf("innodb status response was empty") + } + return status.String, nil +} + +func (c *Collector) buildDeadlockColumns() map[string]any { + const ( + ftString = funcapi.FieldTypeString + ftInteger = funcapi.FieldTypeInteger + ftTimestamp = funcapi.FieldTypeTimestamp + + trNone = funcapi.FieldTransformNone + trNumber = 
funcapi.FieldTransformNumber + trDatetime = funcapi.FieldTransformDatetime + + visValue = funcapi.FieldVisualValue + visPill = funcapi.FieldVisualPill + + sortAsc = funcapi.FieldSortAscending + sortDesc = funcapi.FieldSortDescending + + summaryCount = funcapi.FieldSummaryCount + summaryMax = funcapi.FieldSummaryMax + + filterMulti = funcapi.FieldFilterMultiselect + filterRange = funcapi.FieldFilterRange + ) + + columns := map[string]any{ + "row_id": funcapi.Column{ + Index: deadlockIdxRowID, + Name: "Row ID", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Sticky: false, + Summary: summaryCount, + Filter: filterMulti, + FullWidth: false, + Wrap: false, + UniqueKey: true, + Visible: false, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + DecimalPoints: 0, + }, + }.BuildColumn(), + "deadlock_id": funcapi.Column{ + Index: deadlockIdxDeadlockID, + Name: "Deadlock ID", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "timestamp": funcapi.Column{ + Index: deadlockIdxTimestamp, + Name: "Timestamp", + Type: ftTimestamp, + Visualization: visValue, + Sort: sortDesc, + Sortable: true, + Summary: summaryMax, + Filter: filterRange, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trDatetime, + }, + }.BuildColumn(), + "process_id": funcapi.Column{ + Index: deadlockIdxProcessID, + Name: "Process ID", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "spid": funcapi.Column{ + Index: deadlockIdxSpid, + Name: "Connection ID", + Type: ftInteger, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterRange, + Visible: 
true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNumber, + }, + }.BuildColumn(), + "ecid": funcapi.Column{ + Index: deadlockIdxEcid, + Name: "ECID", + Type: ftInteger, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterRange, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNumber, + }, + }.BuildColumn(), + "is_victim": funcapi.Column{ + Index: deadlockIdxIsVictim, + Name: "Victim", + Type: ftString, + Visualization: visPill, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "query_text": funcapi.Column{ + Index: deadlockIdxQueryText, + Name: "Query", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: false, + Sticky: true, + Summary: summaryCount, + Filter: filterMulti, + FullWidth: true, + Wrap: true, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "lock_mode": funcapi.Column{ + Index: deadlockIdxLockMode, + Name: "Lock Mode", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "lock_status": funcapi.Column{ + Index: deadlockIdxLockStatus, + Name: "Lock Status", + Type: ftString, + Visualization: visPill, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + "wait_resource": funcapi.Column{ + Index: deadlockIdxWaitResource, + Name: "Wait Resource", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: false, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + 
}.BuildColumn(), + "database": funcapi.Column{ + Index: deadlockIdxDatabase, + Name: "Database", + Type: ftString, + Visualization: visValue, + Sort: sortAsc, + Sortable: true, + Summary: summaryCount, + Filter: filterMulti, + Visible: true, + ValueOptions: funcapi.ValueOptions{ + Transform: trNone, + }, + }.BuildColumn(), + } + return columns +} + +func parseInnoDBDeadlock(status string, now time.Time) mysqlDeadlockParseResult { + result := mysqlDeadlockParseResult{ + found: false, + deadlockTime: now.UTC(), + } + + section, ok := extractDeadlockSection(status) + if !ok { + return result + } + result.found = true + + if ts, ok := parseDeadlockTimestamp(section); ok { + result.deadlockTime = ts.UTC() + } + + scanner := bufio.NewScanner(strings.NewReader(section)) + scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + + txnByNum := make(map[int]*mysqlDeadlockTxn) + txnOrder := make([]int, 0, 4) + + currentTxnNum := 0 + currentSection := "" + expectingQueryTxn := 0 + victimTxnNum := 0 + + ensureTxn := func(num int) *mysqlDeadlockTxn { + if txn, ok := txnByNum[num]; ok { + return txn + } + txn := &mysqlDeadlockTxn{txnNum: num} + txnByNum[num] = txn + txnOrder = append(txnOrder, num) + return txn + } + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + + if num, ok := parseDeadlockTxnHeader(line); ok { + currentTxnNum = num + currentSection = "" + expectingQueryTxn = 0 + ensureTxn(num) + continue + } + + if num, sectionType, ok := parseDeadlockTxnSection(line); ok { + currentTxnNum = num + currentSection = sectionType + expectingQueryTxn = 0 + ensureTxn(num) + continue + } + + if currentTxnNum != 0 { + if isDeadlockWaitNoTxn(line) { + currentSection = deadlockSectionWaiting + expectingQueryTxn = 0 + ensureTxn(currentTxnNum) + continue + } + if isDeadlockHoldsNoTxn(line) { + currentSection = deadlockSectionHolds + expectingQueryTxn = 0 + ensureTxn(currentTxnNum) + continue + } + } + + if num, ok := 
parseDeadlockVictim(line); ok { + victimTxnNum = num + continue + } + + if currentTxnNum == 0 { + continue + } + + txn := ensureTxn(currentTxnNum) + + if threadID, ok := parseDeadlockThreadID(line); ok { + txn.threadID = threadID + expectingQueryTxn = currentTxnNum + continue + } + + if expectingQueryTxn == currentTxnNum && txn.queryText == "" && isSQLStatementLine(line) { + txn.queryText = line + expectingQueryTxn = 0 + continue + } + + if expectingQueryTxn == 0 && txn.queryText == "" && isSQLStatementLine(line) { + txn.queryText = line + continue + } + + switch currentSection { + case deadlockSectionWaiting: + // WAITING must win even if HOLDS was seen first in the output. + txn.lockStatus = "WAITING" + if txn.waitResource == "" && isLockResourceLine(line) { + txn.waitResource = strmutil.TruncateText(line, maxQueryTextLength) + } + if mode, ok := parseDeadlockLockMode(line); ok { + // WAITING lock mode should override any mode captured from HOLDS. + txn.lockMode = mode + } + case deadlockSectionHolds: + if txn.lockStatus == "" { + txn.lockStatus = "GRANTED" + } + if txn.lockMode == "" { + if mode, ok := parseDeadlockLockMode(line); ok { + txn.lockMode = mode + } + } + } + } + + if err := scanner.Err(); err != nil { + result.parseErr = err + return result + } + + result.victimTxnNum = victimTxnNum + result.transactions = make([]*mysqlDeadlockTxn, 0, len(txnOrder)) + for _, num := range txnOrder { + txn := txnByNum[num] + if txn == nil { + continue + } + if txn.threadID == "" { + txn.threadID = fmt.Sprintf("txn-%d", num) + } + if txn.lockStatus == "" { + if num == victimTxnNum { + txn.lockStatus = "WAITING" + } else { + txn.lockStatus = "GRANTED" + } + } + result.transactions = append(result.transactions, txn) + } + + if len(result.transactions) == 0 { + result.parseErr = fmt.Errorf("deadlock section detected but no transactions could be parsed") + return result + } + + sort.Slice(result.transactions, func(i, j int) bool { + return result.transactions[i].txnNum < 
result.transactions[j].txnNum + }) + + return result +} + +func buildDeadlockRows(parseRes mysqlDeadlockParseResult, deadlockID string) [][]any { + rows := make([][]any, 0, len(parseRes.transactions)) + timestamp := parseRes.deadlockTime.UTC().Format(time.RFC3339Nano) + + for _, txn := range parseRes.transactions { + if txn == nil { + continue + } + + processID := strings.TrimSpace(txn.threadID) + if processID == "" { + processID = fmt.Sprintf("txn-%d", txn.txnNum) + } + + var spid any + if id, err := strconv.Atoi(processID); err == nil { + spid = id + } else { + spid = nil + } + + isVictim := "false" + if parseRes.victimTxnNum != 0 && txn.txnNum == parseRes.victimTxnNum { + isVictim = "true" + } + + queryText := strmutil.TruncateText(strings.TrimSpace(txn.queryText), maxQueryTextLength) + lockMode := strings.TrimSpace(txn.lockMode) + lockStatus := strings.TrimSpace(txn.lockStatus) + waitResource := strmutil.TruncateText(strings.TrimSpace(txn.waitResource), maxQueryTextLength) + database := extractDeadlockDatabase(waitResource, queryText) + + var databaseValue any + if database != "" { + databaseValue = database + } + + row := make([]any, deadlockColumnCount) + row[deadlockIdxRowID] = fmt.Sprintf("%s:%s", deadlockID, processID) + row[deadlockIdxDeadlockID] = deadlockID + row[deadlockIdxTimestamp] = timestamp + row[deadlockIdxProcessID] = processID + row[deadlockIdxSpid] = spid + row[deadlockIdxEcid] = nil + row[deadlockIdxIsVictim] = isVictim + row[deadlockIdxQueryText] = queryText + row[deadlockIdxLockMode] = lockMode + row[deadlockIdxLockStatus] = lockStatus + row[deadlockIdxWaitResource] = waitResource + row[deadlockIdxDatabase] = databaseValue + rows = append(rows, row) + } + + return rows +} + +func extractDeadlockSection(status string) (string, bool) { + idx := reDeadlockHeader.FindStringIndex(status) + if idx == nil { + return "", false + } + return status[idx[0]:], true +} + +func parseDeadlockTimestamp(section string) (time.Time, bool) { + match := 
reDeadlockTS.FindString(section) + if match == "" { + return time.Time{}, false + } + ts, err := time.ParseInLocation("2006-01-02 15:04:05", match, time.Local) + if err != nil { + return time.Time{}, false + } + return ts, true +} + +func parseDeadlockTxnHeader(line string) (int, bool) { + m := reDeadlockTxn.FindStringSubmatch(line) + if len(m) != 2 { + return 0, false + } + n, err := strconv.Atoi(m[1]) + if err != nil { + return 0, false + } + return n, true +} + +func parseDeadlockTxnSection(line string) (int, string, bool) { + if m := reDeadlockWait.FindStringSubmatch(line); len(m) == 2 { + n, err := strconv.Atoi(m[1]) + if err != nil { + return 0, "", false + } + return n, deadlockSectionWaiting, true + } + if m := reDeadlockHolds.FindStringSubmatch(line); len(m) == 2 { + n, err := strconv.Atoi(m[1]) + if err != nil { + return 0, "", false + } + return n, deadlockSectionHolds, true + } + return 0, "", false +} + +func isDeadlockWaitNoTxn(line string) bool { + return reDeadlockWaitNoTxn.MatchString(line) +} + +func isDeadlockHoldsNoTxn(line string) bool { + return reDeadlockHoldsNoTxn.MatchString(line) +} + +func parseDeadlockVictim(line string) (int, bool) { + m := reDeadlockVictim.FindStringSubmatch(line) + if len(m) != 2 { + return 0, false + } + n, err := strconv.Atoi(m[1]) + if err != nil { + return 0, false + } + return n, true +} + +func parseDeadlockThreadID(line string) (string, bool) { + m := reDeadlockThread.FindStringSubmatch(line) + if len(m) != 2 { + return "", false + } + return m[1], true +} + +func parseDeadlockLockMode(line string) (string, bool) { + m := reDeadlockMode.FindStringSubmatch(line) + if len(m) != 2 { + return "", false + } + return strings.ToUpper(m[1]), true +} + +func isLikelyQueryLine(line string) bool { + return isSQLStatementLine(line) +} + +func isSQLStatementLine(line string) bool { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + return false + } + upper := strings.ToUpper(trimmed) + if strings.HasPrefix(upper, 
"LOCK WAIT") { + return false + } + return reSQLStatement.MatchString(trimmed) +} + +func isLockResourceLine(line string) bool { + upper := strings.ToUpper(strings.TrimSpace(line)) + return strings.HasPrefix(upper, "RECORD LOCKS") || strings.HasPrefix(upper, "TABLE LOCK") +} + +func extractDeadlockDatabase(waitResource, queryText string) string { + if db := extractDatabaseFromLock(waitResource); db != "" { + return db + } + if db := extractDatabaseFromQuery(queryText); db != "" { + return db + } + return "" +} + +func extractDatabaseFromLock(line string) string { + m := reDeadlockTable.FindStringSubmatch(line) + if len(m) >= 2 { + return m[1] + } + return "" +} + +func extractDatabaseFromQuery(queryText string) string { + m := reQueryTableRef.FindStringSubmatch(queryText) + if len(m) >= 2 { + return m[1] + } + return "" +} + +func generateDeadlockID(t time.Time) string { + if t.IsZero() { + t = time.Now().UTC() + } + t = t.UTC() + micros := t.Nanosecond() / 1000 + return t.Format("20060102150405") + fmt.Sprintf("%06d", micros) +} + +func isMySQLPermissionError(err error) bool { + var mysqlErr *mysqlDriver.MySQLError + if errors.As(err, &mysqlErr) { + if mysqlErr.Number == 1045 || mysqlErr.Number == 1227 { + return true + } + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "access denied") || + strings.Contains(msg, "permission denied") || + strings.Contains(msg, "process privilege") +} diff --git a/src/go/plugin/go.d/collector/mysql/deadlock_info_test.go b/src/go/plugin/go.d/collector/mysql/deadlock_info_test.go new file mode 100644 index 00000000000000..673531b7f8b2f4 --- /dev/null +++ b/src/go/plugin/go.d/collector/mysql/deadlock_info_test.go @@ -0,0 +1,523 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package mysql + +import ( + "context" + "errors" + "testing" + "time" + + mysqlDriver "github.com/go-sql-driver/mysql" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" 
+) + +func TestConfig_GetDeadlockInfoFunctionEnabled(t *testing.T) { + tests := []struct { + name string + cfg Config + expected bool + }{ + { + name: "default nil pointer enables function", + cfg: Config{}, + expected: true, + }, + { + name: "explicit true enables function", + cfg: Config{ + DeadlockInfoFunctionEnabled: boolPtr(true), + }, + expected: true, + }, + { + name: "explicit false disables function", + cfg: Config{ + DeadlockInfoFunctionEnabled: boolPtr(false), + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, tt.cfg.GetDeadlockInfoFunctionEnabled()) + }) + } +} + +func TestConfig_GetErrorInfoFunctionEnabled(t *testing.T) { + tests := []struct { + name string + cfg Config + expected bool + }{ + { + name: "default nil pointer enables function", + cfg: Config{}, + expected: true, + }, + { + name: "explicit true enables function", + cfg: Config{ + ErrorInfoFunctionEnabled: boolPtr(true), + }, + expected: true, + }, + { + name: "explicit false disables function", + cfg: Config{ + ErrorInfoFunctionEnabled: boolPtr(false), + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, tt.cfg.GetErrorInfoFunctionEnabled()) + }) + } +} + +func TestParseInnoDBDeadlock_WithDeadlock(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 123456000, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatus, now) + + assert.True(t, res.found) + assert.NoError(t, res.parseErr) + assert.Len(t, res.transactions, 2) + assert.Equal(t, 2, res.victimTxnNum) + assert.Equal(t, now.UTC(), res.deadlockTime) + + assert.Equal(t, "10", res.transactions[0].threadID) + assert.Equal(t, "11", res.transactions[1].threadID) + assert.Equal(t, "WAITING", res.transactions[0].lockStatus) + assert.Equal(t, "GRANTED", res.transactions[1].lockStatus) +} + +func TestParseInnoDBDeadlock_MariaDBThreadID(t *testing.T) { + now := 
time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusMariaDB, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + require.Len(t, res.transactions, 2) + + txnByNum := make(map[int]*mysqlDeadlockTxn, len(res.transactions)) + for _, txn := range res.transactions { + txnByNum[txn.txnNum] = txn + } + + require.Contains(t, txnByNum, 1) + require.Contains(t, txnByNum, 2) + + assert.Equal(t, "55", txnByNum[1].threadID) + assert.Contains(t, txnByNum[1].queryText, "deadlock_a") + assert.NotEmpty(t, txnByNum[1].waitResource) +} + +func TestParseInnoDBDeadlock_SkipsLockWaitLine(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusLockWaitLine, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + require.Len(t, res.transactions, 1) + + txn := res.transactions[0] + require.NotNil(t, txn) + assert.Equal(t, "90", txn.threadID) + assert.Equal(t, "UPDATE deadlock_a SET value = value + 1 WHERE id = 1", txn.queryText) +} + +func TestParseInnoDBDeadlock_WaitingHeaderWithoutTxn(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusWaitNoTxn, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + require.Len(t, res.transactions, 1) + + txn := res.transactions[0] + require.NotNil(t, txn) + assert.Equal(t, "99", txn.threadID) + assert.Equal(t, "WAITING", txn.lockStatus) + assert.Equal(t, "X", txn.lockMode) + assert.NotEmpty(t, txn.waitResource) +} + +func TestParseInnoDBDeadlock_HoldsBeforeWaiting(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusHoldsFirst, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + require.Len(t, res.transactions, 2) + + var txn1 *mysqlDeadlockTxn + for _, txn := range res.transactions { + if 
txn.txnNum == 1 { + txn1 = txn + break + } + } + + require.NotNil(t, txn1, "transaction (1) should be present") + assert.Equal(t, "WAITING", txn1.lockStatus) + assert.Equal(t, "AUTO-INC", txn1.lockMode) +} + +func TestParseDeadlockLockMode_HyphenAndUnderscore(t *testing.T) { + mode, ok := parseDeadlockLockMode("lock mode AUTO-INC waiting") + require.True(t, ok) + assert.Equal(t, "AUTO-INC", mode) + + mode, ok = parseDeadlockLockMode("lock_mode AUTO_INC") + require.True(t, ok) + assert.Equal(t, "AUTO_INC", mode) +} + +func TestParseInnoDBDeadlock_WithTimestamp(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusWithTimestamp, now) + + assert.True(t, res.found) + assert.NoError(t, res.parseErr) + assert.Equal(t, "2026-01-25 12:34:56", res.deadlockTime.In(time.Local).Format("2006-01-02 15:04:05")) +} + +func TestParseInnoDBDeadlock_ThreeWay(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusThreeWay, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + require.Len(t, res.transactions, 3) + assert.Equal(t, 3, res.victimTxnNum) + + txnByNum := make(map[int]*mysqlDeadlockTxn, len(res.transactions)) + for _, txn := range res.transactions { + txnByNum[txn.txnNum] = txn + } + + require.Contains(t, txnByNum, 1) + require.Contains(t, txnByNum, 2) + require.Contains(t, txnByNum, 3) + + assert.Equal(t, "30", txnByNum[1].threadID) + assert.Equal(t, "WAITING", txnByNum[1].lockStatus) + + assert.Equal(t, "31", txnByNum[2].threadID) + assert.Equal(t, "GRANTED", txnByNum[2].lockStatus) + + assert.Equal(t, "32", txnByNum[3].threadID) + // Victim fallback should mark transaction (3) as WAITING even without WAITING/HOLDS sections. 
+ assert.Equal(t, "WAITING", txnByNum[3].lockStatus) +} + +func TestParseInnoDBDeadlock_WaitingWithoutLockMode(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusWaitingNoLockMode, now) + + require.True(t, res.found) + require.NoError(t, res.parseErr) + require.Len(t, res.transactions, 2) + + var txn1 *mysqlDeadlockTxn + for _, txn := range res.transactions { + if txn.txnNum == 1 { + txn1 = txn + break + } + } + + require.NotNil(t, txn1, "transaction (1) should be present") + assert.Equal(t, "WAITING", txn1.lockStatus) + assert.Empty(t, txn1.lockMode) + assert.NotEmpty(t, txn1.waitResource) +} + +func TestParseInnoDBDeadlock_NoDeadlock(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock("no deadlock here", now) + + assert.False(t, res.found) + assert.NoError(t, res.parseErr) + assert.Len(t, res.transactions, 0) +} + +func TestParseInnoDBDeadlock_MalformedSection(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 0, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatusMalformed, now) + + assert.True(t, res.found) + assert.Error(t, res.parseErr) + assert.Len(t, res.transactions, 0) +} + +func TestCollector_collectDeadlockInfo_ParseError(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + require.NoError(t, err) + defer func() { _ = db.Close() }() + + rows := sqlmock.NewRows([]string{"Type", "Name", "Status"}). 
+ AddRow("InnoDB", "Status", sampleDeadlockStatusMalformed) + mock.ExpectQuery(queryShowEngineInnoDBStatus).WillReturnRows(rows) + + collr := New() + collr.db = db + + resp := collr.collectDeadlockInfo(context.Background()) + require.NotNil(t, resp) + assert.Equal(t, deadlockParseErrorStatus, resp.Status) + assert.Contains(t, resp.Message, "could not be parsed") + assert.NoError(t, mock.ExpectationsWereMet()) +} + +func TestCollector_collectDeadlockInfo_QueryError(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + require.NoError(t, err) + defer func() { _ = db.Close() }() + + mock.ExpectQuery(queryShowEngineInnoDBStatus). + WillReturnError(errors.New("boom")) + + collr := New() + collr.db = db + + resp := collr.collectDeadlockInfo(context.Background()) + require.NotNil(t, resp) + assert.Equal(t, 500, resp.Status) + assert.Contains(t, resp.Message, "deadlock query failed") + assert.NoError(t, mock.ExpectationsWereMet()) +} + +func TestCollector_collectDeadlockInfo_Timeout(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + require.NoError(t, err) + defer func() { _ = db.Close() }() + + mock.ExpectQuery(queryShowEngineInnoDBStatus). 
+ WillReturnError(context.DeadlineExceeded) + + collr := New() + collr.db = db + + resp := collr.collectDeadlockInfo(context.Background()) + require.NotNil(t, resp) + assert.Equal(t, 504, resp.Status) + assert.Contains(t, resp.Message, "timed out") + assert.NoError(t, mock.ExpectationsWereMet()) +} + +func TestCollector_collectDeadlockInfo_PermissionDenied(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherEqual)) + require.NoError(t, err) + defer func() { _ = db.Close() }() + + collr := New() + collr.db = db + + permErr := &mysqlDriver.MySQLError{ + Number: 1227, + Message: "Access denied; you need (at least one of) the PROCESS privilege(s) for this operation", + } + mock.ExpectQuery(queryShowEngineInnoDBStatus).WillReturnError(permErr) + + resp := collr.collectDeadlockInfo(context.Background()) + require.NotNil(t, resp) + assert.Equal(t, 403, resp.Status) + assert.Contains(t, resp.Message, "PROCESS") + assert.NoError(t, mock.ExpectationsWereMet()) +} + +func TestCollector_collectDeadlockInfo_Disabled(t *testing.T) { + c := New() + c.Config.DeadlockInfoFunctionEnabled = boolPtr(false) + + resp := c.collectDeadlockInfo(context.Background()) + require.Equal(t, 503, resp.Status) + assert.Contains(t, resp.Message, "disabled") +} + +func TestBuildDeadlockRows(t *testing.T) { + now := time.Date(2026, time.January, 25, 12, 0, 0, 123456000, time.UTC) + res := parseInnoDBDeadlock(sampleDeadlockStatus, now) + deadlockID := generateDeadlockID(now) + rows := buildDeadlockRows(res, deadlockID) + + assert.Len(t, rows, 2) + assert.Equal(t, deadlockID, rows[0][deadlockIdxDeadlockID]) + assert.Equal(t, deadlockID, rows[1][deadlockIdxDeadlockID]) + assert.Equal(t, deadlockID+":10", rows[0][deadlockIdxRowID]) + assert.Equal(t, deadlockID+":11", rows[1][deadlockIdxRowID]) + + hasDatabase := false + for _, row := range rows { + if row[deadlockIdxDatabase] == "netdata" { + hasDatabase = true + break + } + } + assert.True(t, hasDatabase, 
"expected at least one row with database populated") +} + +func boolPtr(v bool) *bool { + return &v +} + +const sampleDeadlockStatus = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +*** (1) TRANSACTION: +TRANSACTION 100, ACTIVE 0 sec +MySQL thread id 10, OS thread handle 1, query id 100 localhost root updating +UPDATE Animals SET value = value + 1 WHERE name='Aardvark' +*** (1) WAITING FOR THIS LOCK TO BE GRANTED: +RECORD LOCKS space id 1 page no 2 n bits 72 index PRIMARY of table netdata.Birds trx id 100 lock mode X waiting +*** (2) TRANSACTION: +TRANSACTION 101, ACTIVE 0 sec +MySQL thread id 11, OS thread handle 2, query id 101 localhost root updating +UPDATE Birds SET value = value + 1 WHERE name='Buzzard' +*** (2) HOLDS THE LOCK(S): +RECORD LOCKS space id 1 page no 3 n bits 72 index PRIMARY of table netdata.Animals trx id 101 lock mode X +*** WE ROLL BACK TRANSACTION (2) +` + +const sampleDeadlockStatusMariaDB = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +*** (1) TRANSACTION: +TRANSACTION 500, ACTIVE 1 sec +MariaDB thread id 55, OS thread handle 1, query id 500 localhost root updating +mysql tables in use 1, locked 1 +UPDATE deadlock_a SET value = value + 1 WHERE id = 1 +*** (1) waiting for this lock to be granted: +RECORD LOCKS space id 5 page no 6 n bits 72 index PRIMARY of table netdata.deadlock_a trx id 500 lock_mode X locks rec but not gap waiting +*** (2) TRANSACTION: +TRANSACTION 501, ACTIVE 1 sec +MariaDB thread id 56, OS thread handle 1, query id 501 localhost root updating +UPDATE deadlock_b SET value = value + 1 WHERE id = 1 +*** (2) HOLDS THE LOCK(S): +RECORD LOCKS space id 5 page no 6 n bits 72 index PRIMARY of table netdata.deadlock_a trx id 501 lock_mode X locks rec but not gap +*** WE ROLL BACK TRANSACTION (2) +` + +const sampleDeadlockStatusLockWaitLine = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +*** (1) TRANSACTION: +TRANSACTION 900, 
ACTIVE 1 sec +MySQL thread id 90, OS thread handle 1, query id 900 localhost root updating +LOCK WAIT 4 lock struct(s), heap size 1128, 2 row lock(s), undo log entries 1 +UPDATE deadlock_a SET value = value + 1 WHERE id = 1 +*** (1) WAITING FOR THIS LOCK TO BE GRANTED: +RECORD LOCKS space id 3 page no 4 n bits 72 index PRIMARY of table netdata.deadlock_a trx id 900 lock_mode X locks rec but not gap waiting +*** WE ROLL BACK TRANSACTION (1) +` + +const sampleDeadlockStatusWaitNoTxn = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +*** (1) TRANSACTION: +TRANSACTION 990, ACTIVE 1 sec +MariaDB thread id 99, OS thread handle 1, query id 990 localhost root Updating +UPDATE deadlock_b SET value = value + 1 WHERE id = 1 +*** WAITING FOR THIS LOCK TO BE GRANTED: +RECORD LOCKS space id 7 page no 3 n bits 320 index PRIMARY of table netdata.deadlock_b trx id 42 lock_mode X locks rec but not gap waiting +*** WE ROLL BACK TRANSACTION (1) +` + +const sampleDeadlockStatusHoldsFirst = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +*** (1) TRANSACTION: +TRANSACTION 200, ACTIVE 0 sec +MySQL thread id 20, OS thread handle 1, query id 200 localhost root updating +UPDATE deadlock_b SET value = value + 1 WHERE id = 1 +*** (1) HOLDS THE LOCK(S): +RECORD LOCKS space id 3 page no 4 n bits 72 index PRIMARY of table netdata.deadlock_a trx id 200 lock mode S +*** (1) WAITING FOR THIS LOCK TO BE GRANTED: +RECORD LOCKS space id 4 page no 4 n bits 72 index PRIMARY of table netdata.deadlock_b trx id 200 lock mode AUTO-INC waiting +*** (2) TRANSACTION: +TRANSACTION 201, ACTIVE 0 sec +MySQL thread id 21, OS thread handle 2, query id 201 localhost root updating +UPDATE deadlock_a SET value = value + 1 WHERE id = 1 +*** (2) HOLDS THE LOCK(S): +RECORD LOCKS space id 4 page no 4 n bits 72 index PRIMARY of table netdata.deadlock_b trx id 201 lock mode X +*** WE ROLL BACK TRANSACTION (2) +` + +const sampleDeadlockStatusWithTimestamp = ` 
+------------------------ +LATEST DETECTED DEADLOCK +------------------------ +2026-01-25 12:34:56 +*** (1) TRANSACTION: +TRANSACTION 100, ACTIVE 0 sec +MySQL thread id 10, OS thread handle 1, query id 100 localhost root updating +UPDATE Animals SET value = value + 1 WHERE name='Aardvark' +*** (2) TRANSACTION: +TRANSACTION 101, ACTIVE 0 sec +MySQL thread id 11, OS thread handle 2, query id 101 localhost root updating +UPDATE Birds SET value = value + 1 WHERE name='Buzzard' +*** WE ROLL BACK TRANSACTION (2) +` + +const sampleDeadlockStatusThreeWay = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +*** (1) TRANSACTION: +TRANSACTION 300, ACTIVE 0 sec +MySQL thread id 30, OS thread handle 1, query id 300 localhost root updating +UPDATE alpha SET value = value + 1 WHERE id = 1 +*** (1) WAITING FOR THIS LOCK TO BE GRANTED: +RECORD LOCKS space id 7 page no 8 n bits 72 index PRIMARY of table netdata.beta trx id 300 lock mode X waiting +*** (2) TRANSACTION: +TRANSACTION 301, ACTIVE 0 sec +MySQL thread id 31, OS thread handle 2, query id 301 localhost root updating +UPDATE beta SET value = value + 1 WHERE id = 1 +*** (2) HOLDS THE LOCK(S): +RECORD LOCKS space id 7 page no 9 n bits 72 index PRIMARY of table netdata.gamma trx id 301 lock mode S +*** (3) TRANSACTION: +TRANSACTION 302, ACTIVE 0 sec +MySQL thread id 32, OS thread handle 3, query id 302 localhost root updating +UPDATE gamma SET value = value + 1 WHERE id = 1 +*** WE ROLL BACK TRANSACTION (3) +` + +const sampleDeadlockStatusWaitingNoLockMode = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +*** (1) TRANSACTION: +TRANSACTION 400, ACTIVE 0 sec +MySQL thread id 40, OS thread handle 1, query id 400 localhost root updating +UPDATE delta SET value = value + 1 WHERE id = 1 +*** (1) WAITING FOR THIS LOCK TO BE GRANTED: +RECORD LOCKS space id 10 page no 11 n bits 72 index PRIMARY of table netdata.epsilon trx id 400 waiting +*** (2) TRANSACTION: +TRANSACTION 
401, ACTIVE 0 sec +MySQL thread id 41, OS thread handle 2, query id 401 localhost root updating +UPDATE epsilon SET value = value + 1 WHERE id = 1 +*** (2) HOLDS THE LOCK(S): +RECORD LOCKS space id 10 page no 12 n bits 72 index PRIMARY of table netdata.delta trx id 401 lock mode X +*** WE ROLL BACK TRANSACTION (1) +` + +const sampleDeadlockStatusMalformed = ` +------------------------ +LATEST DETECTED DEADLOCK +------------------------ +THIS IS NOT A VALID DEADLOCK SECTION +` diff --git a/src/go/plugin/go.d/collector/mysql/error_info.go b/src/go/plugin/go.d/collector/mysql/error_info.go new file mode 100644 index 00000000000000..78c7732c5e71f5 --- /dev/null +++ b/src/go/plugin/go.d/collector/mysql/error_info.go @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package mysql + +import ( + "context" + "database/sql" + "fmt" + "strings" + + "github.com/netdata/netdata/go/plugins/pkg/funcapi" + "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" +) + +const ( + mysqlErrorAttrEnabled = "enabled" + mysqlErrorAttrNotEnabled = "not_enabled" + mysqlErrorAttrNotSupported = "not_supported" + mysqlErrorAttrNoData = "no_data" +) + +type mysqlErrorSource struct { + table string + fallbackTable string + columns map[string]bool + status string + reason string +} + +type mysqlErrorRow struct { + Digest string + Query string + Schema string + ErrorNumber *int64 + SQLState string + Message string +} + +func mysqlErrorAttributionColumns() []mysqlColumnMeta { + return []mysqlColumnMeta{ + { + uiKey: "errorAttribution", + displayName: "Error Attribution", + dataType: ftString, + visible: true, + transform: trNone, + sortDir: sortAsc, + summary: summaryCount, + filter: filterMulti, + }, + { + uiKey: "errorNumber", + displayName: "Error Number", + dataType: ftInteger, + visible: true, + transform: trNumber, + sortDir: sortDesc, + summary: summaryMax, + filter: filterRange, + }, + { + uiKey: "sqlState", + displayName: "SQL State", + dataType: ftString, + 
visible: false, + transform: trNone, + sortDir: sortAsc, + summary: summaryCount, + filter: filterMulti, + }, + { + uiKey: "errorMessage", + displayName: "Error Message", + dataType: ftString, + visible: true, + transform: trNone, + sortDir: sortAsc, + summary: summaryCount, + filter: filterMulti, + fullWidth: true, + }, + } +} + +func (c *Collector) collectMySQLErrorDetailsForDigests(ctx context.Context, digests []string) (string, map[string]mysqlErrorRow) { + source, err := c.detectMySQLErrorHistorySource(ctx) + if err != nil { + c.Debugf("error attribution: %v", err) + return mysqlErrorAttrNotEnabled, nil + } + if source.status != mysqlErrorAttrEnabled { + return source.status, nil + } + + rows, err := c.fetchMySQLErrorRows(ctx, source, digests, len(digests)) + if err != nil { + c.Debugf("error attribution query failed: %v", err) + return mysqlErrorAttrNotEnabled, nil + } + if len(rows) == 0 && source.fallbackTable != "" { + fallback, ferr := c.buildMySQLErrorSource(ctx, source.fallbackTable) + if ferr == nil && fallback.status == mysqlErrorAttrEnabled { + fallbackRows, ferr := c.fetchMySQLErrorRows(ctx, fallback, digests, len(digests)) + if ferr == nil && len(fallbackRows) > 0 { + rows = fallbackRows + } + } + } + + out := make(map[string]mysqlErrorRow, len(rows)) + for _, row := range rows { + if row.Digest == "" { + continue + } + if _, ok := out[row.Digest]; ok { + continue + } + out[row.Digest] = row + } + return mysqlErrorAttrEnabled, out +} + +func (c *Collector) errorInfoParams(context.Context) ([]funcapi.ParamConfig, error) { + if !c.Config.GetErrorInfoFunctionEnabled() { + return nil, fmt.Errorf("error-info function disabled in configuration") + } + return []funcapi.ParamConfig{}, nil +} + +func (c *Collector) collectErrorInfo(ctx context.Context) *module.FunctionResponse { + if !c.Config.GetErrorInfoFunctionEnabled() { + return &module.FunctionResponse{ + Status: 503, + Message: "error-info not enabled: function disabled in configuration. 
" + + "To enable, set error_info_function_enabled: true in the MySQL collector config.", + } + } + + available, err := c.checkPerformanceSchema(ctx) + if err != nil { + return &module.FunctionResponse{ + Status: 500, + Message: fmt.Sprintf("failed to check performance_schema availability: %v", err), + } + } + if !available { + return &module.FunctionResponse{Status: 503, Message: "performance_schema is not enabled"} + } + + source, err := c.detectMySQLErrorHistorySource(ctx) + if err != nil { + return &module.FunctionResponse{Status: 503, Message: fmt.Sprintf("error-info not enabled: %v", err)} + } + if source.status != mysqlErrorAttrEnabled { + msg := "error-info not enabled" + if source.reason != "" { + msg = fmt.Sprintf("%s: %s", msg, source.reason) + } + return &module.FunctionResponse{Status: 503, Message: msg} + } + + limit := c.TopQueriesLimit + if limit <= 0 { + limit = 500 + } + + rows, err := c.fetchMySQLErrorRows(ctx, source, nil, limit) + if err != nil { + return &module.FunctionResponse{Status: 500, Message: fmt.Sprintf("error-info query failed: %v", err)} + } + if len(rows) == 0 && source.fallbackTable != "" { + fallback, ferr := c.buildMySQLErrorSource(ctx, source.fallbackTable) + if ferr == nil && fallback.status == mysqlErrorAttrEnabled { + fallbackRows, ferr := c.fetchMySQLErrorRows(ctx, fallback, nil, limit) + if ferr == nil && len(fallbackRows) > 0 { + rows = fallbackRows + } + } + } + + data := make([][]any, 0, len(rows)) + for _, row := range rows { + var errNo any + if row.ErrorNumber != nil { + errNo = *row.ErrorNumber + } + data = append(data, []any{ + row.Digest, + row.Query, + row.Schema, + errNo, + row.SQLState, + row.Message, + }) + } + + return &module.FunctionResponse{ + Status: 200, + Help: "Recent SQL errors from performance_schema statement history tables", + Columns: buildMySQLErrorInfoColumns(), + Data: data, + DefaultSortColumn: "errorNumber", + } +} + +func buildMySQLErrorInfoColumns() map[string]any { + columns := 
map[string]any{ + "digest": funcapi.Column{ + Index: 0, + Name: "Digest", + Type: ftString, + Sortable: true, + Visible: false, + UniqueKey: true, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + "query": funcapi.Column{ + Index: 1, + Name: "Query", + Type: ftString, + Sortable: true, + Sticky: true, + FullWidth: true, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + "schema": funcapi.Column{ + Index: 2, + Name: "Schema", + Type: ftString, + Sortable: true, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + "errorNumber": funcapi.Column{ + Index: 3, + Name: "Error Number", + Type: ftInteger, + Sortable: true, + ValueOptions: funcapi.ValueOptions{Transform: trNumber}, + }.BuildColumn(), + "sqlState": funcapi.Column{ + Index: 4, + Name: "SQL State", + Type: ftString, + Sortable: true, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + "errorMessage": funcapi.Column{ + Index: 5, + Name: "Error Message", + Type: ftString, + Sortable: false, + FullWidth: true, + ValueOptions: funcapi.ValueOptions{Transform: trNone}, + }.BuildColumn(), + } + return columns +} + +func (c *Collector) detectMySQLErrorHistorySource(ctx context.Context) (mysqlErrorSource, error) { + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + rows, err := c.db.QueryContext(qctx, ` +SELECT NAME, ENABLED +FROM performance_schema.setup_consumers +WHERE NAME IN ('events_statements_history_long','events_statements_history','events_statements_current'); +`) + if err != nil { + return mysqlErrorSource{status: mysqlErrorAttrNotEnabled, reason: "unable to read performance_schema.setup_consumers"}, err + } + defer rows.Close() + + enabled := map[string]bool{} + present := map[string]bool{} + for rows.Next() { + var name, enabledVal string + if err := rows.Scan(&name, &enabledVal); err != nil { + return mysqlErrorSource{status: mysqlErrorAttrNotEnabled, reason: "unable 
to read performance_schema.setup_consumers"}, err + } + key := strings.ToLower(name) + enabled[key] = strings.EqualFold(enabledVal, "YES") + present[key] = true + } + if err := rows.Err(); err != nil { + return mysqlErrorSource{status: mysqlErrorAttrNotEnabled, reason: "unable to read performance_schema.setup_consumers"}, err + } + + table := "" + fallback := "" + switch { + case enabled["events_statements_history_long"]: + table = "events_statements_history_long" + if enabled["events_statements_history"] { + fallback = "events_statements_history" + } + case enabled["events_statements_history"]: + table = "events_statements_history" + default: + return mysqlErrorSource{status: mysqlErrorAttrNotEnabled, reason: "statement history consumers are disabled"}, nil + } + + source, err := c.buildMySQLErrorSource(ctx, table) + if err != nil { + return source, err + } + if source.status != mysqlErrorAttrEnabled { + return source, nil + } + source.fallbackTable = fallback + return source, nil +} + +func (c *Collector) buildMySQLErrorSource(ctx context.Context, table string) (mysqlErrorSource, error) { + cols, err := c.fetchMySQLTableColumns(ctx, table) + if err != nil { + return mysqlErrorSource{status: mysqlErrorAttrNotSupported, reason: "unable to read history table columns"}, err + } + + required := []string{"DIGEST", "MYSQL_ERRNO", "MESSAGE_TEXT", "RETURNED_SQLSTATE"} + for _, key := range required { + if !cols[key] { + return mysqlErrorSource{status: mysqlErrorAttrNotSupported, reason: "required history columns are missing"}, nil + } + } + + return mysqlErrorSource{ + table: table, + columns: cols, + status: mysqlErrorAttrEnabled, + }, nil +} + +func (c *Collector) fetchMySQLTableColumns(ctx context.Context, table string) (map[string]bool, error) { + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + rows, err := c.db.QueryContext(qctx, ` +SELECT COLUMN_NAME +FROM information_schema.COLUMNS +WHERE TABLE_SCHEMA = 'performance_schema' + AND 
TABLE_NAME = ?;`, table) + if err != nil { + return nil, err + } + defer rows.Close() + + cols := make(map[string]bool) + for rows.Next() { + var name string + if err := rows.Scan(&name); err != nil { + return nil, err + } + cols[strings.ToUpper(name)] = true + } + if err := rows.Err(); err != nil { + return nil, err + } + return cols, nil +} + +func (c *Collector) fetchMySQLErrorRows(ctx context.Context, source mysqlErrorSource, digests []string, limit int) ([]mysqlErrorRow, error) { + if source.status != mysqlErrorAttrEnabled { + return nil, fmt.Errorf("error history not enabled") + } + + selectCols := []string{"DIGEST", "MYSQL_ERRNO", "RETURNED_SQLSTATE", "MESSAGE_TEXT"} + if source.columns["DIGEST_TEXT"] { + selectCols = append(selectCols, "DIGEST_TEXT") + } + if source.columns["SCHEMA_NAME"] { + selectCols = append(selectCols, "SCHEMA_NAME") + } + if source.columns["SQL_TEXT"] { + selectCols = append(selectCols, "SQL_TEXT") + } + + orderBy := "" + switch { + case source.columns["EVENT_ID"]: + orderBy = "EVENT_ID DESC" + case source.columns["TIMER_END"]: + orderBy = "TIMER_END DESC" + } + + var args []any + var filters []string + filters = append(filters, "MYSQL_ERRNO <> 0") + if len(digests) > 0 { + placeholders := make([]string, 0, len(digests)) + for _, digest := range digests { + placeholders = append(placeholders, "?") + args = append(args, digest) + } + filters = append(filters, fmt.Sprintf("DIGEST IN (%s)", strings.Join(placeholders, ","))) + } + + query := fmt.Sprintf("SELECT %s FROM performance_schema.%s WHERE %s", + strings.Join(selectCols, ", "), + source.table, + strings.Join(filters, " AND "), + ) + if orderBy != "" { + query = fmt.Sprintf("%s ORDER BY %s", query, orderBy) + } + if limit > 0 { + query = fmt.Sprintf("%s LIMIT %d", query, limit) + } + + qctx, cancel := context.WithTimeout(ctx, c.Timeout.Duration()) + defer cancel() + + rows, err := c.db.QueryContext(qctx, query, args...) 
+ if err != nil { + return nil, err + } + defer rows.Close() + + var results []mysqlErrorRow + seen := make(map[string]bool) + + for rows.Next() { + var ( + digest sql.NullString + errno sql.NullInt64 + sqlState sql.NullString + message sql.NullString + digestText sql.NullString + schemaName sql.NullString + sqlText sql.NullString + scanTargets []any + ) + + scanTargets = append(scanTargets, &digest, &errno, &sqlState, &message) + if source.columns["DIGEST_TEXT"] { + scanTargets = append(scanTargets, &digestText) + } + if source.columns["SCHEMA_NAME"] { + scanTargets = append(scanTargets, &schemaName) + } + if source.columns["SQL_TEXT"] { + scanTargets = append(scanTargets, &sqlText) + } + + if err := rows.Scan(scanTargets...); err != nil { + return nil, err + } + + if !digest.Valid || strings.TrimSpace(digest.String) == "" { + continue + } + if seen[digest.String] { + continue + } + seen[digest.String] = true + + queryText := "" + switch { + case digestText.Valid && strings.TrimSpace(digestText.String) != "": + queryText = digestText.String + case sqlText.Valid && strings.TrimSpace(sqlText.String) != "": + queryText = sqlText.String + } + + var errNoPtr *int64 + if errno.Valid { + val := errno.Int64 + errNoPtr = &val + } + + row := mysqlErrorRow{ + Digest: digest.String, + Query: queryText, + Schema: schemaName.String, + ErrorNumber: errNoPtr, + SQLState: sqlState.String, + Message: message.String, + } + results = append(results, row) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + return results, nil +} + +func nullableString(value string) any { + if strings.TrimSpace(value) == "" { + return nil + } + return value +} diff --git a/src/go/plugin/go.d/collector/mysql/functions.go b/src/go/plugin/go.d/collector/mysql/functions.go index 819915c4e1b57d..08d5da915b886e 100644 --- a/src/go/plugin/go.d/collector/mysql/functions.go +++ b/src/go/plugin/go.d/collector/mysql/functions.go @@ -203,6 +203,22 @@ func mysqlMethods() []module.MethodConfig { }, 
}, }, + { + UpdateEvery: 10, + ID: "deadlock-info", + Name: "Deadlock Info", + Help: deadlockInfoHelp, + RequireCloud: true, + RequiredParams: []funcapi.ParamConfig{}, + }, + { + UpdateEvery: 10, + ID: "error-info", + Name: "Error Info", + Help: "Recent SQL errors from performance_schema statement history tables", + RequireCloud: true, + RequiredParams: []funcapi.ParamConfig{}, + }, } } @@ -217,6 +233,10 @@ func mysqlMethodParams(ctx context.Context, job *module.Job, method string) ([]f switch method { case "top-queries": return collector.topQueriesParams(ctx) + case "deadlock-info": + return collector.deadlockInfoParams(ctx) + case "error-info": + return collector.errorInfoParams(ctx) default: return nil, fmt.Errorf("unknown method: %s", method) } @@ -240,6 +260,10 @@ func mysqlHandleMethod(ctx context.Context, job *module.Job, method string, para switch method { case "top-queries": return collector.collectTopQueries(ctx, params.Column(paramSort)) + case "deadlock-info": + return collector.collectDeadlockInfo(ctx) + case "error-info": + return collector.collectErrorInfo(ctx) default: return &module.FunctionResponse{Status: 404, Message: fmt.Sprintf("unknown method: %s", method)} } @@ -581,6 +605,70 @@ func (c *Collector) collectTopQueries(ctx context.Context, sortColumn string) *m return &module.FunctionResponse{Status: 500, Message: err.Error()} } + errorCols := mysqlErrorAttributionColumns() + errorStatus := mysqlErrorAttrNotSupported + errorDetails := map[string]mysqlErrorRow{} + digestIdx := -1 + for i, col := range cols { + if col.uiKey == "digest" { + digestIdx = i + break + } + } + if digestIdx >= 0 { + digests := make([]string, 0, len(data)) + seen := make(map[string]bool) + for _, row := range data { + if digestIdx >= len(row) { + continue + } + digest, ok := row[digestIdx].(string) + if !ok || digest == "" { + continue + } + if seen[digest] { + continue + } + seen[digest] = true + digests = append(digests, digest) + } + + if len(digests) > 0 { + 
errorStatus, errorDetails = c.collectMySQLErrorDetailsForDigests(ctx, digests) + } else { + errorStatus = mysqlErrorAttrNoData + } + } + + if len(errorCols) > 0 { + for i := range data { + status := errorStatus + var errRow mysqlErrorRow + var errNo any + if digestIdx >= 0 && digestIdx < len(data) { + if digest, ok := data[i][digestIdx].(string); ok && digest != "" { + if row, ok := errorDetails[digest]; ok { + status = mysqlErrorAttrEnabled + errRow = row + if errRow.ErrorNumber != nil { + errNo = *errRow.ErrorNumber + } + } else if status == mysqlErrorAttrEnabled { + status = mysqlErrorAttrNoData + } + } + } + + data[i] = append(data[i], + status, + errNo, + nullableString(errRow.SQLState), + nullableString(errRow.Message), + ) + } + cols = append(cols, errorCols...) + } + // Build dynamic sort options from available columns (only those actually detected) sortParam, sortOptions := c.topQueriesSortParam(cols) diff --git a/src/go/plugin/go.d/collector/mysql/functions_test.go b/src/go/plugin/go.d/collector/mysql/functions_test.go index d1f95b70d55f91..485ddf904bff2f 100644 --- a/src/go/plugin/go.d/collector/mysql/functions_test.go +++ b/src/go/plugin/go.d/collector/mysql/functions_test.go @@ -13,16 +13,43 @@ func TestMysqlMethods(t *testing.T) { methods := mysqlMethods() require := assert.New(t) - require.Len(methods, 1) - require.Equal("top-queries", methods[0].ID) - require.Equal("Top Queries", methods[0].Name) - require.NotEmpty(methods[0].RequiredParams) + require.Len(methods, 3) + + topIdx := -1 + deadlockIdx := -1 + errorIdx := -1 + for i := range methods { + switch methods[i].ID { + case "top-queries": + topIdx = i + case "deadlock-info": + deadlockIdx = i + case "error-info": + errorIdx = i + } + } + + require.NotEqual(-1, topIdx, "expected top-queries method") + require.NotEqual(-1, deadlockIdx, "expected deadlock-info method") + require.NotEqual(-1, errorIdx, "expected error-info method") + + topMethod := methods[topIdx] + require.Equal("Top Queries", 
topMethod.Name) + require.NotEmpty(topMethod.RequiredParams) + + deadlockMethod := methods[deadlockIdx] + require.Equal("Deadlock Info", deadlockMethod.Name) + require.Empty(deadlockMethod.RequiredParams) + + errorMethod := methods[errorIdx] + require.Equal("Error Info", errorMethod.Name) + require.Empty(errorMethod.RequiredParams) // Verify at least one default sort option exists var sortParam *funcapi.ParamConfig - for i := range methods[0].RequiredParams { - if methods[0].RequiredParams[i].ID == "__sort" { - sortParam = &methods[0].RequiredParams[i] + for i := range topMethod.RequiredParams { + if topMethod.RequiredParams[i].ID == "__sort" { + sortParam = &topMethod.RequiredParams[i] break } } @@ -233,21 +260,29 @@ func TestCollector_buildMySQLDynamicColumns(t *testing.T) { func TestMysqlMethods_SortOptionsHaveLabels(t *testing.T) { methods := mysqlMethods() - for _, method := range methods { - var sortParam *funcapi.ParamConfig - for i := range method.RequiredParams { - if method.RequiredParams[i].ID == "__sort" { - sortParam = &method.RequiredParams[i] - break - } + topIdx := -1 + for i := range methods { + if methods[i].ID == "top-queries" { + topIdx = i + break } - assert.NotNil(t, sortParam) - for _, opt := range sortParam.Options { - assert.NotEmpty(t, opt.ID, "sort option must have ID") - assert.NotEmpty(t, opt.Name, "sort option %s must have Name", opt.ID) - assert.Contains(t, opt.Name, "Top queries by", "label should have standard prefix") + } + assert.NotEqual(t, -1, topIdx, "expected top-queries method") + + method := methods[topIdx] + var sortParam *funcapi.ParamConfig + for i := range method.RequiredParams { + if method.RequiredParams[i].ID == "__sort" { + sortParam = &method.RequiredParams[i] + break } } + assert.NotNil(t, sortParam) + for _, opt := range sortParam.Options { + assert.NotEmpty(t, opt.ID, "sort option must have ID") + assert.NotEmpty(t, opt.Name, "sort option %s must have Name", opt.ID) + assert.Contains(t, opt.Name, "Top queries 
by", "label should have standard prefix") + } } // TestSortColumnValidation_SQLInjection verifies that SQL injection attempts diff --git a/src/go/plugin/go.d/collector/mysql/integrations/mariadb.md b/src/go/plugin/go.d/collector/mysql/integrations/mariadb.md index 73ad31c12be19e..5dbe756f91471e 100644 --- a/src/go/plugin/go.d/collector/mysql/integrations/mariadb.md +++ b/src/go/plugin/go.d/collector/mysql/integrations/mariadb.md @@ -306,6 +306,10 @@ Aggregated statement statistics from Performance Schema, grouped by query digest | Lock Time | duration | milliseconds | | Total time spent waiting for table locks across all executions. High lock time may indicate contention from concurrent transactions. | | Errors | integer | | | Total number of times this query pattern resulted in an error. Non-zero values require investigation into the underlying issue. | | Warnings | integer | | | Total number of times this query pattern generated warnings. Warnings may indicate data type conversions, NULL handling issues, or other non-critical problems. | +| Error Attribution | string | | | Status of error detail attribution for this query. Values: enabled, no_data, not_enabled, not_supported. | +| Error Number | integer | | | Most recent error number observed for this query digest (when error attribution is enabled). | +| SQL State | string | | hidden | SQLSTATE code for the most recent error (when error attribution is enabled). | +| Error Message | string | | | Most recent error message for this query digest (when error attribution is enabled). | | Rows Affected | integer | | | Total number of rows modified by INSERT, UPDATE, DELETE, or REPLACE statements. Useful for tracking write workloads. | | Rows Sent | integer | | | Total number of rows returned to the client by SELECT statements. High values may indicate result sets that are too large. | | Rows Examined | integer | | | Total number of rows read during query execution. 
A high ratio of rows examined to rows sent suggests missing or inefficient indexes. | @@ -335,6 +339,78 @@ Aggregated statement statistics from Performance Schema, grouped by query digest | Max Total Memory | integer | | | Maximum total memory used by this query pattern including both controlled and uncontrolled allocations. Available in MySQL 8.0.31+. | +### Deadlock Info + +Retrieves the latest detected InnoDB deadlock from `SHOW ENGINE INNODB STATUS`. + +### Error Info + +Retrieves recent SQL errors from Performance Schema statement history tables. + +- Requires Performance Schema statement history consumers to be enabled (`events_statements_current` plus `events_statements_history_long` preferred, or `events_statements_history`). +- Returns HTTP 503 with `errorMessage: "not enabled"` when the history consumer is disabled. +- Error messages and query text may include unmasked literals (PII); restrict dashboard access. + +The output is parsed to attribute the deadlock to participating transactions and their query text, lock mode, lock status, and wait resource. + +Use cases: +- Identify which query was chosen as the deadlock victim +- Inspect the waiting lock resource and lock mode +- Correlate deadlocks with application changes or deployments + +Query text and wait resource strings are truncated at 4096 characters for display purposes. + + +| Aspect | Description | +|:-------|:------------| +| Name | `Mysql:deadlock-info` | +| Require Cloud | yes | +| Performance | Executes `SHOW ENGINE INNODB STATUS` on demand:
• Not part of regular collection
• Query cost depends on server load and the size of the InnoDB status output | +| Security | Query text and wait resource strings may include unmasked literal values including sensitive data (PII/secrets):
• SQL literals such as emails, IDs, or tokens
• Schema and table names that may be sensitive in some environments
• Restrict dashboard access to authorized personnel only | +| Availability | Available when:
• The collector has successfully connected to MySQL
• `deadlock_info_function_enabled` is true
• The account can run `SHOW ENGINE INNODB STATUS` (PROCESS privilege)
• Returns HTTP 200 with empty data when no deadlock is found
• Returns HTTP 403 when PROCESS privilege is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 504 if the query times out
• Returns HTTP 561 when the deadlock section cannot be parsed
• Returns HTTP 503 if the collector is still initializing or the function is disabled | + +#### Prerequisites + +1. Ensure the account has the required privilege: + ```sql + GRANT PROCESS ON *.* TO 'netdata'@'localhost'; + FLUSH PRIVILEGES; + ``` +2. Enable the function in Netdata collector config: + ```yaml + jobs: + - name: local + dsn: "mysql://user:pass@tcp(127.0.0.1:3306)/" + deadlock_info_function_enabled: true + ``` +3. Verify the deadlock source is accessible: + ```sql + SHOW ENGINE INNODB STATUS\G + ``` + +#### Parameters + +This function has no parameters. + +#### Returns + +Parsed deadlock participants from the latest detected deadlock. Each row represents one transaction involved in the deadlock. + +| Column | Type | Unit | Visibility | Description | +|:-------|:-----|:-----|:-----------|:------------| +| Row ID | string | | hidden | Unique row identifier composed of deadlock ID and process ID. | +| Deadlock ID | string | | | Identifier for the deadlock event, used to group participating transactions. | +| Timestamp | timestamp | | | Timestamp of the deadlock event. Parsed from the deadlock section when available; otherwise the function execution time. | +| Process ID | string | | | MySQL thread id of the transaction involved in the deadlock. | +| Connection ID | integer | | | Numeric connection identifier when the process id is numeric. | +| ECID | integer | | | Execution context id (engine-specific). This is typically null for MySQL and reserved for cross-engine consistency. | +| Victim | string | | | "true" when the transaction was chosen as the deadlock victim and rolled back; otherwise "false". | +| Query | string | | | SQL query text for the transaction involved in the deadlock. Truncated to 4096 characters. | +| Lock Mode | string | | | Lock mode reported for the waiting lock (for example X or S). | +| Lock Status | string | | | Lock status for the transaction. WAITING indicates the transaction was waiting on a lock. 
| +| Wait Resource | string | | | Lock resource line from InnoDB status showing what the transaction was waiting on. | +| Database | string | | | Database name when it can be inferred. This may be empty or null depending on the deadlock output. | + ## Alerts @@ -614,5 +690,3 @@ If your Netdata runs in a Docker container named "netdata" (replace if different ```bash docker logs netdata 2>&1 | grep mysql ``` - - diff --git a/src/go/plugin/go.d/collector/mysql/integrations/mysql.md b/src/go/plugin/go.d/collector/mysql/integrations/mysql.md index dc3a845bed3744..e70795ad9fe02c 100644 --- a/src/go/plugin/go.d/collector/mysql/integrations/mysql.md +++ b/src/go/plugin/go.d/collector/mysql/integrations/mysql.md @@ -306,6 +306,10 @@ Aggregated statement statistics from Performance Schema, grouped by query digest | Lock Time | duration | milliseconds | | Total time spent waiting for table locks across all executions. High lock time may indicate contention from concurrent transactions. | | Errors | integer | | | Total number of times this query pattern resulted in an error. Non-zero values require investigation into the underlying issue. | | Warnings | integer | | | Total number of times this query pattern generated warnings. Warnings may indicate data type conversions, NULL handling issues, or other non-critical problems. | +| Error Attribution | string | | | Status of error detail attribution for this query. Values: enabled, no_data, not_enabled, not_supported. | +| Error Number | integer | | | Most recent error number observed for this query digest (when error attribution is enabled). | +| SQL State | string | | hidden | SQLSTATE code for the most recent error (when error attribution is enabled). | +| Error Message | string | | | Most recent error message for this query digest (when error attribution is enabled). | | Rows Affected | integer | | | Total number of rows modified by INSERT, UPDATE, DELETE, or REPLACE statements. Useful for tracking write workloads. 
| | Rows Sent | integer | | | Total number of rows returned to the client by SELECT statements. High values may indicate result sets that are too large. | | Rows Examined | integer | | | Total number of rows read during query execution. A high ratio of rows examined to rows sent suggests missing or inefficient indexes. | @@ -335,6 +339,78 @@ Aggregated statement statistics from Performance Schema, grouped by query digest | Max Total Memory | integer | | | Maximum total memory used by this query pattern including both controlled and uncontrolled allocations. Available in MySQL 8.0.31+. | +### Deadlock Info + +Retrieves the latest detected InnoDB deadlock from `SHOW ENGINE INNODB STATUS`. + +### Error Info + +Retrieves recent SQL errors from Performance Schema statement history tables. + +- Requires Performance Schema statement history consumers to be enabled (`events_statements_current` plus `events_statements_history_long` preferred, or `events_statements_history`). +- Returns HTTP 503 with `errorMessage: "not enabled"` when the history consumer is disabled. +- Error messages and query text may include unmasked literals (PII); restrict dashboard access. + +The output is parsed to attribute the deadlock to participating transactions and their query text, lock mode, lock status, and wait resource. + +Use cases: +- Identify which query was chosen as the deadlock victim +- Inspect the waiting lock resource and lock mode +- Correlate deadlocks with application changes or deployments + +Query text and wait resource strings are truncated at 4096 characters for display purposes. + + +| Aspect | Description | +|:-------|:------------| +| Name | `Mysql:deadlock-info` | +| Require Cloud | yes | +| Performance | Executes `SHOW ENGINE INNODB STATUS` on demand:
• Not part of regular collection
• Query cost depends on server load and the size of the InnoDB status output | +| Security | Query text and wait resource strings may include unmasked literal values including sensitive data (PII/secrets):
• SQL literals such as emails, IDs, or tokens
• Schema and table names that may be sensitive in some environments
• Restrict dashboard access to authorized personnel only | +| Availability | Available when:
• The collector has successfully connected to MySQL
• `deadlock_info_function_enabled` is true
• The account can run `SHOW ENGINE INNODB STATUS` (PROCESS privilege)
• Returns HTTP 200 with empty data when no deadlock is found
• Returns HTTP 403 when PROCESS privilege is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 504 if the query times out
• Returns HTTP 561 when the deadlock section cannot be parsed
• Returns HTTP 503 if the collector is still initializing or the function is disabled | + +#### Prerequisites + +1. Ensure the account has the required privilege: + ```sql + GRANT PROCESS ON *.* TO 'netdata'@'localhost'; + FLUSH PRIVILEGES; + ``` +2. Enable the function in Netdata collector config: + ```yaml + jobs: + - name: local + dsn: "mysql://user:pass@tcp(127.0.0.1:3306)/" + deadlock_info_function_enabled: true + ``` +3. Verify the deadlock source is accessible: + ```sql + SHOW ENGINE INNODB STATUS\G + ``` + +#### Parameters + +This function has no parameters. + +#### Returns + +Parsed deadlock participants from the latest detected deadlock. Each row represents one transaction involved in the deadlock. + +| Column | Type | Unit | Visibility | Description | +|:-------|:-----|:-----|:-----------|:------------| +| Row ID | string | | hidden | Unique row identifier composed of deadlock ID and process ID. | +| Deadlock ID | string | | | Identifier for the deadlock event, used to group participating transactions. | +| Timestamp | timestamp | | | Timestamp of the deadlock event. Parsed from the deadlock section when available; otherwise the function execution time. | +| Process ID | string | | | MySQL thread id of the transaction involved in the deadlock. | +| Connection ID | integer | | | Numeric connection identifier when the process id is numeric. | +| ECID | integer | | | Execution context id (engine-specific). This is typically null for MySQL and reserved for cross-engine consistency. | +| Victim | string | | | "true" when the transaction was chosen as the deadlock victim and rolled back; otherwise "false". | +| Query | string | | | SQL query text for the transaction involved in the deadlock. Truncated to 4096 characters. | +| Lock Mode | string | | | Lock mode reported for the waiting lock (for example X or S). | +| Lock Status | string | | | Lock status for the transaction. WAITING indicates the transaction was waiting on a lock. 
| +| Wait Resource | string | | | Lock resource line from InnoDB status showing what the transaction was waiting on. | +| Database | string | | | Database name when it can be inferred. This may be empty or null depending on the deadlock output. | + ## Alerts @@ -614,5 +690,3 @@ If your Netdata runs in a Docker container named "netdata" (replace if different ```bash docker logs netdata 2>&1 | grep mysql ``` - - diff --git a/src/go/plugin/go.d/collector/mysql/integrations/percona_mysql.md b/src/go/plugin/go.d/collector/mysql/integrations/percona_mysql.md index 7139bc2130275e..30fedfedc607b9 100644 --- a/src/go/plugin/go.d/collector/mysql/integrations/percona_mysql.md +++ b/src/go/plugin/go.d/collector/mysql/integrations/percona_mysql.md @@ -306,6 +306,10 @@ Aggregated statement statistics from Performance Schema, grouped by query digest | Lock Time | duration | milliseconds | | Total time spent waiting for table locks across all executions. High lock time may indicate contention from concurrent transactions. | | Errors | integer | | | Total number of times this query pattern resulted in an error. Non-zero values require investigation into the underlying issue. | | Warnings | integer | | | Total number of times this query pattern generated warnings. Warnings may indicate data type conversions, NULL handling issues, or other non-critical problems. | +| Error Attribution | string | | | Status of error detail attribution for this query. Values: enabled, no_data, not_enabled, not_supported. | +| Error Number | integer | | | Most recent error number observed for this query digest (when error attribution is enabled). | +| SQL State | string | | hidden | SQLSTATE code for the most recent error (when error attribution is enabled). | +| Error Message | string | | | Most recent error message for this query digest (when error attribution is enabled). | | Rows Affected | integer | | | Total number of rows modified by INSERT, UPDATE, DELETE, or REPLACE statements. 
Useful for tracking write workloads. | | Rows Sent | integer | | | Total number of rows returned to the client by SELECT statements. High values may indicate result sets that are too large. | | Rows Examined | integer | | | Total number of rows read during query execution. A high ratio of rows examined to rows sent suggests missing or inefficient indexes. | @@ -335,6 +339,78 @@ Aggregated statement statistics from Performance Schema, grouped by query digest | Max Total Memory | integer | | | Maximum total memory used by this query pattern including both controlled and uncontrolled allocations. Available in MySQL 8.0.31+. | +### Deadlock Info + +Retrieves the latest detected InnoDB deadlock from `SHOW ENGINE INNODB STATUS`. + +### Error Info + +Retrieves recent SQL errors from Performance Schema statement history tables. + +- Requires Performance Schema statement history consumers to be enabled (`events_statements_current` plus `events_statements_history_long` preferred, or `events_statements_history`). +- Returns HTTP 503 with `errorMessage: "not enabled"` when the history consumer is disabled. +- Error messages and query text may include unmasked literals (PII); restrict dashboard access. + +The output is parsed to attribute the deadlock to participating transactions and their query text, lock mode, lock status, and wait resource. + +Use cases: +- Identify which query was chosen as the deadlock victim +- Inspect the waiting lock resource and lock mode +- Correlate deadlocks with application changes or deployments + +Query text and wait resource strings are truncated at 4096 characters for display purposes. + + +| Aspect | Description | +|:-------|:------------| +| Name | `Mysql:deadlock-info` | +| Require Cloud | yes | +| Performance | Executes `SHOW ENGINE INNODB STATUS` on demand:
• Not part of regular collection
• Query cost depends on server load and the size of the InnoDB status output | +| Security | Query text and wait resource strings may include unmasked literal values including sensitive data (PII/secrets):
• SQL literals such as emails, IDs, or tokens
• Schema and table names that may be sensitive in some environments
• Restrict dashboard access to authorized personnel only | +| Availability | Available when:
• The collector has successfully connected to MySQL
• `deadlock_info_function_enabled` is true
• The account can run `SHOW ENGINE INNODB STATUS` (PROCESS privilege)
• Returns HTTP 200 with empty data when no deadlock is found
• Returns HTTP 403 when PROCESS privilege is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 504 if the query times out
• Returns HTTP 561 when the deadlock section cannot be parsed
• Returns HTTP 503 if the collector is still initializing or the function is disabled | + +#### Prerequisites + +1. Ensure the account has the required privilege: + ```sql + GRANT PROCESS ON *.* TO 'netdata'@'localhost'; + FLUSH PRIVILEGES; + ``` +2. Enable the function in Netdata collector config: + ```yaml + jobs: + - name: local + dsn: "mysql://user:pass@tcp(127.0.0.1:3306)/" + deadlock_info_function_enabled: true + ``` +3. Verify the deadlock source is accessible: + ```sql + SHOW ENGINE INNODB STATUS\G + ``` + +#### Parameters + +This function has no parameters. + +#### Returns + +Parsed deadlock participants from the latest detected deadlock. Each row represents one transaction involved in the deadlock. + +| Column | Type | Unit | Visibility | Description | +|:-------|:-----|:-----|:-----------|:------------| +| Row ID | string | | hidden | Unique row identifier composed of deadlock ID and process ID. | +| Deadlock ID | string | | | Identifier for the deadlock event, used to group participating transactions. | +| Timestamp | timestamp | | | Timestamp of the deadlock event. Parsed from the deadlock section when available; otherwise the function execution time. | +| Process ID | string | | | MySQL thread id of the transaction involved in the deadlock. | +| Connection ID | integer | | | Numeric connection identifier when the process id is numeric. | +| ECID | integer | | | Execution context id (engine-specific). This is typically null for MySQL and reserved for cross-engine consistency. | +| Victim | string | | | "true" when the transaction was chosen as the deadlock victim and rolled back; otherwise "false". | +| Query | string | | | SQL query text for the transaction involved in the deadlock. Truncated to 4096 characters. | +| Lock Mode | string | | | Lock mode reported for the waiting lock (for example X or S). | +| Lock Status | string | | | Lock status for the transaction. WAITING indicates the transaction was waiting on a lock. 
| +| Wait Resource | string | | | Lock resource line from InnoDB status showing what the transaction was waiting on. | +| Database | string | | | Database name when it can be inferred. This may be empty or null depending on the deadlock output. | + ## Alerts @@ -614,5 +690,3 @@ If your Netdata runs in a Docker container named "netdata" (replace if different ```bash docker logs netdata 2>&1 | grep mysql ``` - - diff --git a/src/go/plugin/go.d/collector/mysql/metadata.yaml b/src/go/plugin/go.d/collector/mysql/metadata.yaml index 9c72d0ca9197ca..9a0e7bb56d133c 100644 --- a/src/go/plugin/go.d/collector/mysql/metadata.yaml +++ b/src/go/plugin/go.d/collector/mysql/metadata.yaml @@ -304,6 +304,23 @@ modules: type: integer unit: "" description: "Total number of times this query pattern generated warnings. Warnings may indicate data type conversions, NULL handling issues, or other non-critical problems." + - name: Error Attribution + type: string + unit: "" + description: "Status of error detail attribution for this query. Values: enabled (error details available), no_data (no recent error for this digest), not_enabled (statement history consumers disabled, including events_statements_current), not_supported (required columns unavailable)." + - name: Error Number + type: integer + unit: "" + description: "Most recent error number observed for this query digest (when error attribution is enabled)." + - name: SQL State + type: string + unit: "" + visibility: hidden + description: "SQLSTATE code for the most recent error (when error attribution is enabled)." + - name: Error Message + type: string + unit: "" + description: "Most recent error message for this query digest (when error attribution is enabled)." - name: Rows Affected type: integer unit: "" @@ -500,6 +517,128 @@ modules: availability: | Available when:
• The collector has successfully connected to MySQL
• Performance Schema is enabled with statement digest collection
• Returns HTTP 503 if collector is still initializing
• Returns HTTP 500 if the query fails
• Returns HTTP 504 if the query times out require_cloud: true + - id: deadlock-info + name: Deadlock Info + description: | + Retrieves the latest detected InnoDB deadlock from `SHOW ENGINE INNODB STATUS`. + + The output is parsed to attribute the deadlock to the participating transactions and their query text, lock mode, lock status, and wait resource. + + Use cases: + - Identify which query was chosen as the deadlock victim + - Inspect the waiting lock resource and lock mode + - Correlate deadlocks with application changes or deployment events + + Query text is truncated at 4096 characters for display purposes. + returns: + description: Parsed deadlock participants from the latest detected deadlock. Each row represents one transaction involved in the deadlock. + columns: + - name: Row ID + type: string + unit: "" + visibility: hidden + description: "Unique row identifier composed of deadlock ID and process ID." + - name: Deadlock ID + type: string + unit: "" + description: "Identifier for the deadlock event, used to group participating transactions." + - name: Timestamp + type: timestamp + unit: "" + description: "Timestamp of the deadlock event. Parsed from the deadlock section when available; otherwise the function execution time." + - name: Process ID + type: string + unit: "" + description: "MySQL thread id of the transaction involved in the deadlock." + - name: Connection ID + type: integer + unit: "" + description: "Numeric connection identifier when the process id is numeric." + - name: ECID + type: integer + unit: "" + description: "Execution context id (engine-specific). This is typically null for MySQL and reserved for cross-engine consistency." + - name: Victim + type: string + unit: "" + description: "\"true\" when the transaction was chosen as the deadlock victim and rolled back; otherwise \"false\"." + - name: Query + type: string + unit: "" + description: "SQL query text for the transaction involved in the deadlock. Truncated to 4096 characters." 
+ - name: Lock Mode + type: string + unit: "" + description: "Lock mode reported for the waiting lock (for example X or S)." + - name: Lock Status + type: string + unit: "" + description: "Lock status for the transaction. WAITING indicates the transaction was waiting on a lock." + - name: Wait Resource + type: string + unit: "" + description: "Lock resource line from InnoDB status showing what the transaction was waiting on." + - name: Database + type: string + unit: "" + description: "Database name when it can be inferred. This may be empty or null depending on the deadlock output." + performance: | + Executes `SHOW ENGINE INNODB STATUS` on demand:
• Not part of regular collection
• Query cost depends on server load and the size of the InnoDB status output + security: | + Query text and wait resource strings may include unmasked literal values including sensitive data (PII/secrets):
• SQL literals such as emails, IDs, or tokens
• Schema and table names that may be sensitive in some environments
• Restrict dashboard access to authorized personnel only + availability: | + Available when:
• The collector has successfully connected to MySQL
• `deadlock_info_function_enabled` is true
• The account can run `SHOW ENGINE INNODB STATUS` (PROCESS privilege)
• Returns HTTP 200 with empty data when no deadlock is found
• Returns HTTP 403 when PROCESS privilege is missing
• Returns HTTP 500 if the query fails
• Returns HTTP 504 if the query times out
• Returns HTTP 561 when the deadlock section cannot be parsed
• Returns HTTP 503 if the collector is still initializing or the function is disabled + require_cloud: true + - id: error-info + name: Error Info + description: | + Retrieves recent SQL errors from Performance Schema statement history tables. + + This function reads `performance_schema.events_statements_history_long` when enabled, + otherwise falls back to `performance_schema.events_statements_history`. It reports the + most recent error per query digest, including error number, SQLSTATE, and message. + + Use cases: + - Identify recent query errors and their messages + - Correlate errors to query patterns (digest) + - Validate error rates seen in top-queries + + Error messages are truncated by Performance Schema (usually 128 characters). + returns: + description: Most recent error per query digest from Performance Schema history tables. + columns: + - name: Digest + type: string + unit: "" + visibility: hidden + description: "Unique hash identifier for the normalized query pattern." + - name: Query + type: string + unit: "" + description: "Normalized query text when available (digest text or SQL text)." + - name: Schema + type: string + unit: "" + description: "Database schema name when available." + - name: Error Number + type: integer + unit: "" + description: "MySQL error number for the most recent error of this digest." + - name: SQL State + type: string + unit: "" + description: "SQLSTATE code for the most recent error." + - name: Error Message + type: string + unit: "" + description: "Error message for the most recent error." + performance: | + Reads Performance Schema statement history tables on demand:
• Not part of regular collection
• Query cost depends on history table size and server load + security: | + Error messages and query text may include unmasked literals (PII/secrets).
• Restrict dashboard access to authorized personnel only + availability: | + Available when:
• The collector has successfully connected to MySQL
• `error_info_function_enabled` is true
• Performance Schema statement history consumers are enabled (events_statements_current + history/history_long)
• Returns HTTP 200 with empty data when no errors are found
• Returns HTTP 503 when required consumers are not enabled or function disabled
• Returns HTTP 500 if the query fails
• Returns HTTP 504 if the query times out + require_cloud: true metrics: folding: title: Metrics diff --git a/src/go/tools/functions-validation/config/go.d/mssql.conf b/src/go/tools/functions-validation/config/go.d/mssql.conf index f78b4976fd0c83..267de14c3ec9b3 100644 --- a/src/go/tools/functions-validation/config/go.d/mssql.conf +++ b/src/go/tools/functions-validation/config/go.d/mssql.conf @@ -1,4 +1,5 @@ jobs: - name: local - dsn: "sqlserver://sa:Netdata123!@127.0.0.1:1433?database=netdata" + dsn: "sqlserver://sa:Netdata123!@127.0.0.1:1433?database=netdata&encrypt=disable" top_queries_limit: 100 + deadlock_info_function_enabled: true diff --git a/src/go/tools/functions-validation/config/go.d/mysql.conf b/src/go/tools/functions-validation/config/go.d/mysql.conf index 6564faf11b8c1f..33984767cd4fe1 100644 --- a/src/go/tools/functions-validation/config/go.d/mysql.conf +++ b/src/go/tools/functions-validation/config/go.d/mysql.conf @@ -2,3 +2,4 @@ jobs: - name: local dsn: "netdata:netdata@tcp(127.0.0.1:3306)/netdata" top_queries_limit: 100 + deadlock_info_function_enabled: true diff --git a/src/go/tools/functions-validation/docker-compose.yml b/src/go/tools/functions-validation/docker-compose.yml index b1cfeab9ccf29e..88eb6d35474fdf 100644 --- a/src/go/tools/functions-validation/docker-compose.yml +++ b/src/go/tools/functions-validation/docker-compose.yml @@ -17,25 +17,29 @@ services: retries: 10 mysql: - image: mysql:8.0 + image: ${MYSQL_IMAGE:-mysql:8.0} environment: MYSQL_ROOT_PASSWORD: rootpw MYSQL_DATABASE: netdata MYSQL_USER: netdata MYSQL_PASSWORD: netdata + MARIADB_ROOT_PASSWORD: rootpw + MARIADB_DATABASE: netdata + MARIADB_USER: netdata + MARIADB_PASSWORD: netdata command: ["--performance_schema=ON"] ports: - "${MYSQL_PORT:-3306}:3306" volumes: - ./seed/mysql/init.sql:/docker-entrypoint-initdb.d/init.sql:ro healthcheck: - test: ["CMD-SHELL", "mysqladmin ping -h 127.0.0.1 -u root -p$$MYSQL_ROOT_PASSWORD > /dev/null"] + test: ["CMD-SHELL", "if command -v 
mysqladmin >/dev/null 2>&1; then mysqladmin ping -h 127.0.0.1 -u root -p$$MYSQL_ROOT_PASSWORD > /dev/null; elif command -v mariadb-admin >/dev/null 2>&1; then mariadb-admin ping -h 127.0.0.1 -u root -p$$MYSQL_ROOT_PASSWORD > /dev/null; else exit 127; fi"] interval: 5s timeout: 5s retries: 10 mssql: - image: mcr.microsoft.com/mssql/server:2022-latest + image: ${MSSQL_IMAGE:-mcr.microsoft.com/mssql/server:2022-latest} environment: ACCEPT_EULA: "Y" MSSQL_SA_PASSWORD: "Netdata123!" diff --git a/src/go/tools/functions-validation/e2e/lib.sh b/src/go/tools/functions-validation/e2e/lib.sh index 35e0170c18f6f3..3f1c18bd160a60 100755 --- a/src/go/tools/functions-validation/e2e/lib.sh +++ b/src/go/tools/functions-validation/e2e/lib.sh @@ -206,6 +206,28 @@ run_info() { run_info_method "$module" "top-queries" } +run_function() { + local module="$1" + local method="$2" + local args="${3:-__job:local}" + local require_rows="${4:-true}" + local output="$WORKDIR/${module}-${method}.json" + + run "$WORKDIR/go.d.plugin" \ + --config-dir "$WORKDIR/config" \ + --function "${module}:${method}" \ + --function-args "$args" \ + > "$output" + + if [ "$require_rows" = "true" ]; then + validate "$output" --min-rows 1 + else + validate "$output" + fi + + echo "$output" +} + run_top_queries() { local module="$1" local output="$WORKDIR/${module}-top-queries.json" diff --git a/src/go/tools/functions-validation/e2e/mssql-matrix.sh b/src/go/tools/functions-validation/e2e/mssql-matrix.sh new file mode 100644 index 00000000000000..96a1039466c163 --- /dev/null +++ b/src/go/tools/functions-validation/e2e/mssql-matrix.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +MSSQL_VARIANTS=( + "mcr.microsoft.com/mssql/server:2017-latest|mssql-2017" + "mcr.microsoft.com/mssql/server:2019-latest|mssql-2019" + "mcr.microsoft.com/mssql/server:2022-latest|mssql-2022" +) + +for entry in "${MSSQL_VARIANTS[@]}"; do + IFS='|' read -r image label 
<<< "$entry" + echo "\n=== Running MSSQL collector E2E for ${label} (${image}) ===" >&2 + MSSQL_IMAGE="$image" MSSQL_VARIANT="$label" bash "$SCRIPT_DIR/mssql.sh" +done + diff --git a/src/go/tools/functions-validation/e2e/mssql.sh b/src/go/tools/functions-validation/e2e/mssql.sh index 20419567139908..574a17b9d70510 100755 --- a/src/go/tools/functions-validation/e2e/mssql.sh +++ b/src/go/tools/functions-validation/e2e/mssql.sh @@ -10,14 +10,1470 @@ trap cleanup EXIT MSSQL_PORT="$(reserve_port)" write_env "MSSQL_PORT" "$MSSQL_PORT" -replace_in_file "$WORKDIR/config/go.d/mssql.conf" "127.0.0.1:1433" "127.0.0.1:${MSSQL_PORT}" +MSSQL_CONF="$WORKDIR/config/go.d/mssql.conf" +replace_in_file "$MSSQL_CONF" "127.0.0.1:1433" "127.0.0.1:${MSSQL_PORT}" + +MSSQL_VARIANT_LABEL="${MSSQL_VARIANT:-mssql}" +if [ -n "${MSSQL_IMAGE:-}" ]; then + write_env "MSSQL_IMAGE" "$MSSQL_IMAGE" +fi compose_up mssql wait_healthy mssql 120 compose_run mssql-init build_plugin -run_info mssql -run_top_queries mssql -echo "E2E checks passed for mssql." >&2 +MSSQL_JOB_RETRIES="${MSSQL_JOB_RETRIES:-6}" +MSSQL_JOB_RETRY_DELAY="${MSSQL_JOB_RETRY_DELAY:-5}" + +mssql_is_no_jobs_started() { + local input="$1" + if [ ! -s "$input" ]; then + return 1 + fi + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +try: + with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) +except Exception: + raise SystemExit(1) + +status = doc.get("status") +msg = str(doc.get("errorMessage") or "").lower() +if status == 503 and "no jobs started for module" in msg: + raise SystemExit(0) +raise SystemExit(1) +PY + return $? 
+ fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +try: + with open(path, "r") as fh: + doc = json.load(fh) +except Exception: + raise SystemExit(1) + +status = doc.get("status") +msg = str(doc.get("errorMessage") or "").lower() +if status == 503 and "no jobs started for module" in msg: + raise SystemExit(0) +raise SystemExit(1) +PY +} + +run_mssql_info_with_retry() { + local output="$WORKDIR/mssql-top-queries-info.json" + local attempt=1 + + while true; do + if run "$WORKDIR/go.d.plugin" \ + --config-dir "$WORKDIR/config" \ + --function "mssql:top-queries" \ + --function-args info \ + > "$output"; then + validate "$output" + return 0 + fi + + if mssql_is_no_jobs_started "$output"; then + if [ "$attempt" -ge "$MSSQL_JOB_RETRIES" ]; then + echo "Timed out waiting for mssql jobs to start (info)" >&2 + return 1 + fi + attempt=$((attempt + 1)) + sleep "$MSSQL_JOB_RETRY_DELAY" + continue + fi + + echo "Unexpected failure while running mssql top-queries info" >&2 + cat "$output" >&2 + return 1 + done +} + +run_mssql_top_queries_with_retry() { + local output="$WORKDIR/mssql-top-queries.json" + local attempt=1 + + while true; do + if run "$WORKDIR/go.d.plugin" \ + --config-dir "$WORKDIR/config" \ + --function "mssql:top-queries" \ + --function-args __job:local \ + > "$output"; then + validate "$output" --min-rows 1 + return 0 + fi + + if mssql_is_no_jobs_started "$output"; then + if [ "$attempt" -ge "$MSSQL_JOB_RETRIES" ]; then + echo "Timed out waiting for mssql jobs to start (top-queries)" >&2 + return 1 + fi + attempt=$((attempt + 1)) + sleep "$MSSQL_JOB_RETRY_DELAY" + continue + fi + + echo "Unexpected failure while running mssql top-queries" >&2 + cat "$output" >&2 + return 1 + done +} + +run_mssql_function_with_retry() { + local method="$1" + local args="${2:-__job:local}" + local require_rows="${3:-true}" + local output="$WORKDIR/mssql-${method}.json" + local attempt=1 + + while true; do + if run "$WORKDIR/go.d.plugin" \ + --config-dir 
"$WORKDIR/config" \ + --function "mssql:${method}" \ + --function-args "$args" \ + > "$output"; then + if [ "$require_rows" = "true" ]; then + validate "$output" --min-rows 1 + else + validate "$output" + fi + echo "$output" + return 0 + fi + + if mssql_is_no_jobs_started "$output"; then + if [ "$attempt" -ge "$MSSQL_JOB_RETRIES" ]; then + echo "Timed out waiting for mssql jobs to start (${method})" >&2 + return 1 + fi + attempt=$((attempt + 1)) + sleep "$MSSQL_JOB_RETRY_DELAY" + continue + fi + + echo "Unexpected failure while running mssql ${method}" >&2 + cat "$output" >&2 + return 1 + done +} + +run_mssql_info_with_retry +run_mssql_top_queries_with_retry + +mssql_container_id() { + "${COMPOSE[@]}" ps -q mssql +} + +mssql_sqlcmd_path() { + local cid + cid="$(mssql_container_id)" + if [ -z "$cid" ]; then + echo "MSSQL container ID not found" >&2 + return 1 + fi + docker exec -i "$cid" bash -lc 'if [ -x /opt/mssql-tools18/bin/sqlcmd ]; then echo /opt/mssql-tools18/bin/sqlcmd; elif [ -x /opt/mssql-tools/bin/sqlcmd ]; then echo /opt/mssql-tools/bin/sqlcmd; else exit 1; fi' +} + +MSSQL_SQLCMD_PATH="$(mssql_sqlcmd_path)" +echo "Using sqlcmd path: $MSSQL_SQLCMD_PATH" >&2 + +mssql_sqlcmd_supports_c() { + local cid + cid="$(mssql_container_id)" + if [ -z "$cid" ]; then + return 1 + fi + set +e + local help + help="$(docker exec -i "$cid" "$MSSQL_SQLCMD_PATH" -? 2>&1)" + local status=$? + set -e + if [ $status -ne 0 ] && [ -z "$help" ]; then + return 1 + fi + echo "$help" | grep -q " -C" +} + +MSSQL_SQLCMD_CFLAG=() +case "$MSSQL_SQLCMD_PATH" in + *mssql-tools18*) + MSSQL_SQLCMD_CFLAG=(-C) + ;; + *) + if mssql_sqlcmd_supports_c; then + MSSQL_SQLCMD_CFLAG=(-C) + fi + ;; +esac + +mssql_exec_sa() { + local sql="$1" + local cid + cid="$(mssql_container_id)" + if [ -z "$cid" ]; then + echo "MSSQL container ID not found" >&2 + return 1 + fi + run docker exec -i "$cid" "$MSSQL_SQLCMD_PATH" "${MSSQL_SQLCMD_CFLAG[@]}" -S localhost -U sa -P "Netdata123!" 
-d netdata -b -y 0 -Y 0 -Q "$sql" +} + +mssql_exec_sa_allow_error() { + local sql="$1" + local cid + cid="$(mssql_container_id)" + if [ -z "$cid" ]; then + echo "MSSQL container ID not found" >&2 + return 1 + fi + set +e + docker exec -i "$cid" "$MSSQL_SQLCMD_PATH" "${MSSQL_SQLCMD_CFLAG[@]}" -S localhost -U sa -P "Netdata123!" -d netdata -b -y 0 -Y 0 -Q "$sql" >/dev/null 2>&1 + set -e +} + +induce_deadlock_once() { + local tx1 + local tx2 + + tx1="$(cat <<'SQL' +SET NOCOUNT ON; +SET LOCK_TIMEOUT 5000; +BEGIN TRAN; +UPDATE dbo.deadlock_a SET value = value + 1 WHERE id = 1; +WAITFOR DELAY '00:00:01'; +UPDATE dbo.deadlock_b SET value = value + 1 WHERE id = 1; +COMMIT; +SQL +)" + + tx2="$(cat <<'SQL' +SET NOCOUNT ON; +SET LOCK_TIMEOUT 5000; +BEGIN TRAN; +UPDATE dbo.deadlock_b SET value = value + 1 WHERE id = 1; +WAITFOR DELAY '00:00:01'; +UPDATE dbo.deadlock_a SET value = value + 1 WHERE id = 1; +COMMIT; +SQL +)" + + mssql_exec_sa "$tx1" & + local pid1=$! + mssql_exec_sa "$tx2" & + local pid2=$! 
+ + set +e + wait "$pid1" + wait "$pid2" + set -e +} + +assert_deadlock_info_content() { + local input="$1" + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import re +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status != 200: + raise SystemExit(f"expected status 200, got {status}") + +if doc.get("errorMessage"): + raise SystemExit(f"unexpected errorMessage on status 200: {doc.get('errorMessage')!r}") + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("row_id", "deadlock_id", "process_id", "is_victim", "lock_mode", "lock_status", "query_text", "wait_resource", "database"): + if required not in field_to_idx: + raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +if not data: + raise SystemExit("deadlock-info returned no rows") + +def get_value(row, field): + idx = field_to_idx[field] + return row[idx] if idx < len(row) else None + +def norm(val): + return "" if val is None else str(val).strip() + +has_waiting = any(str(get_value(row, "lock_status")).upper() == "WAITING" for row in data) +if not has_waiting: + raise SystemExit("no WAITING lock_status found in deadlock-info output") + +table_pattern = re.compile(r"deadlock_(a|b)", re.IGNORECASE) +has_expected_query = any(table_pattern.search(str(get_value(row, "query_text"))) for row in data) +if not has_expected_query: + raise SystemExit("query_text does 
not reference deadlock tables") + +waiting_rows = [row for row in data if str(get_value(row, "lock_status")).upper() == "WAITING"] +if any(norm(get_value(row, "lock_mode")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must include lock_mode") +if any(norm(get_value(row, "wait_resource")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must include wait_resource") + +lock_mode_re = re.compile(r"^[A-Za-z0-9_-]+$") +if any(not lock_mode_re.match(norm(get_value(row, "lock_mode"))) for row in waiting_rows): + raise SystemExit("WAITING rows must include a valid lock_mode") + +victim_counts = {} +expected_db = "netdata" +has_database = False +for row in data: + deadlock_id = norm(get_value(row, "deadlock_id")) + if deadlock_id == "": + raise SystemExit("deadlock_id missing from deadlock-info output") + process_id = norm(get_value(row, "process_id")) + if process_id == "": + raise SystemExit("process_id missing from deadlock-info output") + row_id = norm(get_value(row, "row_id")) + if row_id != f"{deadlock_id}:{process_id}": + raise SystemExit(f"row_id {row_id} does not match deadlock_id/process_id") + victim_counts.setdefault(deadlock_id, 0) + if str(get_value(row, "is_victim")).lower() == "true": + victim_counts[deadlock_id] += 1 + db_val = norm(get_value(row, "database")).lower() + if db_val: + has_database = True + if db_val != expected_db: + raise SystemExit(f"unexpected database value {db_val!r}, expected {expected_db!r}") + +for deadlock_id, count in victim_counts.items(): + if count != 1: + raise SystemExit(f"deadlock_id {deadlock_id} has victim count {count}, expected 1") +if not has_database: + raise SystemExit("expected at least one row with database populated") +PY + return + fi + + python - "$input" <<'PY' +import json +import re +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status 
value: %r" % (doc.get("status"),)) + +if status != 200: + raise SystemExit("expected status 200, got %s" % status) + +if doc.get("errorMessage"): + raise SystemExit("unexpected errorMessage on status 200: %r" % (doc.get("errorMessage"),)) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("row_id", "deadlock_id", "process_id", "is_victim", "lock_mode", "lock_status", "query_text", "wait_resource", "database"): + if required not in field_to_idx: + raise SystemExit("missing expected column: %s" % required) + +data = doc.get("data") or [] +if not data: + raise SystemExit("deadlock-info returned no rows") + +def get_value(row, field): + idx = field_to_idx[field] + return row[idx] if idx < len(row) else None + +def norm(val): + return "" if val is None else str(val).strip() + +has_waiting = any(str(get_value(row, "lock_status")).upper() == "WAITING" for row in data) +if not has_waiting: + raise SystemExit("no WAITING lock_status found in deadlock-info output") + +table_pattern = re.compile(r"deadlock_(a|b)", re.IGNORECASE) +has_expected_query = any(table_pattern.search(str(get_value(row, "query_text"))) for row in data) +if not has_expected_query: + raise SystemExit("query_text does not reference deadlock tables") + +waiting_rows = [row for row in data if str(get_value(row, "lock_status")).upper() == "WAITING"] +if any(norm(get_value(row, "lock_mode")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must include lock_mode") +if any(norm(get_value(row, "wait_resource")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must include wait_resource") + 
+lock_mode_re = re.compile(r"^[A-Za-z0-9_-]+$") +if any(not lock_mode_re.match(norm(get_value(row, "lock_mode"))) for row in waiting_rows): + raise SystemExit("WAITING rows must include a valid lock_mode") + +victim_counts = {} +expected_db = "netdata" +has_database = False +for row in data: + deadlock_id = norm(get_value(row, "deadlock_id")) + if deadlock_id == "": + raise SystemExit("deadlock_id missing from deadlock-info output") + process_id = norm(get_value(row, "process_id")) + if process_id == "": + raise SystemExit("process_id missing from deadlock-info output") + row_id = norm(get_value(row, "row_id")) + if row_id != "%s:%s" % (deadlock_id, process_id): + raise SystemExit("row_id %s does not match deadlock_id/process_id" % row_id) + victim_counts.setdefault(deadlock_id, 0) + if str(get_value(row, "is_victim")).lower() == "true": + victim_counts[deadlock_id] += 1 + db_val = norm(get_value(row, "database")).lower() + if db_val: + has_database = True + if db_val != expected_db: + raise SystemExit("unexpected database value %r, expected %r" % (db_val, expected_db)) + +for deadlock_id, count in victim_counts.items(): + if count != 1: + raise SystemExit("deadlock_id %s has victim count %s, expected 1" % (deadlock_id, count)) +if not has_database: + raise SystemExit("expected at least one row with database populated") +PY +} + +assert_deadlock_info_empty_success() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status != 200: + raise SystemExit(f"expected status 200, got {status}") + +if doc.get("errorMessage"): + raise SystemExit(f"unexpected errorMessage on status 200: {doc.get('errorMessage')!r}") + +columns = doc.get("columns") or {} +field_to_idx = 
{} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +data = doc.get("data") or [] +if len(data) == 0: + raise SystemExit(0) + +query_idx = field_to_idx.get("query_text", None) +if query_idx is None: + raise SystemExit(f"expected no rows, got {len(data)}") + +for row in data: + if query_idx >= len(row): + continue + query = str(row[query_idx]).lower() + if "deadlock_a" in query or "deadlock_b" in query: + raise SystemExit(f"unexpected deadlock rows for test tables, got {len(data)} rows") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status != 200: + raise SystemExit("expected status 200, got %s" % status) + +if doc.get("errorMessage"): + raise SystemExit("unexpected errorMessage on status 200: %r" % (doc.get("errorMessage"),)) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +data = doc.get("data") or [] +if len(data) == 0: + raise SystemExit(0) + +query_idx = field_to_idx.get("query_text", None) +if query_idx is None: + raise SystemExit("expected no rows, got %s" % len(data)) + +for row in data: + if query_idx >= len(row): + continue + 
query = str(row[query_idx]).lower() + if "deadlock_a" in query or "deadlock_b" in query: + raise SystemExit("unexpected deadlock rows for test tables, got %s rows" % len(data)) +PY +} + +assert_deadlock_info_error_contains() { + local input="$1" + local expected_status="$2" + local expected_substr="$3" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" "$expected_status" "$expected_substr" <<'PY' +import json +import sys + +path = sys.argv[1] +expected_status = int(sys.argv[2]) +expected = sys.argv[3].strip().lower() +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status != expected_status: + raise SystemExit(f"expected status {expected_status}, got {status}") + +err = str(doc.get("errorMessage") or "").lower() +if expected not in err: + raise SystemExit(f"expected errorMessage to contain {expected!r}, got {err!r}") +PY + return + fi + + python - "$input" "$expected_status" "$expected_substr" <<'PY' +import json +import sys + +path = sys.argv[1] +expected_status = int(sys.argv[2]) +expected = sys.argv[3].strip().lower() +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status != expected_status: + raise SystemExit("expected status %s, got %s" % (expected_status, status)) + +err = str(doc.get("errorMessage") or "").lower() +if expected not in err: + raise SystemExit("expected errorMessage to contain %r, got %r" % (expected, err)) +PY +} + +assert_error_info_not_enabled() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) 
+except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status < 400: + raise SystemExit(f"expected error status, got {status}") + +err = str(doc.get("errorMessage") or "").lower() +if "not enabled" not in err: + raise SystemExit(f"expected errorMessage to contain 'not enabled', got {err!r}") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status < 400: + raise SystemExit("expected error status, got %s" % status) + +err = str(doc.get("errorMessage") or "").lower() +if "not enabled" not in err: + raise SystemExit("expected errorMessage to contain 'not enabled', got %r" % err) +PY +} + +assert_error_info_has_error() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status != 200: + raise SystemExit(f"expected status 200, got {status}") + +if doc.get("errorMessage"): + raise SystemExit(f"unexpected errorMessage on status 200: {doc.get('errorMessage')!r}") + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorNumber", "errorMessage", "query"): + if required not in field_to_idx: 
+ raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +if not data: + raise SystemExit("error-info returned no rows") + +num_idx = field_to_idx["errorNumber"] +msg_idx = field_to_idx["errorMessage"] +query_idx = field_to_idx["query"] + +target = "netdata_error_map_e2e" +matched = False +for row in data: + if num_idx >= len(row): + continue + err_no = row[num_idx] + try: + err_no_val = int(err_no) + except Exception: + continue + if err_no_val != 208: + continue + msg = str(row[msg_idx]).lower() if msg_idx < len(row) else "" + query = str(row[query_idx]).lower() if query_idx < len(row) else "" + if "invalid object name" in msg and target in query: + matched = True + break + +if not matched: + raise SystemExit("no error-info row contained invalid object name for netdata_error_map_e2e") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status != 200: + raise SystemExit("expected status 200, got %s" % status) + +if doc.get("errorMessage"): + raise SystemExit("unexpected errorMessage on status 200: %r" % (doc.get("errorMessage"),)) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorNumber", "errorMessage", "query"): + if required not in field_to_idx: + raise SystemExit("missing expected column: %s" % required) + +data = doc.get("data") or [] +if not data: + raise SystemExit("error-info returned no 
rows") + +num_idx = field_to_idx["errorNumber"] +msg_idx = field_to_idx["errorMessage"] +query_idx = field_to_idx["query"] + +target = "netdata_error_map_e2e" +matched = False +for row in data: + if num_idx >= len(row): + continue + err_no = row[num_idx] + try: + err_no_val = int(err_no) + except Exception: + continue + if err_no_val != 208: + continue + msg = str(row[msg_idx]).lower() if msg_idx < len(row) else "" + query = str(row[query_idx]).lower() if query_idx < len(row) else "" + if "invalid object name" in msg and target in query: + matched = True + break + +if not matched: + raise SystemExit("no error-info row contained invalid object name for netdata_error_map_e2e") +PY +} + +assert_top_queries_error_attribution_not_enabled() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +if "errorAttribution" not in field_to_idx: + raise SystemExit("missing expected column: errorAttribution") + +data = doc.get("data") or [] +idx = field_to_idx["errorAttribution"] +for row in data: + if idx >= len(row): + continue + if str(row[idx]) != "not_enabled": + raise SystemExit(f"expected errorAttribution 'not_enabled', got {row[idx]!r}") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): 
+ if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +if "errorAttribution" not in field_to_idx: + raise SystemExit("missing expected column: errorAttribution") + +data = doc.get("data") or [] +idx = field_to_idx["errorAttribution"] +for row in data: + if idx >= len(row): + continue + if str(row[idx]) != "not_enabled": + raise SystemExit("expected errorAttribution 'not_enabled', got %r" % row[idx]) +PY +} + +assert_top_queries_error_attribution_active() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorAttribution",): + if required not in field_to_idx: + raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +status_idx = field_to_idx["errorAttribution"] + +for row in data: + if status_idx >= len(row): + continue + status = str(row[status_idx]) + if status not in ("enabled", "no_data"): + raise SystemExit(f"unexpected errorAttribution status {status!r}") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for 
field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorAttribution",): + if required not in field_to_idx: + raise SystemExit("missing expected column: %s" % required) + +data = doc.get("data") or [] +status_idx = field_to_idx["errorAttribution"] + +for row in data: + if status_idx >= len(row): + continue + status = str(row[status_idx]) + if status not in ("enabled", "no_data"): + raise SystemExit("unexpected errorAttribution status %r" % status) +PY +} + +assert_top_queries_error_attribution_mapped() { + local top_queries="$1" + local error_info="$2" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$top_queries" "$error_info" <<'PY' +import json +import sys + +top_path = sys.argv[1] +err_path = sys.argv[2] + +with open(err_path, "r", encoding="utf-8") as fh: + err_doc = json.load(fh) + +err_cols = err_doc.get("columns") or {} +err_idx = {} +if isinstance(err_cols, dict): + for field, col in err_cols.items(): + if not isinstance(col, dict): + continue + try: + err_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(err_cols): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + err_idx[field] = idx + +for required in ("errorMessage", "errorNumber", "query", "queryHash"): + if required not in err_idx: + raise SystemExit(f"missing expected error-info column: {required}") + +def normalize(text: str) -> str: + return " ".join(text.split()).strip().rstrip(";").strip() + +error_rows = err_doc.get("data") or [] +candidates = [] +for row in error_rows: + msg = str(row[err_idx["errorMessage"]]).lower() if err_idx["errorMessage"] < len(row) else "" + err_no = 
row[err_idx["errorNumber"]] if err_idx["errorNumber"] < len(row) else None + query = str(row[err_idx["query"]]).lower() if err_idx["query"] < len(row) else "" + qh = row[err_idx["queryHash"]] if err_idx["queryHash"] < len(row) else None + try: + err_no_val = int(err_no) + except Exception: + continue + if err_no_val != 208: + continue + if "invalid object name" in msg and "netdata_error_map_e2e" in query: + candidates.append((str(qh) if qh else "", normalize(query))) + +if not candidates: + raise SystemExit("no error-info row contained invalid object name for netdata_error_map_e2e") + +with open(top_path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("query", "queryHash", "errorAttribution", "errorNumber", "errorMessage"): + if required not in field_to_idx: + raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +status_idx = field_to_idx["errorAttribution"] +num_idx = field_to_idx["errorNumber"] +msg_idx = field_to_idx["errorMessage"] +hash_idx = field_to_idx["queryHash"] + +matched = False +for row in data: + if status_idx >= len(row): + continue + if hash_idx >= len(row): + continue + status = str(row[status_idx]) if status_idx < len(row) else "" + if status != "enabled": + continue + err_no = row[num_idx] if num_idx < len(row) else None + try: + err_no_val = int(err_no) + except Exception: + continue + if err_no_val != 208: + continue + msg = str(row[msg_idx]).lower() if msg_idx < len(row) and row[msg_idx] is not None else "" + if "invalid object name" not in msg: + continue + row_hash = 
str(row[hash_idx]) if hash_idx < len(row) and row[hash_idx] is not None else "" + row_query = normalize(str(row[field_to_idx["query"]]).lower()) if field_to_idx["query"] < len(row) else "" + for cand_hash, cand_query in candidates: + if cand_hash and row_hash == cand_hash: + matched = True + break + if cand_query and row_query == cand_query: + matched = True + break + if matched: + break + +if not matched: + raise SystemExit("no top-queries row had enabled error attribution for netdata_error_map_e2e") +PY + return + fi + + python - "$top_queries" "$error_info" <<'PY' +import json +import sys + +top_path = sys.argv[1] +err_path = sys.argv[2] + +with open(err_path, "r") as fh: + err_doc = json.load(fh) + +err_cols = err_doc.get("columns") or {} +err_idx = {} +if isinstance(err_cols, dict): + for field, col in err_cols.items(): + if not isinstance(col, dict): + continue + try: + err_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(err_cols): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + err_idx[field] = idx + +for required in ("errorMessage", "errorNumber", "query", "queryHash"): + if required not in err_idx: + raise SystemExit("missing expected error-info column: %s" % required) + +def normalize(text): + return " ".join(text.split()).strip().rstrip(";").strip() + +error_rows = err_doc.get("data") or [] +candidates = [] +for row in error_rows: + msg = str(row[err_idx["errorMessage"]]).lower() if err_idx["errorMessage"] < len(row) else "" + err_no = row[err_idx["errorNumber"]] if err_idx["errorNumber"] < len(row) else None + query = str(row[err_idx["query"]]).lower() if err_idx["query"] < len(row) else "" + qh = row[err_idx["queryHash"]] if err_idx["queryHash"] < len(row) else None + try: + err_no_val = int(err_no) + except Exception: + continue + if err_no_val != 208: + continue + if "invalid object name" in msg and "netdata_error_map_e2e" in query: + 
candidates.append((str(qh) if qh else "", normalize(query))) + +if not candidates: + raise SystemExit("no error-info row contained invalid object name for netdata_error_map_e2e") + +with open(top_path, "r") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("query", "queryHash", "errorAttribution", "errorNumber", "errorMessage"): + if required not in field_to_idx: + raise SystemExit("missing expected column: %s" % required) + +data = doc.get("data") or [] +status_idx = field_to_idx["errorAttribution"] +num_idx = field_to_idx["errorNumber"] +msg_idx = field_to_idx["errorMessage"] +hash_idx = field_to_idx["queryHash"] + +matched = False +for row in data: + if status_idx >= len(row): + continue + if hash_idx >= len(row): + continue + status = str(row[status_idx]) if status_idx < len(row) else "" + if status != "enabled": + continue + err_no = row[num_idx] if num_idx < len(row) else None + try: + err_no_val = int(err_no) + except Exception: + continue + if err_no_val != 208: + continue + msg = str(row[msg_idx]).lower() if msg_idx < len(row) and row[msg_idx] is not None else "" + if "invalid object name" not in msg: + continue + row_hash = str(row[hash_idx]) if hash_idx < len(row) and row[hash_idx] is not None else "" + row_query = normalize(str(row[field_to_idx["query"]]).lower()) if field_to_idx["query"] < len(row) else "" + for cand_hash, cand_query in candidates: + if cand_hash and row_hash == cand_hash: + matched = True + break + if cand_query and row_query == cand_query: + matched = True + break + if matched: + break + +if not matched: + raise 
SystemExit("no top-queries row had enabled error attribution for netdata_error_map_e2e") +PY +} + +assert_top_queries_plan_ops() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("query", "hashMatch", "sorts"): + if required not in field_to_idx: + raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +query_idx = field_to_idx["query"] +hash_idx = field_to_idx["hashMatch"] +sort_idx = field_to_idx["sorts"] + +matched = False +for row in data: + if query_idx >= len(row): + continue + query = str(row[query_idx]).lower() + if "join" not in query or "sample" not in query: + continue + hash_val = row[hash_idx] if hash_idx < len(row) else 0 + sort_val = row[sort_idx] if sort_idx < len(row) else 0 + try: + hash_val = int(hash_val) + except Exception: + hash_val = 0 + try: + sort_val = int(sort_val) + except Exception: + sort_val = 0 + if hash_val > 0 and sort_val > 0: + matched = True + break + +if not matched: + raise SystemExit("no top-queries row had hashMatch and sorts counts for the join query") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = 
int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("query", "hashMatch", "sorts"): + if required not in field_to_idx: + raise SystemExit("missing expected column: %s" % required) + +data = doc.get("data") or [] +query_idx = field_to_idx["query"] +hash_idx = field_to_idx["hashMatch"] +sort_idx = field_to_idx["sorts"] + +matched = False +for row in data: + if query_idx >= len(row): + continue + query = str(row[query_idx]).lower() + if "join" not in query or "sample" not in query: + continue + hash_val = row[hash_idx] if hash_idx < len(row) else 0 + sort_val = row[sort_idx] if sort_idx < len(row) else 0 + try: + hash_val = int(hash_val) + except Exception: + hash_val = 0 + try: + sort_val = int(sort_val) + except Exception: + sort_val = 0 + if hash_val > 0 and sort_val > 0: + matched = True + break + +if not matched: + raise SystemExit("no top-queries row had hashMatch and sorts counts for the join query") +PY +} + +verify_deadlock_info_no_deadlock() { + local output + + output="$(run_mssql_function_with_retry deadlock-info '__job:local' 'false')" + validate "$output" + assert_deadlock_info_empty_success "$output" +} + +verify_deadlock_info() { + local attempt + local output + local found="false" + + for attempt in 1 2 3 4 5; do + induce_deadlock_once + output="$(run_mssql_function_with_retry deadlock-info '__job:local' 'false')" + if has_min_rows "$output" 1; then + validate "$output" --min-rows 1 + if assert_deadlock_info_content "$output"; then + found="true" + break + fi + fi + sleep 1 + done + + if [ "$found" != "true" ]; then + echo "deadlock-info did not produce valid deadlock attribution after 5 attempts" >&2 + return 1 + fi +} + +verify_deadlock_info_no_deadlock +verify_deadlock_info + +assert_top_queries_error_attribution_not_enabled "$WORKDIR/mssql-top-queries.json" + 
+error_output="$(run_mssql_function_with_retry error-info '__job:local' 'false')" +assert_error_info_not_enabled "$error_output" + +mssql_exec_sa "IF EXISTS (SELECT 1 FROM sys.server_event_sessions WHERE name = 'netdata_errors') DROP EVENT SESSION [netdata_errors] ON SERVER;" +mssql_exec_sa "CREATE EVENT SESSION [netdata_errors] ON SERVER ADD EVENT sqlserver.error_reported(ACTION(sqlserver.sql_text, sqlserver.query_hash)) ADD TARGET package0.ring_buffer;" +mssql_exec_sa "ALTER EVENT SESSION [netdata_errors] ON SERVER STATE = START;" +mssql_exec_sa "ALTER DATABASE netdata SET QUERY_STORE (QUERY_CAPTURE_MODE = ALL, OPERATION_MODE = READ_WRITE);" + +mssql_exec_sa "IF OBJECT_ID('dbo.netdata_error_map_e2e', 'U') IS NOT NULL DROP TABLE dbo.netdata_error_map_e2e;" +mssql_exec_sa "CREATE TABLE dbo.netdata_error_map_e2e (id int NOT NULL PRIMARY KEY);" +mssql_exec_sa "INSERT INTO dbo.netdata_error_map_e2e (id) VALUES (1), (2), (3);" + +for _ in 1 2 3 4 5 6 7 8 9 10; do + mssql_exec_sa "SELECT COUNT(*) FROM dbo.netdata_error_map_e2e;" +done + +mssql_exec_sa "DROP TABLE dbo.netdata_error_map_e2e;" + +for _ in 1 2 3; do + mssql_exec_sa_allow_error "SELECT COUNT(*) FROM dbo.netdata_error_map_e2e;" +done + +for _ in 1 2 3 4 5; do + mssql_exec_sa "SET NOCOUNT ON; SELECT a.id, b.name FROM dbo.sample a JOIN dbo.sample b ON a.id = b.id ORDER BY a.value + b.value DESC OPTION (HASH JOIN);" +done + +mssql_exec_sa "EXEC sys.sp_query_store_flush_db;" +sleep 2 + +error_output="$(run_mssql_function_with_retry error-info '__job:local' 'true')" +assert_error_info_has_error "$error_output" + +run_mssql_top_queries_with_retry +assert_top_queries_error_attribution_active "$WORKDIR/mssql-top-queries.json" +assert_top_queries_error_attribution_mapped "$WORKDIR/mssql-top-queries.json" "$WORKDIR/mssql-error-info.json" +assert_top_queries_plan_ops "$WORKDIR/mssql-top-queries.json" + +echo "E2E checks passed for ${MSSQL_VARIANT_LABEL}." 
>&2 diff --git a/src/go/tools/functions-validation/e2e/mysql-matrix.sh b/src/go/tools/functions-validation/e2e/mysql-matrix.sh new file mode 100644 index 00000000000000..4d0da0716b541b --- /dev/null +++ b/src/go/tools/functions-validation/e2e/mysql-matrix.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +MYSQL_VARIANTS=( + "mysql:5.7|mysql-5.7" + "mysql:8.0|mysql-8.0" + "mysql:8.4|mysql-8.4" + "mariadb:10.3|mariadb-10.3" + "mariadb:10.6|mariadb-10.6" + "mariadb:11.4|mariadb-11.4" + "percona:5.7|percona-5.7" + "percona:8.0|percona-8.0" +) + +for entry in "${MYSQL_VARIANTS[@]}"; do + IFS='|' read -r image label <<< "$entry" + echo "\n=== Running MySQL collector E2E for ${label} (${image}) ===" >&2 + MYSQL_IMAGE="$image" MYSQL_VARIANT="$label" bash "$SCRIPT_DIR/mysql.sh" +done + diff --git a/src/go/tools/functions-validation/e2e/mysql.sh b/src/go/tools/functions-validation/e2e/mysql.sh index 0b921a5e00820f..84083448a6457f 100755 --- a/src/go/tools/functions-validation/e2e/mysql.sh +++ b/src/go/tools/functions-validation/e2e/mysql.sh @@ -12,11 +12,892 @@ MYSQL_PORT="$(reserve_port)" write_env "MYSQL_PORT" "$MYSQL_PORT" replace_in_file "$WORKDIR/config/go.d/mysql.conf" "127.0.0.1:3306" "127.0.0.1:${MYSQL_PORT}" +MYSQL_VARIANT_LABEL="${MYSQL_VARIANT:-mysql}" +if [ -n "${MYSQL_IMAGE:-}" ]; then + write_env "MYSQL_IMAGE" "$MYSQL_IMAGE" +fi + compose_up mysql -wait_healthy mysql 90 +MYSQL_HEALTH_TIMEOUT="${MYSQL_HEALTH_TIMEOUT:-180}" +wait_healthy mysql "$MYSQL_HEALTH_TIMEOUT" build_plugin run_info mysql run_top_queries mysql -echo "E2E checks passed for mysql." 
>&2 +assert_top_queries_error_columns() { + local input="$1" + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) +columns = doc.get("columns") or {} +required = {"errorAttribution", "errorNumber", "sqlState", "errorMessage"} + +found = set() +if isinstance(columns, dict): + for key in columns.keys(): + found.add(key) +else: + for col in columns: + if isinstance(col, dict): + field = col.get("field") + if field: + found.add(field) + +missing = sorted(required - found) +if missing: + raise SystemExit(f"missing top-queries error columns: {missing}") +PY + return + fi + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) +columns = doc.get("columns") or {} +required = {"errorAttribution", "errorNumber", "sqlState", "errorMessage"} + +found = set() +if isinstance(columns, dict): + for key in columns.keys(): + found.add(key) +else: + for col in columns: + if isinstance(col, dict): + field = col.get("field") + if field: + found.add(field) + +missing = sorted(required - found) +if missing: + raise SystemExit("missing top-queries error columns: %s" % missing) +PY +} + +assert_top_queries_error_columns "$WORKDIR/mysql-top-queries.json" + +mysql_container_id() { + "${COMPOSE[@]}" ps -q mysql +} + +mysql_client_path() { + local cid + cid="$(mysql_container_id)" + if [ -z "$cid" ]; then + echo "MySQL container ID not found" >&2 + return 1 + fi + docker exec -i "$cid" sh -lc 'command -v mysql || command -v mariadb' +} + +MYSQL_CLIENT_PATH="$(mysql_client_path)" +echo "Using mysql client: $MYSQL_CLIENT_PATH" >&2 + +mysql_exec_root() { + local sql="$1" + local cid + cid="$(mysql_container_id)" + if [ -z "$cid" ]; then + echo "MySQL container ID not found" >&2 + return 1 + fi + run docker exec -i "$cid" "$MYSQL_CLIENT_PATH" -uroot -prootpw netdata -e "$sql" +} + 
+mysql_query_root() { + local sql="$1" + local cid + cid="$(mysql_container_id)" + if [ -z "$cid" ]; then + echo "MySQL container ID not found" >&2 + return 1 + fi + run docker exec -i "$cid" "$MYSQL_CLIENT_PATH" -uroot -prootpw -N -s netdata -e "$sql" +} + +mysql_exec_root_allow_error() { + local sql="$1" + local cid + cid="$(mysql_container_id)" + if [ -z "$cid" ]; then + echo "MySQL container ID not found" >&2 + return 1 + fi + set +e + docker exec -i "$cid" "$MYSQL_CLIENT_PATH" -uroot -prootpw netdata -e "$sql" >/dev/null 2>&1 + set -e +} + +induce_deadlock_once() { + local tx1 + local tx2 + + tx1="$(cat <<'SQL' +SET SESSION innodb_lock_wait_timeout = 5; +START TRANSACTION; +UPDATE deadlock_a SET value = value + 1 WHERE id = 1; +DO SLEEP(1); +UPDATE deadlock_b SET value = value + 1 WHERE id = 1; +COMMIT; +SQL +)" + + tx2="$(cat <<'SQL' +SET SESSION innodb_lock_wait_timeout = 5; +START TRANSACTION; +UPDATE deadlock_b SET value = value + 1 WHERE id = 1; +DO SLEEP(1); +UPDATE deadlock_a SET value = value + 1 WHERE id = 1; +COMMIT; +SQL +)" + + mysql_exec_root "$tx1" & + local pid1=$! + mysql_exec_root "$tx2" & + local pid2=$! 
+ + wait "$pid1" || true + wait "$pid2" || true +} + +assert_deadlock_info_content() { + local input="$1" + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import re +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status != 200: + raise SystemExit(f"expected status 200, got {status}") + +if doc.get("errorMessage"): + raise SystemExit(f"unexpected errorMessage on status 200: {doc.get('errorMessage')!r}") + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("row_id", "deadlock_id", "process_id", "is_victim", "lock_mode", "lock_status", "query_text", "wait_resource", "database"): + if required not in field_to_idx: + raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +if not data: + raise SystemExit("deadlock-info returned no rows") + +def get_value(row, field): + idx = field_to_idx[field] + return row[idx] if idx < len(row) else None + +def norm(val): + return "" if val is None else str(val).strip() + +has_waiting = any(str(get_value(row, "lock_status")).upper() == "WAITING" for row in data) +if not has_waiting: + raise SystemExit("no WAITING lock_status found in deadlock-info output") + +table_pattern = re.compile(r"deadlock_(a|b)", re.IGNORECASE) +has_expected_query = any(table_pattern.search(str(get_value(row, "query_text"))) for row in data) +if not has_expected_query: + raise SystemExit("query_text does 
not reference deadlock tables") + +waiting_rows = [row for row in data if norm(get_value(row, "lock_status")).upper() == "WAITING"] +if any(norm(get_value(row, "lock_mode")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must include lock_mode") +if any(norm(get_value(row, "wait_resource")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must include wait_resource") + +lock_mode_re = re.compile(r"^[A-Za-z0-9_-]+$") +if any(not lock_mode_re.match(norm(get_value(row, "lock_mode"))) for row in waiting_rows): + raise SystemExit("WAITING rows must include a valid lock_mode") + +victim_counts = {} +expected_db = "netdata" +has_database = False +for row in data: + deadlock_id = norm(get_value(row, "deadlock_id")) + if deadlock_id == "": + raise SystemExit("deadlock_id missing from deadlock-info output") + process_id = norm(get_value(row, "process_id")) + if process_id == "": + raise SystemExit("process_id missing from deadlock-info output") + row_id = norm(get_value(row, "row_id")) + if row_id != f"{deadlock_id}:{process_id}": + raise SystemExit(f"row_id {row_id} does not match deadlock_id/process_id") + victim_counts.setdefault(deadlock_id, 0) + if norm(get_value(row, "is_victim")).lower() == "true": + victim_counts[deadlock_id] += 1 + db_val = norm(get_value(row, "database")).lower() + if db_val: + has_database = True + if db_val != expected_db: + raise SystemExit(f"unexpected database value {db_val!r}, expected {expected_db!r}") + +for deadlock_id, count in victim_counts.items(): + if count != 1: + raise SystemExit(f"deadlock_id {deadlock_id} has victim count {count}, expected 1") +if not has_database: + raise SystemExit("expected at least one row with database populated") +PY + else + python - "$input" <<'PY' +import json +import re +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise 
SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status != 200: + raise SystemExit("expected status 200, got %s" % status) + +if doc.get("errorMessage"): + raise SystemExit("unexpected errorMessage on status 200: %r" % (doc.get("errorMessage"),)) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("row_id", "deadlock_id", "process_id", "is_victim", "lock_mode", "lock_status", "query_text", "wait_resource", "database"): + if required not in field_to_idx: + raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +if not data: + raise SystemExit("deadlock-info returned no rows") + +def get_value(row, field): + idx = field_to_idx[field] + return row[idx] if idx < len(row) else None + +def norm(val): + return "" if val is None else str(val).strip() + +has_waiting = any(str(get_value(row, "lock_status")).upper() == "WAITING" for row in data) +if not has_waiting: + raise SystemExit("no WAITING lock_status found in deadlock-info output") + +table_pattern = re.compile(r"deadlock_(a|b)", re.IGNORECASE) +has_expected_query = any(table_pattern.search(str(get_value(row, "query_text"))) for row in data) +if not has_expected_query: + raise SystemExit("query_text does not reference deadlock tables") + +waiting_rows = [row for row in data if norm(get_value(row, "lock_status")).upper() == "WAITING"] +if any(norm(get_value(row, "lock_mode")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must include lock_mode") +if any(norm(get_value(row, "wait_resource")) == "" for row in waiting_rows): + raise SystemExit("WAITING rows must 
include wait_resource") + +lock_mode_re = re.compile(r"^[A-Za-z0-9_-]+$") +if any(not lock_mode_re.match(norm(get_value(row, "lock_mode"))) for row in waiting_rows): + raise SystemExit("WAITING rows must include a valid lock_mode") + +victim_counts = {} +expected_db = "netdata" +has_database = False +for row in data: + deadlock_id = norm(get_value(row, "deadlock_id")) + if deadlock_id == "": + raise SystemExit("deadlock_id missing from deadlock-info output") + process_id = norm(get_value(row, "process_id")) + if process_id == "": + raise SystemExit("process_id missing from deadlock-info output") + row_id = norm(get_value(row, "row_id")) + if row_id != "%s:%s" % (deadlock_id, process_id): + raise SystemExit("row_id %s does not match deadlock_id/process_id" % row_id) + victim_counts.setdefault(deadlock_id, 0) + if norm(get_value(row, "is_victim")).lower() == "true": + victim_counts[deadlock_id] += 1 + db_val = norm(get_value(row, "database")).lower() + if db_val: + has_database = True + if db_val != expected_db: + raise SystemExit("unexpected database value %r, expected %r" % (db_val, expected_db)) + +for deadlock_id, count in victim_counts.items(): + if count != 1: + raise SystemExit(f"deadlock_id {deadlock_id} has victim count {count}, expected 1") +if not has_database: + raise SystemExit("expected at least one row with database populated") +PY + fi +} + +assert_deadlock_info_empty_success() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status != 200: + raise SystemExit(f"expected status 200, got {status}") + +if doc.get("errorMessage"): + raise SystemExit(f"unexpected errorMessage on status 200: {doc.get('errorMessage')!r}") + +data = doc.get("data") 
or [] +if len(data) != 0: + raise SystemExit(f"expected no rows, got {len(data)}") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status != 200: + raise SystemExit("expected status 200, got %s" % status) + +if doc.get("errorMessage"): + raise SystemExit("unexpected errorMessage on status 200: %r" % (doc.get("errorMessage"),)) + +data = doc.get("data") or [] +if len(data) != 0: + raise SystemExit("expected no rows, got %s" % len(data)) +PY +} + +assert_error_info_not_enabled() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status < 400: + raise SystemExit(f"expected error status, got {status}") + +err = str(doc.get("errorMessage") or "").lower() +if "not enabled" not in err: + raise SystemExit(f"expected errorMessage to contain 'not enabled', got {err!r}") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status < 400: + raise SystemExit("expected error status, got %s" % status) + +err = str(doc.get("errorMessage") or "").lower() +if "not enabled" not in err: + raise SystemExit("expected errorMessage to contain 'not enabled', got %r" % err) +PY +} + +assert_error_info_has_error() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - 
"$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit(f"unexpected status value: {doc.get('status')!r}") + +if status != 200: + raise SystemExit(f"expected status 200, got {status}") + +if doc.get("errorMessage"): + raise SystemExit(f"unexpected errorMessage on status 200: {doc.get('errorMessage')!r}") + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorNumber", "errorMessage"): + if required not in field_to_idx: + raise SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +if not data: + raise SystemExit("error-info returned no rows") + +err_idx = field_to_idx["errorMessage"] +num_idx = field_to_idx["errorNumber"] +def normalize(val): + return "" if val is None else str(val) + +matched = False +for row in data: + if num_idx < len(row) and row[num_idx] is not None: + msg = normalize(row[err_idx]).lower() + if "missing_table" in msg: + matched = True + break + +if not matched: + raise SystemExit("no error-info row contained missing_table with errorNumber populated") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +try: + status = int(doc.get("status")) +except (TypeError, ValueError): + raise SystemExit("unexpected status value: %r" % (doc.get("status"),)) + +if status != 200: + raise SystemExit("expected status 200, got %s" % status) + +if doc.get("errorMessage"): + raise 
SystemExit("unexpected errorMessage on status 200: %r" % (doc.get("errorMessage"),)) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorNumber", "errorMessage"): + if required not in field_to_idx: + raise SystemExit("missing expected column: %s" % required) + +data = doc.get("data") or [] +if not data: + raise SystemExit("error-info returned no rows") + +err_idx = field_to_idx["errorMessage"] +num_idx = field_to_idx["errorNumber"] +def normalize(val): + return "" if val is None else str(val) + +matched = False +for row in data: + if num_idx < len(row) and row[num_idx] is not None: + msg = normalize(row[err_idx]).lower() + if "missing_table" in msg: + matched = True + break + +if not matched: + raise SystemExit("no error-info row contained missing_table with errorNumber populated") +PY +} + +assert_top_queries_error_attribution_enabled() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorAttribution", "errorNumber", "errorMessage"): + if required not in field_to_idx: + raise 
SystemExit(f"missing expected column: {required}") + +data = doc.get("data") or [] +status_idx = field_to_idx["errorAttribution"] +num_idx = field_to_idx["errorNumber"] +msg_idx = field_to_idx["errorMessage"] + +matched = False +for row in data: + if status_idx >= len(row): + continue + if str(row[status_idx]) != "enabled": + continue + num = row[num_idx] if num_idx < len(row) else None + msg = str(row[msg_idx]).lower() if msg_idx < len(row) else "" + if num is not None and "missing_table" in msg: + matched = True + break + +if not matched: + raise SystemExit("no top-queries row had enabled error attribution for missing_table") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +for required in ("errorAttribution", "errorNumber", "errorMessage"): + if required not in field_to_idx: + raise SystemExit("missing expected column: %s" % required) + +data = doc.get("data") or [] +status_idx = field_to_idx["errorAttribution"] +num_idx = field_to_idx["errorNumber"] +msg_idx = field_to_idx["errorMessage"] + +matched = False +for row in data: + if status_idx >= len(row): + continue + if str(row[status_idx]) != "enabled": + continue + num = row[num_idx] if num_idx < len(row) else None + msg = str(row[msg_idx]).lower() if msg_idx < len(row) else "" + if num is not None and "missing_table" in msg: + matched = True + break + +if not matched: + raise SystemExit("no top-queries row had enabled error attribution for missing_table") +PY +} + 
+assert_top_queries_error_attribution_not_enabled() { + local input="$1" + + if command -v python3 >/dev/null 2>&1; then + python3 - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r", encoding="utf-8") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +if "errorAttribution" not in field_to_idx: + raise SystemExit("missing expected column: errorAttribution") + +data = doc.get("data") or [] +idx = field_to_idx["errorAttribution"] +for row in data: + if idx >= len(row): + continue + if str(row[idx]) != "not_enabled": + raise SystemExit(f"expected errorAttribution 'not_enabled', got {row[idx]!r}") +PY + return + fi + + python - "$input" <<'PY' +import json +import sys + +path = sys.argv[1] +with open(path, "r") as fh: + doc = json.load(fh) + +columns = doc.get("columns") or {} +field_to_idx = {} +if isinstance(columns, dict): + for field, col in columns.items(): + if not isinstance(col, dict): + continue + try: + field_to_idx[field] = int(col.get("index")) + except (TypeError, ValueError): + continue +else: + for idx, col in enumerate(columns): + if not isinstance(col, dict): + continue + field = col.get("field") + if field: + field_to_idx[field] = idx + +if "errorAttribution" not in field_to_idx: + raise SystemExit("missing expected column: errorAttribution") + +data = doc.get("data") or [] +idx = field_to_idx["errorAttribution"] +for row in data: + if idx >= len(row): + continue + if str(row[idx]) != "not_enabled": + raise SystemExit("expected errorAttribution 'not_enabled', got %r" % row[idx]) +PY +} + +capture_statement_history_states() 
{ + local output + output="$(mysql_query_root " +SELECT + COALESCE(MAX(CASE WHEN NAME = 'events_statements_history_long' THEN ENABLED END), 'NO') AS history_long, + COALESCE(MAX(CASE WHEN NAME = 'events_statements_history' THEN ENABLED END), 'NO') AS history, + COALESCE(MAX(CASE WHEN NAME = 'events_statements_current' THEN ENABLED END), 'NO') AS history_current +FROM performance_schema.setup_consumers +WHERE NAME IN ('events_statements_history_long','events_statements_history','events_statements_current');")" + local history_long + local history + local history_current + IFS=$'\t' read -r history_long history history_current <<<"$output" + MYSQL_HISTORY_LONG_STATE="$history_long" + MYSQL_HISTORY_STATE="$history" + MYSQL_HISTORY_CURRENT_STATE="$history_current" +} + +disable_statement_history_consumers() { + mysql_exec_root "UPDATE performance_schema.setup_consumers SET ENABLED = 'NO' WHERE NAME IN ('events_statements_history_long','events_statements_history','events_statements_current');" +} + +enable_statement_history_consumers() { + mysql_exec_root "UPDATE performance_schema.setup_consumers SET ENABLED = 'YES' WHERE NAME IN ('events_statements_history_long','events_statements_history','events_statements_current');" +} + +restore_statement_history_consumers() { + if [ -n "${MYSQL_HISTORY_LONG_STATE:-}" ]; then + mysql_exec_root "UPDATE performance_schema.setup_consumers SET ENABLED = '${MYSQL_HISTORY_LONG_STATE}' WHERE NAME = 'events_statements_history_long';" + fi + if [ -n "${MYSQL_HISTORY_STATE:-}" ]; then + mysql_exec_root "UPDATE performance_schema.setup_consumers SET ENABLED = '${MYSQL_HISTORY_STATE}' WHERE NAME = 'events_statements_history';" + fi + if [ -n "${MYSQL_HISTORY_CURRENT_STATE:-}" ]; then + mysql_exec_root "UPDATE performance_schema.setup_consumers SET ENABLED = '${MYSQL_HISTORY_CURRENT_STATE}' WHERE NAME = 'events_statements_current';" + fi +} + +verify_deadlock_info_no_deadlock() { + local output + + output="$(run_function mysql deadlock-info 
'__job:local' 'false')" + validate "$output" + assert_deadlock_info_empty_success "$output" +} + +verify_deadlock_info() { + local output="" + local found="false" + + for attempt in 1 2 3 4 5; do + induce_deadlock_once + output="$(run_function mysql deadlock-info '__job:local' 'false')" + if has_min_rows "$output" 1; then + validate "$output" --min-rows 1 + if assert_deadlock_info_content "$output"; then + found="true" + break + fi + fi + sleep 1 + done + + if [ "$found" != "true" ]; then + echo "deadlock-info did not produce valid deadlock attribution after 5 attempts" >&2 + return 1 + fi +} + +verify_deadlock_info_no_deadlock +verify_deadlock_info + +capture_statement_history_states +disable_statement_history_consumers + +error_output="$(run_function mysql error-info '__job:local' 'false')" +assert_error_info_not_enabled "$error_output" + +run_top_queries mysql +assert_top_queries_error_attribution_not_enabled "$WORKDIR/mysql-top-queries.json" + +enable_statement_history_consumers + +for _ in 1 2 3; do + mysql_exec_root_allow_error "SELECT * FROM missing_table;" +done + +sleep 1 + +error_output="$(run_function mysql error-info '__job:local' 'true')" +assert_error_info_has_error "$error_output" + +run_top_queries mysql +assert_top_queries_error_attribution_enabled "$WORKDIR/mysql-top-queries.json" + +restore_statement_history_consumers + +echo "E2E checks passed for ${MYSQL_VARIANT_LABEL}." 
>&2 diff --git a/src/go/tools/functions-validation/seed/mssql/init.sql b/src/go/tools/functions-validation/seed/mssql/init.sql index 609090a58cb58f..c0af0479621a91 100644 --- a/src/go/tools/functions-validation/seed/mssql/init.sql +++ b/src/go/tools/functions-validation/seed/mssql/init.sql @@ -7,9 +7,44 @@ GO ALTER DATABASE netdata SET QUERY_STORE = ON; GO +IF NOT EXISTS (SELECT 1 FROM sys.server_principals WHERE name = 'netdata_limited') +BEGIN + CREATE LOGIN netdata_limited WITH PASSWORD = 'Netdata123!'; +END +GO + +-- Ensure the limited user can start the collector in E2E: +-- A previous DENY will override GRANT, so revoke first. +REVOKE VIEW SERVER STATE FROM netdata_limited; +GO + +GRANT VIEW SERVER STATE TO netdata_limited; +GO + +USE msdb; +GO + +IF NOT EXISTS (SELECT 1 FROM sys.database_principals WHERE name = 'netdata_limited') +BEGIN + CREATE USER netdata_limited FOR LOGIN netdata_limited; +END +GO + +GRANT SELECT ON dbo.sysjobs TO netdata_limited; +GO + USE netdata; GO +IF NOT EXISTS (SELECT 1 FROM sys.database_principals WHERE name = 'netdata_limited') +BEGIN + CREATE USER netdata_limited FOR LOGIN netdata_limited; +END +GO + +GRANT CONNECT TO netdata_limited; +GO + IF OBJECT_ID('dbo.sample', 'U') IS NULL BEGIN CREATE TABLE dbo.sample ( @@ -20,6 +55,36 @@ BEGIN END GO +IF OBJECT_ID('dbo.deadlock_a', 'U') IS NULL +BEGIN + CREATE TABLE dbo.deadlock_a ( + id INT PRIMARY KEY, + value INT NOT NULL + ); +END +GO + +IF OBJECT_ID('dbo.deadlock_b', 'U') IS NULL +BEGIN + CREATE TABLE dbo.deadlock_b ( + id INT PRIMARY KEY, + value INT NOT NULL + ); +END +GO + +IF NOT EXISTS (SELECT 1 FROM dbo.deadlock_a WHERE id = 1) +BEGIN + INSERT INTO dbo.deadlock_a (id, value) VALUES (1, 10); +END +GO + +IF NOT EXISTS (SELECT 1 FROM dbo.deadlock_b WHERE id = 1) +BEGIN + INSERT INTO dbo.deadlock_b (id, value) VALUES (1, 20); +END +GO + INSERT INTO dbo.sample (name, value) VALUES ('alpha', 10), ('beta', 20), ('gamma', 30); GO diff --git 
a/src/go/tools/functions-validation/seed/mysql/init.sql b/src/go/tools/functions-validation/seed/mysql/init.sql index 4e3b568dd344f3..4ae74ffc1b3aa2 100644 --- a/src/go/tools/functions-validation/seed/mysql/init.sql +++ b/src/go/tools/functions-validation/seed/mysql/init.sql @@ -4,6 +4,20 @@ CREATE TABLE IF NOT EXISTS sample ( value INT NOT NULL ); +-- Tables for deadlock induction tests. +CREATE TABLE IF NOT EXISTS deadlock_a ( + id INT PRIMARY KEY, + value INT NOT NULL +) ENGINE=InnoDB; + +CREATE TABLE IF NOT EXISTS deadlock_b ( + id INT PRIMARY KEY, + value INT NOT NULL +) ENGINE=InnoDB; + +INSERT INTO deadlock_a (id, value) VALUES (1, 10); +INSERT INTO deadlock_b (id, value) VALUES (1, 20); + -- Ensure statement digest collection is enabled. UPDATE performance_schema.setup_consumers SET ENABLED = 'YES'