diff --git a/metrics/check/check_api.go b/metrics/check/check_api.go
new file mode 100644
index 0000000..afdcc84
--- /dev/null
+++ b/metrics/check/check_api.go
@@ -0,0 +1,21 @@
+//Copyright (c) 2014 Square, Inc
+
+// Package check evaluates the boolean check expressions of a config file
+// against metrics fetched as JSON over HTTP.
+package check
+
+// Checker fetches metrics, evaluates the configured checks against them and
+// exposes the results.
+type Checker interface {
+	// OutputWarnings passes the Checker and the remaining arguments to the
+	// given formatter and returns the formatter's error,
+	// e.g. OutputWarnings(formats.Basic).
+	OutputWarnings(func(Checker, ...string) error, ...string) error
+
+	// CheckMetrics checks the metrics against their thresholds.
+	CheckMetrics() error
+
+	// GetWarnings returns the results of the metric checks in the form
+	// sectionName -> check results.
+	GetWarnings() map[string]metricResults
+}
diff --git a/metrics/check/check_impl.go b/metrics/check/check_impl.go
new file mode 100644
index 0000000..0558183
--- /dev/null
+++ b/metrics/check/check_impl.go
@@ -0,0 +1,252 @@
+//Copyright (c) 2014 Square, Inc
+
+package check
+
+import (
+	"encoding/json"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"log"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+
+	_ "code.google.com/p/go.tools/go/gcimporter"
+	"code.google.com/p/go.tools/go/types"
+	"code.google.com/p/goconf/conf" // used for parsing config files
+)
+
+// checker is the Checker implementation. It fetches metrics from hostport
+// into Metrics, evaluates the checks of config file c against them and keeps
+// the results, per config section, in Warnings. pkg and scope are what check
+// expressions are evaluated against; setupConstants fills them in.
+type checker struct {
+	hostport string
+	Metrics  map[string]metric
+	Warnings map[string]metricResults
+	c        *conf.ConfigFile
+	Logger   *log.Logger
+	pkg      *types.Package
+	scope    *types.Scope
+}
+
+// metricThresholds holds the checks of one config section.
+type metricThresholds struct {
+	metricblob string
+	checks     map[string]string // maps check name to check expression
+}
+
+// metricResults holds the outcome of the checks of one config section.
+type metricResults struct {
+	Message string
+	Checks  map[string]bool // maps check name to result
+}
+
+// metric is one entry of the metrics JSON.
+type metric struct {
+	Type  string
+	Name  string
+	Value float64
+	Rate  float64
+}
+
+// New creates a Checker that fetches metrics JSON from hostport and
+// evaluates the checks listed in configFile.
+// It returns an error if configFile cannot be read.
+func New(hostport, configFile string) (Checker, error) {
+	c, err := conf.ReadConfigFile(configFile)
+	if err != nil {
+		return nil, err
+	}
+	hc := &checker{
+		hostport: hostport,
+		Metrics:  make(map[string]metric),
+		Warnings: make(map[string]metricResults),
+		c:        c,
+		Logger:   log.New(os.Stderr, "LOG: ", log.Lshortfile),
+	}
+	// The error is deliberately dropped: setupConstants logs its own
+	// failures and leaves pkg and scope nil, so checks are then evaluated
+	// without config constants.
+	// NOTE(review): relies on types.Eval accepting a nil package and scope,
+	// as the tests that never call setupConstants already do - confirm.
+	_ = hc.setupConstants()
+	return hc, nil
+}
+
+// OutputWarnings hands the checker and s to printer and returns its error.
+func (hc *checker) OutputWarnings(printer func(Checker, ...string) error, s ...string) error {
+	return printer(hc, s...)
+}
+
+// getMetrics fetches the metrics JSON from hc.hostport and stores every
+// metric in hc.Metrics under its name. Failures are logged and returned.
+func (hc *checker) getMetrics() error {
+	addr := "http://" + hc.hostport + "/api/v1/metrics.json/Counters|Gauges|StatTimers?allowNaN=false"
+	resp, err := http.Get(addr)
+	if err != nil {
+		hc.Logger.Println(err)
+		return err
+	}
+	defer resp.Body.Close()
+	// Report a non-200 answer by its status instead of trying to decode its
+	// body as metrics.
+	if resp.StatusCode != http.StatusOK {
+		err = fmt.Errorf("fetching %s: unexpected status %s", addr, resp.Status)
+		hc.Logger.Println(err)
+		return err
+	}
+	var metrics []metric
+	if err = json.NewDecoder(resp.Body).Decode(&metrics); err != nil {
+		hc.Logger.Println(err)
+		return err
+	}
+	// store metrics in map, so they can be found easily by name
+	for _, m := range metrics {
+		hc.Metrics[m.Name] = m
+	}
+	return nil
+}
+
+// CheckMetrics fetches the current metrics and evaluates the checks of every
+// config section except the reserved ones (default, nagios and constants).
+// The results are stored per section name and returned by GetWarnings.
+func (hc *checker) CheckMetrics() error {
+	// getMetrics has already logged the failure; do not log it twice.
+	if err := hc.getMetrics(); err != nil {
+		return err
+	}
+	for _, sectionName := range hc.c.GetSections() {
+		switch sectionName {
+		case "default", "nagios", "constants":
+			continue
+		}
+		m := getConfigChecks(hc.c, sectionName)
+		hc.Warnings[sectionName] = hc.checkMetric(m)
+	}
+	return nil
+}
+
+// checkMetric evaluates every check of one section and returns the results
+// of those that evaluated to a bool. Checks that cannot be evaluated are
+// logged and left out of the result.
+func (hc *checker) checkMetric(m metricThresholds) metricResults {
+	res := metricResults{Checks: make(map[string]bool)}
+	for name, check := range m.checks {
+		checkVal, err := hc.replaceNames(check)
+		if err != nil {
+			hc.Logger.Println(err)
+			continue
+		}
+		resultType, result, err := types.Eval(checkVal, hc.pkg, hc.scope)
+		// error evaluating expression, don't store result
+		if err != nil {
+			hc.Logger.Println(err)
+			continue
+		}
+		// check that expression evaluated to bool
+		if !types.Identical(resultType, types.Typ[types.UntypedBool]) && !types.Identical(resultType, types.Typ[types.Bool]) {
+			hc.Logger.Println("Check: " + name + ": " + check + " does not evaluate to bool")
+			continue
+		}
+		holds, err := strconv.ParseBool(result.String())
+		if err != nil {
+			hc.Logger.Println(err)
+			continue
+		}
+		res.Checks[name] = holds
+	}
+	return res
+}
+
+// replaceNames substitutes the metric references in expr with the metrics'
+// current values. expr is split on single spaces; a token of the form
+// <metric name>.Value or <metric name>.Rate whose metric is known is replaced
+// by that number formatted with five decimals. Every other token, and the
+// spacing between tokens, is left untouched.
+// The returned error is currently always nil.
+func (hc *checker) replaceNames(expr string) (string, error) {
+	words := strings.Split(expr, " ")
+	for i, word := range words {
+		dot := strings.LastIndex(word, ".")
+		if dot < 0 {
+			continue
+		}
+		m, ok := hc.Metrics[word[:dot]]
+		if !ok {
+			continue
+		}
+		// Replace this token only, not every occurrence of its text in expr:
+		// a reference may be a substring of another token.
+		switch word[dot+1:] {
+		case "Value":
+			words[i] = strconv.FormatFloat(m.Value, 'f', 5, 64)
+		case "Rate":
+			words[i] = strconv.FormatFloat(m.Rate, 'f', 5, 64)
+		}
+	}
+	return strings.Join(words, " "), nil
+}
+
+// getConfigChecks reads the checks of section test from c. Every option of
+// the section except "metric-name" is a check: the option name is the check
+// name and the option value the check expression.
+func getConfigChecks(c *conf.ConfigFile, test string) metricThresholds {
+	m := metricThresholds{checks: make(map[string]string)}
+	// Errors are ignored here: the loop runs over whatever GetOptions
+	// returned and each check is stored with whatever GetString returned.
+	checks, _ := c.GetOptions(test)
+	for _, checkName := range checks {
+		if checkName == "metric-name" {
+			continue
+		}
+		m.checks[checkName], _ = c.GetString(test, checkName)
+	}
+	return m
+}
+
+// GetWarnings returns the check results stored by CheckMetrics, keyed by
+// config section name.
+func (hc *checker) GetWarnings() map[string]metricResults {
+	return hc.Warnings
+}
+
+// setupConstants declares every option of the [constants] config section as
+// a Go constant in a synthetic package and type-checks that package, so that
+// check expressions can refer to the constants by name. On failure the
+// error is logged and returned, and pkg and scope are left unchanged.
+func (hc *checker) setupConstants() error {
+	constants, err := hc.c.GetOptions("constants")
+	if err != nil {
+		hc.Logger.Println(err)
+		return err
+	}
+	src := "package p\n"
+	for _, name := range constants {
+		val, err := hc.c.GetString("constants", name)
+		if err != nil {
+			hc.Logger.Println(err)
+			return err
+		}
+		src += "const " + name + " = " + val + "\n"
+	}
+	fset := token.NewFileSet()
+	file, err := parser.ParseFile(fset, "p", src, 0)
+	if err != nil {
+		hc.Logger.Println(err)
+		return err
+	}
+	pkg, err := types.Check("p", fset, []*ast.File{file})
+	if err != nil {
+		hc.Logger.Println(err)
+		return err
+	}
+	hc.pkg = pkg
+	// NOTE(review): Child(0) is presumably the scope of the single parsed
+	// file, from which the package-level constants are visible - confirm.
+	hc.scope = pkg.Scope().Child(0)
+	return nil
+}
diff --git a/metrics/check/check_test.go b/metrics/check/check_test.go
new file mode 100644
index 0000000..f5ec35e
--- /dev/null
+++ b/metrics/check/check_test.go
@@ -0,0 +1,260 @@
+package check
+
+import (
+	"log"
+	"net"
+	"net/http"
+	"os"
+	"testing"
+
+	"code.google.com/p/goconf/conf" // used for parsing config files
+	"github.com/square/prodeng/metrics"
+)
+
+// initChecker returns a checker without config file or constants.
+func initChecker(t testing.TB) checker {
+	hc := checker{
+		hostport: "localhost:12345",
+		Metrics:  make(map[string]metric),
+		Warnings: make(map[string]metricResults),
+		Logger:   log.New(os.Stderr, "LOG: ", log.Lshortfile),
+	}
+	return hc
+}
+
+// loadedChecker returns a checker whose Metrics were fetched from the test
+// metrics server; the test fails at once if they cannot be fetched.
+func loadedChecker(t testing.TB) checker {
+	hc := initChecker(t)
+	initMetricsJson()
+	if err := hc.getMetrics(); err != nil {
+		t.Fatal(err)
+	}
+	return hc
+}
+
+// expectChecks reports every check in want whose result in got differs.
+// A check that is missing from got counts as false.
+func expectChecks(t testing.TB, got metricResults, want map[string]bool) {
+	for name, expected := range want {
+		if got.Checks[name] != expected {
+			t.Errorf("Did not make check %s correctly", name)
+		}
+	}
+}
+
+// initConfigFile builds an in-memory config with constants and two sections.
+func initConfigFile() *conf.ConfigFile {
+	c := conf.NewConfigFile()
+	c.AddSection("constants")
+	c.AddOption("constants", "const1", "1")
+	c.AddOption("constants", "const2", "2")
+	c.AddSection("section1")
+	c.AddOption("section1", "check1", "value1")
+	c.AddOption("section1", "check2", "value2")
+	c.AddSection("section2")
+	c.AddOption("section2", "check1", "valueA")
+	c.AddOption("section2", "check2", "valueB")
+	c.AddOption("section2", "check3", "valueC")
+	return c
+}
+
+var (
+	expectedValues = map[string]float64{
+		"testGauge2": float64(200),
+		"testGauge3": float64(300),
+		"testGauge4": float64(400),
+		"testGauge5": float64(500)}
+)
+
+// initMetricsJson makes sure a metrics server with five test gauges is
+// listening on localhost:12345. It is a no-op when one already answers.
+// NOTE(review): the handler is registered for /api/v1/metrics.json while
+// getMetrics requests a path below it; confirm that the registered pattern
+// really matches that request.
+func initMetricsJson() {
+	// A successful probe means an earlier test already started the server.
+	resp, err := http.Get("http://localhost:12345/api/v1/metrics.json")
+	if err == nil {
+		resp.Body.Close()
+		return
+	}
+	m := metrics.NewMetricContext("test")
+	g1 := metrics.NewGauge()
+	m.Register(g1, "testGauge1")
+	g2 := metrics.NewGauge()
+	m.Register(g2, "testGauge2")
+	g3 := metrics.NewGauge()
+	m.Register(g3, "testGauge3")
+	g4 := metrics.NewGauge()
+	m.Register(g4, "testGauge4")
+	g5 := metrics.NewGauge()
+	m.Register(g5, "testGauge5")
+	g2.Set(float64(200))
+	g3.Set(float64(300))
+	g4.Set(float64(400))
+	g5.Set(float64(500))
+	// Bind the port before returning, so that a request issued right after
+	// this call cannot race the start of the server.
+	ln, err := net.Listen("tcp", "localhost:12345")
+	if err != nil {
+		log.Println(err)
+		return
+	}
+	http.HandleFunc("/api/v1/metrics.json", m.HttpJsonHandler)
+	go http.Serve(ln, nil)
+}
+
+//Tests get metrics json correctly
+func TestGetMetrics(t *testing.T) {
+	hc := loadedChecker(t)
+	//now check we collected the right metrics
+	for name, metric := range hc.Metrics {
+		v, ok := expectedValues[name]
+		if !ok {
+			t.Errorf("Unexpected metric collected: %s", name)
+			continue
+		}
+		if metric.Value != v {
+			t.Errorf("Unexpected value in %s. Expected %f, got %f", name, v, metric.Value)
+		}
+	}
+	//and that none of the expected metrics is missing
+	for name := range expectedValues {
+		if _, ok := hc.Metrics[name]; !ok {
+			t.Errorf("Expected metric not collected: %s", name)
+		}
+	}
+}
+
+// expectReplaced fails the test unless hc.replaceNames turns expr into
+// expected.
+func expectReplaced(t testing.TB, hc checker, expr, expected string) {
+	result, err := hc.replaceNames(expr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if result != expected {
+		t.Errorf("Expected %s, but got %s", expected, result)
+	}
+}
+
+//tests replacement of names in expressions correctly
+func TestReplaceNames1(t *testing.T) {
+	expectReplaced(t, loadedChecker(t), "testGauge2.Value > 100", "200.00000 > 100")
+}
+
+func TestReplaceNames2(t *testing.T) {
+	expectReplaced(t, loadedChecker(t), "testGauge2.Rate > 100", "0.00000 > 100")
+}
+
+func TestReplaceNames3(t *testing.T) {
+	expectReplaced(t, loadedChecker(t), "testGauge2.Value > testGauge2.Rate", "200.00000 > 0.00000")
+}
+
+//tests that a reference which is a substring of another token does not
+//change that token
+func TestReplaceNamesSubstring(t *testing.T) {
+	hc := initChecker(t)
+	hc.Metrics["a"] = metric{Name: "a", Value: 1}
+	expectReplaced(t, hc, "a.Value < aa.Value", "1.00000 < aa.Value")
+}
+
+//tests that metrics are checked against thresholds correctly
+func TestCheckMetrics1(t *testing.T) {
+	hc := loadedChecker(t)
+	m := metricThresholds{
+		checks: map[string]string{
+			"1": "testGauge2.Value > 199",
+			"2": "testGauge2.Value == 200 ",
+			"3": "testGauge2.Value <= 205",
+		},
+	}
+	expectChecks(t, hc.checkMetric(m), map[string]bool{"1": true, "2": true, "3": true})
+}
+
+func TestCheckMetrics2(t *testing.T) {
+	hc := loadedChecker(t)
+	m := metricThresholds{
+		checks: map[string]string{
+			"1": "testGauge2.Value < 199",
+			"2": "testGauge2.Value != 200 ",
+			"3": "testGauge2.Value >= 205",
+		},
+	}
+	expectChecks(t, hc.checkMetric(m), map[string]bool{"1": false, "2": false, "3": false})
+}
+
+func TestCheckMetrics3(t *testing.T) {
+	hc := loadedChecker(t)
+	m := metricThresholds{
+		checks: map[string]string{
+			"1": "testGauge2.Value < testGauge3.Value",
+			"2": "testGauge2.Value == testGauge4.Value ",
+			"3": "testGauge4.Value >= testGauge3.Value",
+		},
+	}
+	expectChecks(t, hc.checkMetric(m), map[string]bool{"1": true, "2": false, "3": true})
+}
+
+func TestCheckConstants1(t *testing.T) {
+	hc := initChecker(t)
+	hc.c = conf.NewConfigFile()
+	hc.c.AddSection("constants")
+	hc.c.AddOption("constants", "const1", "1")
+
+	m := metricThresholds{
+		checks: map[string]string{
+			"1": "const1 == 1",
+			"2": "const1 != 1",
+			"3": "const1 >= 0",
+		},
+	}
+
+	if err := hc.setupConstants(); err != nil {
+		t.Fatal(err)
+	}
+	expectChecks(t, hc.checkMetric(m), map[string]bool{"1": true, "2": false, "3": true})
+}
+
+func TestCheckConstants2(t *testing.T) {
+	hc := initChecker(t)
+	hc.c = conf.NewConfigFile()
+	hc.c.AddSection("constants")
+	hc.c.AddOption("constants", "const1", "1")
+	hc.c.AddOption("constants", "const2", "2")
+
+	m := metricThresholds{
+		checks: map[string]string{
+			"1": "const1 == const2",
+			"2": "const1 != const2",
+			"3": "const1 >= const2",
+		},
+	}
+
+	if err := hc.setupConstants(); err != nil {
+		t.Fatal(err)
+	}
+	expectChecks(t, hc.checkMetric(m), map[string]bool{"1": false, "2": true, "3": false})
+}
+
+func TestReadConfigFile(t *testing.T) {
+	c := initConfigFile()
+	m1 := getConfigChecks(c, "section1")
+	if m1.checks["check1"] != "value1" {
+		t.Error("did not get section1, check1 correct")
+	}
+	if m1.checks["check2"] != "value2" {
+		t.Error("did not get section1, check2 correct")
+	}
+	m2 := getConfigChecks(c, "section2")
+	if m2.checks["check1"] != "valueA" {
+		t.Error("did not get section2, check1 correct")
+	}
+	if m2.checks["check2"] != "valueB" {
+		t.Error("did not get section2, check2 correct")
+	}
+	if m2.checks["check3"] != "valueC" {
+		t.Error("did not get section2, check3 correct")
+	}
+}
diff --git a/metrics/check/formats/basicFormat.go b/metrics/check/formats/basicFormat.go
new file mode 100644
index 0000000..0d314aa
--- /dev/null
+++ b/metrics/check/formats/basicFormat.go
@@ -0,0 +1,23 @@
+// Package formats provides output formats for the results of a
+// check.Checker.
+package formats
+
+import (
+	"fmt"
+	"strconv"
+
+	"github.com/square/prodeng/metrics/check"
+)
+
+// Basic prints every section reported by hc together with its message,
+// followed by one line per check giving the check name and its boolean
+// result. The extra arguments are unused; the returned error is always nil.
+func Basic(hc check.Checker, s ...string) error {
+	for metric, result := range hc.GetWarnings() {
+		fmt.Println(metric + ": " + result.Message)
+		for checkName, val := range result.Checks {
+			fmt.Println(" " + checkName + ": " + strconv.FormatBool(val))
+		}
+	}
+	return nil
+}
diff --git a/metrics/check/formats/nagiosFormat.go b/metrics/check/formats/nagiosFormat.go
new file mode 100644
index 0000000..a6dc604
--- /dev/null
+++ b/metrics/check/formats/nagiosFormat.go
@@ -0,0 +1,116 @@
+package formats
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"sort"
+	"strings"
+
+	"code.google.com/p/goconf/conf" // used for parsing config files
+	"github.com/square/prodeng/metrics/check"
+)
+
+// nagLevels maps a nagios level name to its state code.
+var (
+	nagLevels = map[string]int{"OK": 0, "WARN": 1, "CRIT": 2, "UNKNOWN": 3}
+)
+
+// nagSender holds the local hostname and the settings read from the
+// [nagios] section of the nagios config file.
+type nagSender struct {
+	server         string
+	serviceType    string
+	hostname       string
+	nscaBinaryPath string
+	nscaConfigPath string
+}
+
+// Nagios prints the check results of hc as nagios statements formatted as:
+// host service state_code message
+// A section is CRIT when a check whose name contains "crit" is true, else
+// WARN when a check whose name contains "warn" is true, else OK. One
+// statement is printed per non-empty level, most severe first, listing the
+// section names of that level in sorted order. The first element of
+// configFile is the path of the nagios config file and is required.
+func Nagios(hc check.Checker, configFile ...string) error {
+	if len(configFile) == 0 {
+		return errors.New("nagios format: no nagios config file given")
+	}
+	ns := getNagiosInfo(configFile[0])
+	sections := make(map[string][]string)
+	for sectionName, result := range hc.GetWarnings() {
+		crit, warn := false, false
+		for checkName, triggered := range result.Checks {
+			if !triggered {
+				continue
+			}
+			lower := strings.ToLower(checkName)
+			switch {
+			case strings.Contains(lower, "crit"):
+				crit = true
+			case strings.Contains(lower, "warn"):
+				warn = true
+			}
+		}
+		level := "OK"
+		switch {
+		case crit:
+			level = "CRIT"
+		case warn:
+			level = "WARN"
+		}
+		sections[level] = append(sections[level], sectionName)
+	}
+	for _, level := range []string{"CRIT", "WARN", "OK"} {
+		names := sections[level]
+		if len(names) == 0 {
+			continue
+		}
+		sort.Strings(names)
+		// As before, every statement is followed by an empty line.
+		fmt.Printf("%s\t%s\t%d\t%s\n\n", ns.hostname, ns.serviceType, nagLevels[level], strings.Join(names, ", "))
+	}
+	return nil
+}
+
+// SendNagiosPassive sends the messages to the nagios server. Each message is
+// written, followed by a newline, to the standard input of a separate run of
+// the NSCA binary configured in configFile. It returns the first error.
+// NOTE(review): the server is passed as a plain argument, as before; confirm
+// that the NSCA binary in use accepts it that way.
+func SendNagiosPassive(messages []string, configFile string) error {
+	ns := getNagiosInfo(configFile)
+	for _, message := range messages {
+		// "-c" and the path are separate arguments: no shell is involved, so
+		// a single "-c path" string would reach the binary as one argument.
+		sendCmd := exec.Command(ns.nscaBinaryPath, ns.server, "-c", ns.nscaConfigPath)
+		// The message is fed directly instead of through an external printf,
+		// which was handed the surrounding quote characters literally.
+		sendCmd.Stdin = strings.NewReader(message + "\n")
+		if err := sendCmd.Run(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// getNagiosInfo reads the nagios settings from configFile. It returns an
+// empty nagSender if the file cannot be read or has no [nagios] section.
+//TODO: can either grab this info from config file or give as input to send function
+func getNagiosInfo(configFile string) nagSender {
+	var ns nagSender
+	c, err := conf.ReadConfigFile(configFile)
+	// err is tested first: c must not be used when reading the file failed.
+	if err != nil || !c.HasSection("nagios") {
+		return ns
+	}
+	// Lookup errors are ignored: this is best effort with no error result.
+	ns.hostname, _ = os.Hostname()
+	ns.server, _ = c.GetString("nagios", "server")
+	ns.nscaBinaryPath, _ = c.GetString("nagios", "nsca-binary-path")
+	ns.nscaConfigPath, _ = c.GetString("nagios", "nsca-config-path")
+	ns.serviceType, _ = c.GetString("nagios", "service")
+	return ns
+}
diff --git a/metrics/metric_check/.localized b/metrics/metric_check/.localized
new file mode 100644
index 0000000..e69de29
diff --git a/metrics/metric_check/README.md b/metrics/metric_check/README.md
new file mode 100644
index 0000000..1c01860
--- /dev/null
+++ b/metrics/metric_check/README.md
@@ -0,0 +1,30 @@
+# metric checks
+
+## Usage
+
+### Command Line Utility
+
+The command line utility:
+```
+./bin/metric_check
+```
+
+can be run with the `-conf` option to specify the path to the config file described in the section below. The `-hostport` flag specifies the host and port to fetch metrics from. The `-nagConf` option specifies the path to the nagios configuration file. `-basic=true` and `-nagios=true` set the output format to basic and nagios, respectively. The `-step` option sets the number of seconds between two rounds of checks.
+
+### Config File
+
+The config file specifies the checks that will be done on the metric values. An example:
+```
+[metric1]
+check1 = metric1.Value < 16384
+check2 = metric1.Value == 16384
+
+[test rates]
+check rate = metric1.Rate > 700
+check rate 2 = metric2.Rate > 900
+
+[check user percentage]
+user 50 pct = cpustat.cpu.User.Value / cpustat.cpu.Total.Value * 100 > 50
+user 30 pct = cpustat.cpu.User.Value / cpustat.cpu.Total.Value * 100 > 30
+```
+Each section title serves to describe its set of metric checks. The sections `default`, `nagios` and `constants` are reserved for special information; every option of the `constants` section defines a named constant that can be used in check expressions. The fields in the other sections specify the checks. On the left hand side of the `=` is the name of the check; each name must be unique within its section. On the right hand side is the check that will be performed. Metrics are specified by their full name followed by `.Value` or `.Rate` and are replaced by the appropriate value; each such reference must be separated from the rest of the expression by spaces.
diff --git a/metrics/metric_check/check.go b/metrics/metric_check/check.go
new file mode 100644
index 0000000..96a7996
--- /dev/null
+++ b/metrics/metric_check/check.go
@@ -0,0 +1,73 @@
+//Copyright (c) 2014 Square, Inc
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/square/prodeng/metrics/check"
+	"github.com/square/prodeng/metrics/check/formats"
+)
+
+// config files used when the corresponding flag is not given
+var (
+	testconfigurationfile = "./test.config"
+	testnagiosconfigfile  = "./test_nagios.config"
+)
+
+// basic checker
+// parses the flags, then starts a loop that, once per step, checks the
+// metrics against the config file and prints the results
+func main() {
+	var hostport, configFile, nagConfigFile string
+	var basic, nagios bool
+	var stepSec int
+
+	flag.StringVar(&hostport, "hostport", "localhost:12345", "hostport to grab metrics")
+	flag.StringVar(&configFile, "conf", "", "config file to read metric thresholds")
+	flag.StringVar(&nagConfigFile, "nagConf", "", "config file to send nagios messages")
+	flag.IntVar(&stepSec, "step", 2, "time step in between sending messages to nagios")
+	flag.BoolVar(&basic, "basic", true, "output check results in basic format")
+	flag.BoolVar(&nagios, "nagios", false, "output check results in nagios format")
+	flag.Parse()
+	if configFile == "" {
+		configFile = testconfigurationfile
+	}
+	if nagConfigFile == "" {
+		nagConfigFile = testnagiosconfigfile
+	}
+	// time.NewTicker panics on a non-positive interval
+	if stepSec <= 0 {
+		fmt.Fprintln(os.Stderr, "step must be a positive number of seconds")
+		os.Exit(1)
+	}
+
+	fmt.Println("starting metrics checker on: ", hostport)
+
+	hc, err := check.New(hostport, configFile)
+	if err != nil {
+		fmt.Println(err)
+		os.Exit(1)
+	}
+	ticker := time.NewTicker(time.Duration(stepSec) * time.Second)
+	for range ticker.C {
+		if err := hc.CheckMetrics(); err != nil {
+			// The checker has logged the failure; do not print results
+			// computed from an earlier fetch.
+			continue
+		}
+		if basic {
+			if err := hc.OutputWarnings(formats.Basic); err != nil {
+				fmt.Fprintln(os.Stderr, err)
+			}
+		}
+		if nagios {
+			if err := hc.OutputWarnings(formats.Nagios, nagConfigFile); err != nil {
+				fmt.Fprintln(os.Stderr, err)
+			}
+		}
+	}
+}
diff --git a/metrics/metric_check/test.config b/metrics/metric_check/test.config
new file mode 100644
index 0000000..ffb28a7
--- /dev/null
+++ b/metrics/metric_check/test.config
@@ -0,0 +1,25 @@
+[free mem]
+check_expr1 = memstat.Free.Value < 2000000000
+check_expr2 = memstat.Free.Value >= 1500000000
+
+[active]
+crit_check_expr1 = memstat.Active.Value >= 6600000000
+warn_check_expr2 = memstat.Active.Value < 6500000000
+
+[arithmetic in checks]
+check1_expr = cpustat.cpu.User.Value < 14828637
+check2_expr = cpustat.cpu.User.Value == 14828637
+check3_expr = cpustat.cpu.User.Value < 100 * 200000
+
+[missing metric]
+# example of a comment line
+# these metrics won't be found
+check1 = metric.value < 16384
+check2 = metric.value == 16384
+
+[test Rates]
+check Rate expr = cpustat.cpu.System.Rate > 700
+check Rate expr2 = cpustat.cpu.Total.Rate > 900
+
+[test name replace]
+check one = cpustat.cpu.User.Value / cpustat.cpu.Total.Value * 100 > 50
diff --git a/metrics/metric_check/test_nagios.config b/metrics/metric_check/test_nagios.config
new file mode 100644
index 0000000..80a59e6
--- /dev/null
+++ b/metrics/metric_check/test_nagios.config
@@ -0,0 +1,5 @@
+[nagios]
+server = mytestserver
+nsca-binary-path = binary_path
+nsca-config-path = config-path
+service = mysql