codeyourinfra
diff --git a/‎.gitignore‎
Lines changed: 6 additions & 0 deletions b/‎.gitignore‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎.travis.yml‎
Lines changed: 14 additions & 0 deletions b/‎.travis.yml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎.yamllint‎
Lines changed: 15 additions & 0 deletions b/‎.yamllint‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎LICENSE‎
Lines changed: 21 additions & 0 deletions b/‎LICENSE‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 29 additions & 0 deletions b/‎README.md‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎Vagrantfile‎
Lines changed: 27 additions & 0 deletions b/‎Vagrantfile‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎ansible.cfg‎
Lines changed: 8 additions & 0 deletions b/‎ansible.cfg‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎files/monitor-datasource.json‎
Lines changed: 8 additions & 0 deletions b/‎files/monitor-datasource.json‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎files/slack-notification-channel.json‎
Lines changed: 8 additions & 0 deletions b/‎files/slack-notification-channel.json‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎files/used_mem_pct-dashboard.json‎
Lines changed: 229 additions & 0 deletions b/‎files/used_mem_pct-dashboard.json‎
Lines changed: 229 additions & 0 deletions
@@ -0,0 +1,6 @@
+.vagrant/
+.vscode/
+*.retry
+ubuntu-*-cloudimg-console.log
+env
+__pycache__
@@ -0,0 +1,14 @@
+---
+language: python
+python: "3.6"  # quoted so the version stays a string
+
+addons:
+  apt:
+    packages:
+      - python-pip
+
+install:
+  - pip install -r requirements.txt  # Python dependencies listed in requirements.txt
+
+script:
+  - molecule test -s aws  # run the Molecule scenario named "aws"
@@ -0,0 +1,15 @@
+---
+extends: default
+ignore: |
+  **/lib/
+rules:
+  braces:
+    max-spaces-inside: 1
+    level: error
+  brackets:
+    max-spaces-inside: 1
+    level: error
+  line-length: disable
+  # NOTE(retr0h): Templates no longer fail this lint rule.
+  #               Uncomment if running old Molecule templates.
+  # truthy: disable
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Esign Consulting Ltda.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,29 @@
+# Get metrics for alerting in advance and preventing trouble
+
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![GitHub release](https://img.shields.io/github/release/codeyourinfra/get_metrics_for_alerting.svg)](https://github.com/codeyourinfra/get_metrics_for_alerting/releases/latest) [![Build status](https://travis-ci.org/codeyourinfra/get_metrics_for_alerting.svg?branch=master)](https://travis-ci.org/codeyourinfra/get_metrics_for_alerting)
+
+This solution is explained in detail in the Codeyourinfra project blog post [How to get metrics for alerting in advance and preventing trouble](http://codeyourinfra.today/how-to-get-metrics-for-alerting-in-advance-and-preventing-trouble). Check it out!
+
+## Problem
+
+You may already have a monitoring solution. After all, you are responsible for keeping all the IT services available. You don't want to be surprised by an unexpected outage, so you install an agent on every server to collect relevant data for monitoring purposes. In addition, you've configured automatic emails to be sent if something goes wrong. The problem is that you can't handle it anymore, because you now have more than a thousand servers to monitor. Furthermore, people no longer pay attention to the alerts received by email, due to the large number of false positives.
+
+## Solution
+
+The solution is based on [InfluxDB](https://docs.influxdata.com/influxdb), a high performance time series database, on [Grafana](https://grafana.com/), a time series analytics and monitoring tool, and on [Ansible](https://www.ansible.com/), an agentless automation tool. They are all open source tools and can be easily integrated with each other in order to create a monitoring service. With Ansible, it is possible to extract the servers' hardware metrics and store them in InfluxDB ([playbook-get-metrics.yml](templates/playbook-get-metrics.yml)). With Grafana, it is possible to connect to InfluxDB and show the metrics graphically, define thresholds and configure alerts that can be delivered through different channels, including instant messaging apps like [Slack](https://slack.com) and [Telegram](https://telegram.org).
+
+![Solution picture](get_metrics_for_alerting.png)
+
+## Test
+
+First of all, run the command `vagrant up monitor`, in order to turn on the **monitoring server**. Then, open your web browser and access the Grafana web application through the **URL** <http://192.168.33.10:3000>. The **user** and the **password** are *admin*. After that, click on the **used_mem_pct** dashboard. You will see the **Used memory percentage** line chart, with data from the **monitoring server** itself. An alert is sent to a [Slack workspace](https://mygrafanaalerts.slack.com) (click [here](https://join.slack.com/t/mygrafanaalerts/shared_invite/enQtNjg2NTQ0MDM0MDgxLTA3NzhkNjliNjY5YWUwNTY1OWI3MjkwOGIwZjM2NDQzNzlhMDc3YjQzMjg0Mjc4MjYzYjYyNjc2MjQ5ZDA3OGU) to join) if the last 5 used memory percentage values are greater than or equal to 95%, the defined threshold.
+
+You can add the other servers to the monitoring service, if you want. In order to add the **server1**, firstly boot it up, through the command `vagrant up server1`. After that, execute the command `ansible-playbook playbook-add-server.yml -e "host=192.168.33.20 user=vagrant password=vagrant"`. The parameters **host**, **user** and **password** are used by Ansible to access the monitored hosts, through SSH, from the monitoring server. Once added, wait at least 1 minute and check if Ansible is properly getting the metrics from the new monitored server by executing the ad-hoc command `ansible monitor -m shell -a "cat /etc/ansible/playbooks/playbook-get-metrics.log"`. Repeat these steps for the **server2**, at your will.
+
+### Automated tests
+
+You can also test the solution automatically, by executing `./test.sh` or using [Molecule](https://molecule.readthedocs.io). With the latter, you can perform the test not only locally (the default), but in [AWS](https://aws.amazon.com) as well. During Codeyourinfra's *continuous integration* process in Travis CI, the solution is tested on [Amazon EC2](https://aws.amazon.com/ec2).
+
+In order to get your environment ready for using *Molecule*, prepare your [Python virtual environment](https://docs.python.org/3/tutorial/venv.html), executing `python3 -m venv env && source env/bin/activate && pip install -r ../requirements.txt`. After that, just run the command `molecule test`, to test the solution locally in a [VirtualBox](https://www.virtualbox.org) VM managed by [Vagrant](https://www.vagrantup.com).
+
+If you prefer performing the test in AWS, bear in mind you must have your credentials appropriately in **~/.aws/credentials**. You can [configure it through the AWS CLI tool](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). The test is performed in the AWS region *Europe - London (eu-west-2)*. Just run `molecule test -s aws` and check the running instances through your [AWS Console](https://eu-west-2.console.aws.amazon.com/ec2/v2).
@@ -0,0 +1,27 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+Vagrant.configure("2") do |config|  # "2" selects the Vagrant configuration version
+  config.vm.define "monitor" do |monitor|  # VM named "monitor", started with `vagrant up monitor`
+    monitor.vm.box = "codeyourinfra/monitor"
+    monitor.vm.network "private_network", ip: "192.168.33.10"  # fixed private IP of the monitor VM
+
+    monitor.vm.provision "ansible" do |ansible|  # provision with Ansible
+      ansible.playbook = "monitoring-configuration.yml"
+      ansible.inventory_path = "inventory.yml"
+    end
+  end
+
+  (1..2).each do |i|  # defines two VMs: server1 and server2
+    config.vm.define "server#{i}" do |server|
+      server.vm.box = "ubuntu/bionic64"
+      server.vm.network "private_network", ip: "192.168.33.#{i+1}0"  # server1 -> 192.168.33.20, server2 -> 192.168.33.30
+
+      server.vm.provision "ansible" do |ansible|
+        ansible.limit = "server#{i}"  # limit the playbook run to the host/group named after this VM
+        ansible.playbook = "servers-configuration.yml"
+        ansible.inventory_path = "inventory.yml"
+      end
+    end
+  end
+end
@@ -0,0 +1,8 @@
+[defaults]
+callback_whitelist = profile_tasks
+host_key_checking = False
+inventory = inventory.yml
+localhost_warning = False
+
+[inventory]
+enable_plugins = yaml
@@ -0,0 +1,8 @@
+{
+  "name": "monitor",
+  "isDefault": true,
+  "type": "influxdb",
+  "url": "http://localhost:8086",
+  "access": "proxy",
+  "database": "monitor"
+}
@@ -0,0 +1,8 @@
+{
+  "name": "Slack notification channel",
+  "type":  "slack",
+  "isDefault": false,
+  "settings": {
+    "url": "https://hooks.slack.com/services/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"
+  }
+}
@@ -0,0 +1,229 @@
+{
+  "dashboard": {
+    "annotations": {
+      "list": [
+        {
+          "builtIn": 1,
+          "datasource": "-- Grafana --",
+          "enable": true,
+          "hide": true,
+          "iconColor": "rgba(0, 211, 255, 1)",
+          "name": "Annotations & Alerts",
+          "type": "dashboard"
+        }
+      ]
+    },
+    "editable": true,
+    "gnetId": null,
+    "graphTooltip": 0,
+    "hideControls": false,
+    "id": null,
+    "links": [],
+    "refresh": "1m",
+    "rows": [
+      {
+        "collapse": false,
+        "height": "250px",
+        "panels": [
+          {
+            "alert": {
+              "conditions": [
+                {
+                  "evaluator": {
+                    "params": [
+                      95
+                    ],
+                    "type": "gt"
+                  },
+                  "operator": {
+                    "type": "and"
+                  },
+                  "query": {
+                    "params": [
+                      "A",
+                      "5m",
+                      "now"
+                    ]
+                  },
+                  "reducer": {
+                    "params": [],
+                    "type": "last"
+                  },
+                  "type": "query"
+                }
+              ],
+              "executionErrorState": "alerting",
+              "frequency": "60s",
+              "handler": 1,
+              "message": "The last 5 used memory percentage values are greater than or equal to the threshold of 95%.",
+              "name": "Used memory percentage alert",
+              "noDataState": "no_data",
+              "notifications": [
+                {
+                  "id": 1
+                }
+              ]
+            },
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": null,
+            "fill": 1,
+            "id": 1,
+            "legend": {
+              "avg": false,
+              "current": false,
+              "max": false,
+              "min": false,
+              "show": true,
+              "total": false,
+              "values": false
+            },
+            "lines": true,
+            "linewidth": 1,
+            "links": [],
+            "nullPointMode": "null",
+            "percentage": false,
+            "pointradius": 5,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [],
+            "spaceLength": 10,
+            "span": 12,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+              {
+                "dsType": "influxdb",
+                "groupBy": [
+                  {
+                    "params": [
+                      "host"
+                    ],
+                    "type": "tag"
+                  }
+                ],
+                "measurement": "used_mem_pct",
+                "orderByTime": "ASC",
+                "policy": "default",
+                "refId": "A",
+                "resultFormat": "time_series",
+                "select": [
+                  [
+                    {
+                      "params": [
+                        "value"
+                      ],
+                      "type": "field"
+                    }
+                  ]
+                ],
+                "tags": []
+              }
+            ],
+            "thresholds": [
+              {
+                "colorMode": "critical",
+                "fill": true,
+                "line": true,
+                "op": "gt",
+                "value": 95
+              }
+            ],
+            "timeFrom": null,
+            "timeShift": null,
+            "title": "Used memory percentage",
+            "tooltip": {
+              "shared": true,
+              "sort": 0,
+              "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+              "buckets": null,
+              "mode": "time",
+              "name": null,
+              "show": true,
+              "values": []
+            },
+            "yaxes": [
+              {
+                "decimals": null,
+                "format": "percent",
+                "label": null,
+                "logBase": 1,
+                "max": 100,
+                "min": null,
+                "show": true
+              },
+              {
+                "format": "short",
+                "label": "",
+                "logBase": 1,
+                "max": null,
+                "min": null,
+                "show": true
+              }
+            ]
+          }
+        ],
+        "repeat": null,
+        "repeatIteration": null,
+        "repeatRowId": null,
+        "showTitle": false,
+        "title": "Dashboard Row",
+        "titleSize": "h6"
+      },
+      {
+        "collapse": false,
+        "height": 250,
+        "panels": [],
+        "repeat": null,
+        "repeatIteration": null,
+        "repeatRowId": null,
+        "showTitle": false,
+        "title": "Dashboard Row",
+        "titleSize": "h6"
+      }
+    ],
+    "schemaVersion": 14,
+    "style": "dark",
+    "tags": [],
+    "templating": {
+      "list": []
+    },
+    "time": {
+      "from": "now-30m",
+      "to": "now"
+    },
+    "timepicker": {
+      "refresh_intervals": [
+        "5s",
+        "10s",
+        "30s",
+        "1m",
+        "5m",
+        "15m",
+        "30m",
+        "1h",
+        "2h",
+        "1d"
+      ],
+      "time_options": [
+        "5m",
+        "15m",
+        "1h",
+        "6h",
+        "12h",
+        "24h",
+        "2d",
+        "7d",
+        "30d"
+      ]
+    },
+    "timezone": "",
+    "title": "used_mem_pct",
+    "version": 1
+  }
+}