[ansible-taler-exchange] branch master updated: use firefly for alerts
From: Admin
Subject: [ansible-taler-exchange] branch master updated: use firefly for alerts
Date: Mon, 02 Jun 2025 09:14:40 +0200
This is an automated email from the git hooks/post-receive script.
grothoff pushed a commit to branch master
in repository ansible-taler-exchange.
The following commit(s) were added to refs/heads/master by this push:
new 613cd9b use firefly for alerts
613cd9b is described below
commit 613cd9be4a8a28539f754a1f83a13f204e26cfac
Author: Christian Grothoff <christian@grothoff.org>
AuthorDate: Mon Jun 2 09:14:36 2025 +0200
use firefly for alerts
---
.../files/etc/prometheus/alert_rules.yml | 29 ++++++++++++++++++++++
.../monitoring/files/etc/prometheus/prometheus.yml | 8 +++---
roles/monitoring/tasks/main.yml | 8 ++++++
.../templates/etc/prometheus/alertmanager.yml | 12 ++++-----
4 files changed, 47 insertions(+), 10 deletions(-)
diff --git a/roles/monitoring/files/etc/prometheus/alert_rules.yml b/roles/monitoring/files/etc/prometheus/alert_rules.yml
new file mode 100644
index 0000000..914fef6
--- /dev/null
+++ b/roles/monitoring/files/etc/prometheus/alert_rules.yml
@@ -0,0 +1,29 @@
+groups:
+- name: node_exporter_alerts
+ rules:
+ - alert: HighCPULatency
+ expr: sum(rate(node_cpu_seconds_total{mode="system"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 80
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "High CPU Latency detected"
+ description: "CPU latency is above 80% for more than 1 minute."
+
+ - alert: LowDiskSpace
+ expr: (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 < 10
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Low Disk Space detected"
+ description: "Disk space is below 10% for more than 1 minute."
+
+ - alert: HighMemoryUsage
+ expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "High Memory Usage detected"
+ description: "Memory usage is above 80% for more than 1 minute."
diff --git a/roles/monitoring/files/etc/prometheus/prometheus.yml b/roles/monitoring/files/etc/prometheus/prometheus.yml
index bf121a3..10038d9 100644
--- a/roles/monitoring/files/etc/prometheus/prometheus.yml
+++ b/roles/monitoring/files/etc/prometheus/prometheus.yml
@@ -1,7 +1,7 @@
# my global config
global:
- scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
- evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
+ scrape_interval: 60s # Set the scrape interval to every 60 seconds. Default is every 1 minute.
+ evaluation_interval: 60s # Evaluate rules every 60 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration -- FIXME: not yet setup!
@@ -31,13 +31,13 @@ scrape_configs:
# Job, for local node exporter
- job_name: 'node_exporter_metrics'
- scrape_interval: 5s
+ scrape_interval: 60s
static_configs:
- targets: ['localhost:9100']
# Job, for local nginx exporter
- job_name: 'nginx_exporter_metrics'
- scrape_interval: 5s
+ scrape_interval: 60s
static_configs:
- targets: ['localhost:9113']
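The untouched context above still flags the Alertmanager wiring as a FIXME; for
completeness, a minimal sketch of the alerting stanza, assuming Alertmanager runs
on the same host on its default port 9093 (an assumption, not confirmed by this
commit):

# Sketch only, not part of this commit: send alerts to a local Alertmanager.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']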
diff --git a/roles/monitoring/tasks/main.yml b/roles/monitoring/tasks/main.yml
index 4aec691..c7a4df5 100644
--- a/roles/monitoring/tasks/main.yml
+++ b/roles/monitoring/tasks/main.yml
@@ -200,6 +200,14 @@
group: root
mode: "0644"
+- name: Configure node-exporter rules for alertmanager
+ copy:
+ src: etc/prometheus/alert_rules.yml
+ dest: /etc/prometheus/alert_rules.yml
+ owner: root
+ group: root
+ mode: "0644"
+
- name: Ensure exporter services are enabled and started
service:
name: "{{ item }}"
diff --git a/roles/monitoring/templates/etc/prometheus/alertmanager.yml b/roles/monitoring/templates/etc/prometheus/alertmanager.yml
index cb68bf2..d7474d3 100644
--- a/roles/monitoring/templates/etc/prometheus/alertmanager.yml
+++ b/roles/monitoring/templates/etc/prometheus/alertmanager.yml
@@ -3,14 +3,14 @@
global:
# The smarthost and SMTP sender used for mail notifications.
- smtp_smarthost: 'localhost:25'
+ smtp_smarthost: 'firefly.gnunet.org'
smtp_from: 'alertmanager@taler.net'
smtp_require_tls: false
#smtp_auth_username: 'alertmanager'
#smtp_auth_password: 'password'
# The directory from which notification templates are read.
-templates:
+templates:
- '/etc/prometheus/alertmanager_templates/*.tmpl'
# The root route on which each incoming alert enters.
@@ -23,7 +23,7 @@ route:
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This way ensures that you get multiple alerts for the same group that start
- # firing shortly after another are batched together on the first
+ # firing shortly after another are batched together on the first
# notification.
group_wait: 30s
@@ -33,12 +33,12 @@ route:
# If an alert has successfully been sent, wait 'repeat_interval' to
# resend them.
- repeat_interval: 12h
+ repeat_interval: 12h
# A default receiver
receiver: taler-warning-mails
- # All the above attributes are inherited by all child routes and can
+ # All the above attributes are inherited by all child routes and can
# overwritten on each.
# The child route trees.
@@ -50,7 +50,7 @@ route:
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
-# We use this to mute any warning-level notifications if the same alert is
+# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_match:
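One caveat on the smarthost change above: Alertmanager generally expects
smtp_smarthost in host:port form, so the port may need to be kept. A sketch of
the global block, assuming firefly accepts plain SMTP on port 25 (the port is an
assumption, not part of the commit):

global:
  # Sketch only: host:port form for the smarthost; port 25 is assumed.
  smtp_smarthost: 'firefly.gnunet.org:25'
  smtp_from: 'alertmanager@taler.net'
  smtp_require_tls: false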
--
To stop receiving notification emails like this one, please contact gnunet@gnunet.org.