Monitoring
Install Docker and the Docker Compose plugin
# Install prerequisites and add Docker's official GPG key.
sudo apt-get update
# -y keeps the install non-interactive, matching the docker-ce install below.
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

# Register the Docker apt repository for this machine's architecture and release codename.
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

# Refresh the package index so the new repository is visible, then install Docker and the Compose plugin.
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
Create monitoring folder
# -p makes this safe to re-run; chains.d is created up front because the compose file bind-mounts it.
mkdir -p "$HOME/monitoring/chains.d"
cd "$HOME/monitoring"
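Create the following files in this folder: docker-compose.yml (the first block below), config.yml (the second block), and, optionally, one file per chain under chains.d/ (the last block)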
# docker-compose.yml: runs Tenderduty with its dashboard and Prometheus exporter.
# NOTE: `version` is obsolete in the Compose Specification; Docker Compose v2
# ignores it with a warning. It is kept here only for older tooling.
version: '3.2'

services:
  v2:
    image: firstset/tenderduty:latest
    command: ""
    ports:
      - "8888:8888"    # Dashboard
      - "28686:28686"  # Prometheus exporter
    volumes:
      # Named volume holding Tenderduty's home directory.
      - home:/var/lib/tenderduty
      # Main configuration file and optional per-chain files from this folder.
      - ./config.yml:/var/lib/tenderduty/config.yml
      - ./chains.d:/var/lib/tenderduty/chains.d/
    logging:
      driver: "json-file"
      options:
        # Rotate logs: at most 10 files of 20 MB each.
        max-size: "20m"
        max-file: "10"
    restart: unless-stopped

volumes:
  home:
---
# config.yml: main Tenderduty configuration.

# Controls whether the dashboard is enabled.
enable_dashboard: yes
# What TCP port the dashboard will listen on. Only the port is controllable for now.
listen_port: 8888
# hide_logs is useful if the dashboard will be posted publicly. It disables the log feed,
# and obscures most node-related details. Be aware this isn't fully vetted for preventing
# info leaks about node names, etc.
hide_logs: no

# How long to wait before alerting that a node is down.
node_down_alert_minutes: 3
# Node Down alert Pagerduty Severity
node_down_alert_severity: critical

# Whether to skip verification of TLS certificates. When set to `yes`, Tenderduty skips
# certificate verification and accepts self-signed certs.
# NOTE: this flag should be `no` in a production environment.
tls_skip_verify: no

# Should the prometheus exporter be enabled?
prometheus_enabled: yes
# What port should it listen on? For now only port is configurable.
prometheus_listen_port: 28686

# CoinMarketCap API key for price conversions
coin_market_cap_api_token: xxxxxx
# When enabled, the cryptos will be converted into a fiat currency based on its latest price
convert_to_fiat:
  enabled: true
  currency: USD  # or EUR, SEK, etc.
  cache_expiration: 8  # cache the pricing data for 8 hours

# Default alert configuration used for all chains unless overridden
default_alert_config:
  pagerduty:
    # Should we use PD? Be aware that if this is set to no it overrides individual chain alerting settings.
    enabled: no
    # This is an API key, not oauth token, more details to follow, but check the v1 docs for more info
    api_key: aaaaaaaaaaaabbbbbbbbbbbbbcccccccccccc
    # Not currently used, but will be soon. This allows setting escalation priorities etc.
    default_severity: alert
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel.
    # In Tenderduty there are three severity levels: info, warning, and critical.
    # `severity_threshold: critical` means that Tenderduty only sends critical alerts to this channel (Pagerduty).
    severity_threshold: critical

  discord:
    # Alert to discord?
    enabled: no
    # The webhook is set by right-clicking on a channel, editing the settings, and configuring a webhook in the integrations section.
    webhook: https://discord.com/api/webhooks/999999999999999999/zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel
    severity_threshold: info

  telegram:
    # Alert via telegram? Note: also supersedes chain-specific settings
    enabled: no
    # API key ... talk to @BotFather
    api_key: "5555555555:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    # The group ID for the chat where messages will be sent. Google how to find this, will include better info later.
    channel: "-666666666"
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel
    severity_threshold: info

  slack:
    # Send alerts to Slack?
    enabled: no
    # The webhook can be added in the Slack app directory.
    webhook: https://hooks.slack.com/services/AAAAAAAAAAAAAAAAAAAAAAA/bbbbbbbbbbbbbbbbbbbbbbbb
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel
    severity_threshold: info

  # Alert defaults shared by all chains

  # If the chain stops seeing new blocks, should an alert be sent?
  stalled_enabled: yes
  # How long a halted chain takes in minutes to generate an alarm
  stalled_minutes: 10

  # Most basic alarm, you just missed x blocks ... would you like to know?
  consecutive_enabled: yes
  # How many missed blocks should trigger a notification?
  consecutive_missed: 5
  # Consecutive Missed alert Pagerduty Severity
  consecutive_priority: critical

  # For each chain there is a specific window of blocks and a percentage of missed blocks that will result in
  # a downtime jail infraction. Should an alert be sent if a certain percentage of this window is exceeded?
  percentage_enabled: no
  # What percentage should trigger the alert
  percentage_missed: 10
  # Percentage Missed alert Pagerduty Severity
  percentage_priority: warning

  # Empty blocks notification configuration
  consecutive_empty_enabled: no
  # How many consecutive empty blocks should trigger a notification?
  consecutive_empty: 5
  # Consecutive Empty alert Pagerduty Severity
  consecutive_empty_priority: warning

  # For some Cosmos EVM chains, empty consensus blocks may decrease execution uptime
  # since they aren't included in EVM state. Should an alert be sent if empty blocks are detected?
  empty_percentage_enabled: no
  # What percentage should trigger the alert
  empty_percentage: 2
  # Percentage Empty alert Pagerduty Severity
  empty_percentage_priority: warning

  # Should an alert be sent if the validator is not in the active set, i.e. jailed,
  # tombstoned, unbonding?
  alert_if_inactive: yes
  # Should an alert be sent if no RPC servers are responding? (Note this alarm is instantaneous with no delay)
  alert_if_no_servers: yes

  # Should alerts be sent if there are open governance proposals?
  governance_alerts: yes

  # Alert when a validator's stake change goes beyond the threshold
  stake_change_alerts: yes
  stake_change_drop_threshold: 0.05  # meaning 5%
  stake_change_increase_threshold: 0.05  # meaning 5%

  # Alert when a validator has more than the threshold value of unclaimed rewards.
  # The threshold is defined with a fiat currency unit like USD, so this feature requires
  # properly configuring coin_market_cap_api_token and enabling convert_to_fiat.
  unclaimed_rewards_alerts: yes
  unclaimed_rewards_threshold_in_fiat_currency: 10000

# Healthcheck settings (dead man's switch)
healthcheck:
  # Send pings to determine if the monitor is running?
  enabled: no
  # URL to send pings to.
  ping_url: https://hc-ping.com/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee
  # Rate in which pings are sent in seconds.
  ping_rate: 60

# If governance_alerts for a chain is enabled, the following defines how frequently a reminder should be sent, in hours
# Optional, the value is 6 (hours) when it is not set, but note that this cannot be configured per chain for now
governance_alerts_reminder_interval: 6

# The various chains to be monitored. Create a new entry for each chain. The name itself can be arbitrary, but a
# user-friendly name is recommended.
chains:
  # The user-friendly name that will be used for labels. Highly suggest wrapping in quotes.
  "Osmosis":
    # chain_id is validated for a match when connecting to an RPC endpoint, also used as a label in several places.
    chain_id: osmosis-1
    # Hooray, in v2 we derive the valcons from abci queries so you don't have to jump through hoops to figure out how
    # to convert ed25519 keys to the appropriate bech32 address.
    # Use valcons address if using ICS or tendermint/PubKeyBn254
    valoper_address: osmovaloper1xxxxxxx...
    # Should the monitor revert to using public API endpoints if all supplied RPC nodes fail?
    # This isn't always reliable, not all public nodes have websocket proxying setup correctly.
    public_fallback: yes
    # the name/slug of this chain, used by CoinMarketCap API to convert the price
    slug: osmosis
    # Without specifying this option, the inflationRate is queried from a RPC call, but it may not be available for some chains
    # If the inflation rate cannot be queried, you can use this option to explicitly set the value
    inflationRate: 0.05

    # the following section follows the same structure defined in `default_alert_config` and is used for overriding specific values
    alerts:
      # an example for enabling empty blocks alert, which is disabled by default
      consecutive_empty_enabled: yes
      consecutive_empty: 3
      consecutive_empty_priority: critical
      # an example for disabling the pagerduty alert channel, which is enabled by default
      pagerduty:
        enabled: no

    # This section covers our RPC providers. No LCD (aka REST) endpoints are used, only TM's RPC endpoints
    # Multiple hosts are encouraged, and will be tried sequentially until a working endpoint is discovered.
    nodes:
      # URL for the endpoint. Must include protocol://hostname:port
      - url: tcp://localhost:26657
        # Should we send an alert if this host isn't responding?
        alert_if_down: yes
      # repeat hosts for monitoring redundancy
      - url: https://some-other-node:443
        alert_if_down: no
# Optional: use the chains.d/ folder to manage multiple chains, one file per chain.
# Example chain files: chains.d/cosmos.yml, chains.d/jackal.yml, chains.d/injective.yml
# Each chain can have its own nodes, alerts, and settings
chains:
  "Osmosis":
    chain_id: osmosis-1
    valoper_address: osmovaloper1xxxxxxx...
    public_fallback: yes
    slug: osmosis
    inflationRate: 0.05
    # Per-chain overrides of the default alert settings.
    alerts:
      consecutive_empty_enabled: yes
      consecutive_empty: 3
      consecutive_empty_priority: critical
      pagerduty:
        enabled: no
    # RPC endpoints for this chain.
    nodes:
      - url: tcp://localhost:26657
        alert_if_down: yes
      - url: https://some-other-node:443
        alert_if_down: no
Run the monitoring container
# Run from the monitoring folder; -d starts the services in the background (detached).
docker compose up -d
After it starts successfully, access the dashboard at:
http://<your-server-ip>:8888
Last updated