Monitoring

Install Docker and Docker Compose

# Install prerequisites and download Docker's GPG key into the apt keyring directory.
sudo apt-get update
# -y keeps this step non-interactive, matching the final install command below.
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc

# Register the Docker apt repository for this machine's architecture and release codename.
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update

# Install Docker Engine together with the Buildx and Compose plugins.
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

Create monitoring folder

# -p makes the command succeed even if the folder already exists;
# the quotes keep the path intact if $HOME contains spaces.
mkdir -p "$HOME/monitoring"
cd "$HOME/monitoring"

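Create the following files in this folder: `docker-compose.yml`, `config.yml` and, optionally, per-chain files under `chains.d/`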

# Save as docker-compose.yml (one of the default file names `docker compose` looks for).
# The top-level `version` key is intentionally omitted: the Docker Compose v2 plugin
# treats it as obsolete and prints a warning when it is present.
services:

  v2:
    image: firstset/tenderduty:latest
    command: ""
    ports:
      - "8888:8888" # Dashboard
      - "28686:28686" # Prometheus exporter
    volumes:
      # Named volume, declared under the top-level `volumes:` key below
      - home:/var/lib/tenderduty
      # Bind mounts, resolved relative to the folder containing this file
      - ./config.yml:/var/lib/tenderduty/config.yml
      - ./chains.d:/var/lib/tenderduty/chains.d/
    logging:
      driver: "json-file"
      options:
        # Rotate container logs: at most 10 files of 20 MB each
        max-size: "20m"
        max-file: "10"
    restart: unless-stopped

volumes:
  home:

---
# Controls whether the dashboard is enabled.
enable_dashboard: true
# What TCP port the dashboard will listen on. Only the port is controllable for now.
listen_port: 8888
# hide_logs is useful if the dashboard will be posted publicly. It disables the log feed,
# and obscures most node-related details. Be aware this isn't fully vetted for preventing
# info leaks about node names, etc.
hide_logs: false
# How long to wait before alerting that a node is down.
node_down_alert_minutes: 3
# Node Down alert Pagerduty Severity
node_down_alert_severity: critical
# Whether to skip verification of TLS certificates. When set to `true`, Tenderduty will skip
# certificate verification and accept self-signed certs.
# NOTE: this flag should be false in a production environment
tls_skip_verify: false

# Should the prometheus exporter be enabled?
prometheus_enabled: true
# What port should it listen on? For now only port is configurable.
prometheus_listen_port: 28686

# CoinMarketCap API key for price conversions
coin_market_cap_api_token: xxxxxx
# When enabled, the cryptos will be converted into a fiat currency based on its latest price
convert_to_fiat:
  enabled: true
  currency: USD # or EUR, SEK, etc.
  cache_expiration: 8 # cache the pricing data for 8 hours

# Default alert configuration used for all chains unless overridden
default_alert_config:
  pagerduty:
    # Should we use PD? Be aware that if this is set to false it overrides individual chain alerting settings.
    enabled: false
    # This is an API key, not oauth token, more details to follow, but check the v1 docs for more info
    api_key: aaaaaaaaaaaabbbbbbbbbbbbbcccccccccccc
    # Not currently used, but will be soon. This allows setting escalation priorities etc.
    default_severity: alert
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel
    # In Tenderduty there are three severity levels: info, warning, and critical. `severity_threshold: critical` means that Tenderduty only sends critical alerts to this channel (Pagerduty)
    severity_threshold: critical

  discord:
    # Alert to discord?
    enabled: false
    # The webhook is set by right-clicking on a channel, editing the settings, and configuring a webhook in the integrations section.
    webhook: https://discord.com/api/webhooks/999999999999999999/zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel
    severity_threshold: info

  telegram:
    # Alert via telegram? Note: also supersedes chain-specific settings
    enabled: false
    # API key ... talk to @BotFather
    api_key: "5555555555:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    # The group ID for the chat where messages will be sent. Google how to find this, will include better info later.
    channel: "-666666666"
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel
    severity_threshold: info

  slack:
    # Send alerts to Slack?
    enabled: false
    # The webhook can be added in the Slack app directory.
    webhook: https://hooks.slack.com/services/AAAAAAAAAAAAAAAAAAAAAAA/bbbbbbbbbbbbbbbbbbbbbbbb
    # Severity threshold defines the minimum severity level at which the alerts are sent to this channel
    severity_threshold: info

  # Alert defaults shared by all chains
  # If the chain stops seeing new blocks, should an alert be sent?
  stalled_enabled: true
  # How long a halted chain takes in minutes to generate an alarm
  stalled_minutes: 10
  # Most basic alarm, you just missed x blocks ... would you like to know?
  consecutive_enabled: true
  # How many missed blocks should trigger a notification?
  consecutive_missed: 5
  # Consecutive Missed alert Pagerduty Severity
  consecutive_priority: critical

  # For each chain there is a specific window of blocks and a percentage of missed blocks that will result in
  # a downtime jail infraction. Should an alert be sent if a certain percentage of this window is exceeded?
  percentage_enabled: false
  # What percentage should trigger the alert
  percentage_missed: 10
  # Percentage Missed alert Pagerduty Severity
  percentage_priority: warning

  # Empty blocks notification configuration
  consecutive_empty_enabled: false
  # How many consecutive empty blocks should trigger a notification?
  consecutive_empty: 5
  # Consecutive Empty alert Pagerduty Severity
  consecutive_empty_priority: warning

  # For some Cosmos EVM chains, empty consensus blocks may decrease execution uptime
  # since they aren't included in EVM state. Should an alert be sent if empty blocks are detected?
  empty_percentage_enabled: false
  # What percentage should trigger the alert
  empty_percentage: 2
  # Percentage Empty alert Pagerduty Severity
  empty_percentage_priority: warning

  # Should an alert be sent if the validator is not in the active set, i.e. jailed,
  # tombstoned, unbonding?
  alert_if_inactive: true
  # Should an alert be sent if no RPC servers are responding? (Note this alarm is instantaneous with no delay)
  alert_if_no_servers: true
  # Should alerts be sent if there are open governance proposals?
  governance_alerts: true

  # Alert when a validator's stake change goes beyond the threshold
  stake_change_alerts: true
  stake_change_drop_threshold: 0.05 # meaning 5%
  stake_change_increase_threshold: 0.05 # meaning 5%

  # Alert when a validator has more than the threshold value of unclaimed rewards
  # The threshold is defined with a fiat currency unit like USD, so this feature requires properly configuring coin_market_cap_api_token and enabling convert_to_fiat
  unclaimed_rewards_alerts: true
  unclaimed_rewards_threshold_in_fiat_currency: 10000

# Healthcheck settings (dead man's switch)
healthcheck:
  # Send pings to determine if the monitor is running?
  enabled: false
  # URL to send pings to.
  ping_url: https://hc-ping.com/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee
  # Rate at which pings are sent, in seconds.
  ping_rate: 60

# If governance_alerts for a chain is enabled, the following defines how frequently a reminder should be sent, in hours
# Optional, the value is 6 (hours) when it is not set, but note that this cannot be configured per chain for now
governance_alerts_reminder_interval: 6

# The various chains to be monitored. Create a new entry for each chain. The name itself can be arbitrary, but a
# user-friendly name is recommended.
chains:
  # The user-friendly name that will be used for labels. Highly suggest wrapping in quotes.
  "Osmosis":
    # chain_id is validated for a match when connecting to an RPC endpoint, also used as a label in several places.
    chain_id: osmosis-1
    # Hooray, in v2 we derive the valcons from abci queries so you don't have to jump through hoops to figure out how
    # to convert ed25519 keys to the appropriate bech32 address.
    # Use valcons address if using ICS or tendermint/PubKeyBn254
    valoper_address: osmovaloper1xxxxxxx...
    # Should the monitor revert to using public API endpoints if all supplied RPC nodes fail?
    # This isn't always reliable, not all public nodes have websocket proxying setup correctly.
    public_fallback: true
    # the name/slug of this chain, used by CoinMarketCap API to convert the price
    slug: osmosis

    # Without specifying this option, the inflationRate is queried from a RPC call, but it may not be available for some chains
    # If the inflation rate cannot be queried, you can use this option to explicitly set the value
    inflationRate: 0.05

    # the following section follows the same structure defined in `default_alert_config` and is used for overriding specific values
    alerts:
      # an example for enabling empty blocks alert, which is disabled by default
      consecutive_empty_enabled: true
      consecutive_empty: 3
      consecutive_empty_priority: critical
      # an example for overriding the pagerduty alert channel setting for this chain
      pagerduty:
        enabled: false

    # This section covers our RPC providers. No LCD (aka REST) endpoints are used, only TM's RPC endpoints
    # Multiple hosts are encouraged, and will be tried sequentially until a working endpoint is discovered.
    nodes:
      # URL for the endpoint. Must include protocol://hostname:port
      - url: tcp://localhost:26657
        # Should we send an alert if this host isn't responding?
        alert_if_down: true
      # repeat hosts for monitoring redundancy
      - url: https://some-other-node:443
        alert_if_down: false

---
# Optional: use the chains.d folder to manage multiple chains
# Example chain files: chains.d/cosmos.yml, chains.d/jackal.yml, chains.d/injective.yml
# Each chain can have its own nodes, alerts, and settings
# NOTE(review): this example appears to be the content of a separate per-chain file
# (e.g. chains.d/osmosis.yml). Do not paste it into config.yml, which already defines a
# top-level `chains:` key; a second one would be a duplicate key. Confirm the expected
# per-chain file layout against the Tenderduty documentation.

chains:
  "Osmosis":
    chain_id: osmosis-1
    valoper_address: osmovaloper1xxxxxxx...
    public_fallback: true
    slug: osmosis
    inflationRate: 0.05

    alerts:
      consecutive_empty_enabled: true
      consecutive_empty: 3
      consecutive_empty_priority: critical
      pagerduty:
        enabled: false

    nodes:
      - url: tcp://localhost:26657
        alert_if_down: true
      - url: https://some-other-node:443
        alert_if_down: false

Run the monitoring container

# Run from the monitoring folder that holds the compose file; -d starts the container detached (in the background).
docker compose up -d

After it starts successfully, access the dashboard at:

http://<your-server-ip>:8888

Previous: Restake | Next: Explorer

Last updated 2 months ago